From patchwork Fri Mar 15 10:44:23 2019
X-Patchwork-Submitter: Jan Beulich
X-Patchwork-Id: 10854473
Message-Id: <5C8B8207020000780021F173@prv1-mh.provo.novell.com>
Date: Fri, 15 Mar 2019 04:44:23 -0600
From: "Jan Beulich"
To: "xen-devel"
Cc: George Dunlap, Andrew Cooper, Wei Liu, Roger Pau Monne
References: <5B6BF83602000078001DC548@prv1-mh.provo.novell.com>
 <5C8B7EC0020000780021F10B@prv1-mh.provo.novell.com>
In-Reply-To: <5C8B7EC0020000780021F10B@prv1-mh.provo.novell.com>
Subject: [Xen-devel] [PATCH v8 14/50] x86emul: basic AVX512DQ testing

Test a variety of the insns which have been implemented already.

Signed-off-by: Jan Beulich
Acked-by: Andrew Cooper
---
v6: Re-base.
v5: Re-base over changes earlier in the series.
v4: Wrap OVR(pmullq) in __AVX512VL__ conditional.
v3: New.
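[Editor's illustration, not part of the patch: the simd_test() additions in
the diff below pair the new extract, broadcast, and insert helpers against
one another.  A fragment is pulled from the low end of the source vector,
broadcast across a full-width vector, and then re-inserted into the
remaining positions of the source; both routes must agree element for
element.  The sketch below models that check in plain GNU C vector
extensions so it runs without AVX512 hardware; the names low_half(),
broadcast_half(), and insert_half() mirror the harness macros but these
definitions are simplified stand-ins, not the harness implementations.]

    #include <stdbool.h>
    #include <stdio.h>

    typedef unsigned int v8si __attribute__((vector_size(32)));
    typedef unsigned int v4si __attribute__((vector_size(16)));

    /* Low half of an 8-element vector. */
    static v4si low_half(v8si x)
    {
        return (v4si){ x[0], x[1], x[2], x[3] };
    }

    /* Replicate a 4-element fragment across all 8 positions. */
    static v8si broadcast_half(v4si h)
    {
        return (v8si){ h[0], h[1], h[2], h[3], h[0], h[1], h[2], h[3] };
    }

    /* Insert a 4-element fragment at position pos (0 or 1). */
    static v8si insert_half(v8si x, v4si h, unsigned int pos)
    {
        v8si y = x;
        unsigned int i;

        for ( i = 0; i < 4; ++i )
            y[i + pos * 4] = h[i];

        return y;
    }

    int main(void)
    {
        v8si src = { 1, 2, 3, 4, 5, 6, 7, 8 };
        v4si aux = low_half(src);
        /* Broadcasting the low half must equal inserting it into the
         * upper half of the source, since the low half already matches. */
        v8si x = broadcast_half(aux);
        v8si y = insert_half(src, aux, 1);
        unsigned int i;
        bool ok = true;

        for ( i = 0; i < 8; ++i )
            if ( x[i] != y[i] )
                ok = false;

        printf("%s\n", ok ? "match" : "MISMATCH");
        return !ok;
    }

[In the harness the touch() invocations between steps keep the compiler from
constant-folding the intermediate values, so each broadcast/insert really
goes through the instruction being emulated.]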
--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -16,7 +16,7 @@ vpath %.c $(XEN_ROOT)/xen/lib/x86
 
 CFLAGS += $(CFLAGS_xeninclude)
 
-SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw
+SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw avx512dq
 FMA := fma4 fma
 SG := avx2-sg
 TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
@@ -69,9 +69,12 @@ avx512f-flts := 4 8
 avx512bw-vecs := $(avx512f-vecs)
 avx512bw-ints := 1 2
 avx512bw-flts :=
+avx512dq-vecs := $(avx512f-vecs)
+avx512dq-ints := $(avx512f-ints)
+avx512dq-flts := $(avx512f-flts)
 
 avx512f-opmask-vecs := 2
-avx512dq-opmask-vecs := 1
+avx512dq-opmask-vecs := 1 2
 avx512bw-opmask-vecs := 4 8
 
 # Suppress building by default of the harness if the compiler can't deal
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -121,6 +121,34 @@ typedef int __attribute__((vector_size(E
 typedef long long __attribute__((vector_size(EIGHTH_SIZE))) vdi_eighth_t;
 # endif
 
+# define DECL_PAIR(w) \
+typedef w ## _t pair_t; \
+typedef vsi_ ## w ## _t vsi_pair_t; \
+typedef vdi_ ## w ## _t vdi_pair_t
+# define DECL_QUARTET(w) \
+typedef w ## _t quartet_t; \
+typedef vsi_ ## w ## _t vsi_quartet_t; \
+typedef vdi_ ## w ## _t vdi_quartet_t
+# define DECL_OCTET(w) \
+typedef w ## _t octet_t; \
+typedef vsi_ ## w ## _t vsi_octet_t; \
+typedef vdi_ ## w ## _t vdi_octet_t
+
+# if ELEM_COUNT == 4
+DECL_PAIR(half);
+# elif ELEM_COUNT == 8
+DECL_PAIR(quarter);
+DECL_QUARTET(half);
+# elif ELEM_COUNT == 16
+DECL_PAIR(eighth);
+DECL_QUARTET(quarter);
+DECL_OCTET(half);
+# endif
+
+# undef DECL_OCTET
+# undef DECL_QUARTET
+# undef DECL_PAIR
+
 #endif
 
 #if VEC_SIZE == 16
@@ -146,6 +174,14 @@ typedef long long __attribute__((vector_
 
 #ifdef __AVX512F__
 /* Sadly there are a few exceptions to the general naming rules. */
+# define __builtin_ia32_broadcastf32x4_512_mask __builtin_ia32_broadcastf32x4_512
+# define __builtin_ia32_broadcasti32x4_512_mask __builtin_ia32_broadcasti32x4_512
+# define __builtin_ia32_insertf32x4_512_mask __builtin_ia32_insertf32x4_mask
+# define __builtin_ia32_insertf32x8_512_mask __builtin_ia32_insertf32x8_mask
+# define __builtin_ia32_insertf64x4_512_mask __builtin_ia32_insertf64x4_mask
+# define __builtin_ia32_inserti32x4_512_mask __builtin_ia32_inserti32x4_mask
+# define __builtin_ia32_inserti32x8_512_mask __builtin_ia32_inserti32x8_mask
+# define __builtin_ia32_inserti64x4_512_mask __builtin_ia32_inserti64x4_mask
 # define __builtin_ia32_shuf_f32x4_512_mask __builtin_ia32_shuf_f32x4_mask
 # define __builtin_ia32_shuf_f64x2_512_mask __builtin_ia32_shuf_f64x2_mask
 # define __builtin_ia32_shuf_i32x4_512_mask __builtin_ia32_shuf_i32x4_mask
@@ -331,6 +367,20 @@ OVR(punpcklwd);
 # endif
 # endif
 
+# ifdef __AVX512DQ__
+OVR_VFP(and);
+OVR_VFP(andn);
+OVR_VFP(or);
+OVR(pextrd);
+OVR(pextrq);
+OVR(pinsrd);
+OVR(pinsrq);
+# ifdef __AVX512VL__
+OVR(pmullq);
+# endif
+OVR_VFP(xor);
+# endif
+
 # undef OVR_VFP
 # undef OVR_SFP
 # undef OVR_INT
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -139,6 +139,27 @@ static inline bool _to_bool(byte_vec_t b
 # endif
 #elif defined(FLOAT_SIZE) && defined(__AVX512F__) && \
       (VEC_SIZE == 64 || defined(__AVX512VL__))
+# if ELEM_COUNT == 8 /* vextractf{32,64}x4 */ || \
+    (ELEM_COUNT == 16 && ELEM_SIZE == 4 && defined(__AVX512DQ__)) /* vextractf32x8 */ || \
+    (ELEM_COUNT == 4 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextractf64x2 */
+# define low_half(x) ({ \
+    half_t t_; \
+    asm ( "vextractf%c[w]x%c[n] $0, %[s], %[d]" \
+          : [d] "=m" (t_) \
+          : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 2) ); \
+    t_; \
+})
+# endif
+# if (ELEM_COUNT == 16 && ELEM_SIZE == 4) /* vextractf32x4 */ || \
+    (ELEM_COUNT == 8 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextractf64x2 */
+# define low_quarter(x) ({ \
+    quarter_t t_; \
+    asm ( "vextractf%c[w]x%c[n] $0, %[s], %[d]" \
+          : [d] "=m" (t_) \
+          : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 4) ); \
+    t_; \
+})
+# endif
 # if FLOAT_SIZE == 4
 # define broadcast(x) ({ \
     vec_t t_; \
@@ -146,6 +167,17 @@ static inline bool _to_bool(byte_vec_t b
           : "=v" (t_) : "m" (*(float[1]){ x }) ); \
     t_; \
 })
+# if VEC_SIZE >= 32 && defined(__AVX512DQ__)
+# define broadcast_pair(x) ({ \
+    vec_t t_; \
+    asm ( "vbroadcastf32x2 %1, %0" : "=v" (t_) : "m" (x) ); \
+    t_; \
+})
+# endif
+# if VEC_SIZE == 64 && defined(__AVX512DQ__)
+# define broadcast_octet(x) B(broadcastf32x8_, _mask, x, undef(), ~0)
+# define insert_octet(x, y, p) B(insertf32x8_, _mask, x, y, p, undef(), ~0)
+# endif
 # define max(x, y) BR_(maxps, _mask, x, y, undef(), ~0)
 # define min(x, y) BR_(minps, _mask, x, y, undef(), ~0)
 # define mix(x, y) B(movaps, _mask, x, y, (0b0101010101010101 & ALL_TRUE))
@@ -155,6 +187,13 @@ static inline bool _to_bool(byte_vec_t b
 # define interleave_lo(x, y) B(unpcklps, _mask, x, y, undef(), ~0)
 # define swap(x) B(shufps, _mask, x, x, 0b00011011, undef(), ~0)
 # else
+# define broadcast_quartet(x) B(broadcastf32x4_, _mask, x, undef(), ~0)
+# define insert_pair(x, y, p) \
+    B(insertf32x4_, _mask, x, \
+      /* Cast needed below to work around gcc 7.x quirk. */ \
+      (p) & 1 ? (typeof(y))__builtin_ia32_shufps(y, y, 0b01000100) : (y), \
+      (p) >> 1, x, 3 << ((p) * 2))
+# define insert_quartet(x, y, p) B(insertf32x4_, _mask, x, y, p, undef(), ~0)
 # define interleave_hi(x, y) B(vpermi2varps, _mask, x, interleave_hi, y, ~0)
 # define interleave_lo(x, y) B(vpermt2varps, _mask, interleave_lo, x, y, ~0)
 # define swap(x) ({ \
@@ -178,6 +217,14 @@ static inline bool _to_bool(byte_vec_t b
     t_; \
 })
 # endif
+# if VEC_SIZE >= 32 && defined(__AVX512DQ__)
+# define broadcast_pair(x) B(broadcastf64x2_, _mask, x, undef(), ~0)
+# define insert_pair(x, y, p) B(insertf64x2_, _mask, x, y, p, undef(), ~0)
+# endif
+# if VEC_SIZE == 64
+# define broadcast_quartet(x) B(broadcastf64x4_, , x, undef(), ~0)
+# define insert_quartet(x, y, p) B(insertf64x4_, _mask, x, y, p, undef(), ~0)
+# endif
 # define max(x, y) BR_(maxpd, _mask, x, y, undef(), ~0)
 # define min(x, y) BR_(minpd, _mask, x, y, undef(), ~0)
 # define mix(x, y) B(movapd, _mask, x, y, 0b01010101)
@@ -306,6 +353,16 @@ static inline bool _to_bool(byte_vec_t b
     t_; \
 })
 # endif
+# if (ELEM_COUNT == 16 && ELEM_SIZE == 4) /* vextracti32x4 */ || \
+    (ELEM_COUNT == 8 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextracti64x2 */
+# define low_quarter(x) ({ \
+    quarter_t t_; \
+    asm ( "vextracti%c[w]x%c[n] $0, %[s], %[d]" \
+          : [d] "=m" (t_) \
+          : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 4) ); \
+    t_; \
+})
+# endif
 # if INT_SIZE == 4 || UINT_SIZE == 4
 # define broadcast(x) ({ \
     vec_t t_; \
@@ -318,11 +375,30 @@ static inline bool _to_bool(byte_vec_t b
     asm ( "vpbroadcastd %k1, %0" : "=v" (t_) : "r" (x) ); \
     t_; \
 })
+# ifdef __AVX512DQ__
+# define broadcast_pair(x) ({ \
+    vec_t t_; \
+    asm ( "vbroadcasti32x2 %1, %0" : "=v" (t_) : "m" (x) ); \
+    t_; \
+})
+# endif
+# if VEC_SIZE == 64 && defined(__AVX512DQ__)
+# define broadcast_octet(x) ((vec_t)B(broadcasti32x8_, _mask, (vsi_octet_t)(x), (vsi_t)undef(), ~0))
+# define insert_octet(x, y, p) ((vec_t)B(inserti32x8_, _mask, (vsi_t)(x), (vsi_octet_t)(y), p, (vsi_t)undef(), ~0))
+# endif
 # if VEC_SIZE == 16
 # define interleave_hi(x, y) ((vec_t)B(punpckhdq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
 # define interleave_lo(x, y) ((vec_t)B(punpckldq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
 # define swap(x) ((vec_t)B(pshufd, _mask, (vsi_t)(x), 0b00011011, (vsi_t)undef(), ~0))
 # else
+# define broadcast_quartet(x) ((vec_t)B(broadcasti32x4_, _mask, (vsi_quartet_t)(x), (vsi_t)undef(), ~0))
+# define insert_pair(x, y, p) \
+    (vec_t)(B(inserti32x4_, _mask, (vsi_t)(x), \
+      /* First cast needed below to work around gcc 7.x quirk. */ \
+      (p) & 1 ? (vsi_pair_t)__builtin_ia32_pshufd((vsi_pair_t)(y), 0b01000100) \
+              : (vsi_pair_t)(y), \
+      (p) >> 1, (vsi_t)(x), 3 << ((p) * 2)))
+# define insert_quartet(x, y, p) ((vec_t)B(inserti32x4_, _mask, (vsi_t)(x), (vsi_quartet_t)(y), p, (vsi_t)undef(), ~0))
 # define interleave_hi(x, y) ((vec_t)B(vpermi2vard, _mask, (vsi_t)(x), interleave_hi, (vsi_t)(y), ~0))
 # define interleave_lo(x, y) ((vec_t)B(vpermt2vard, _mask, interleave_lo, (vsi_t)(x), (vsi_t)(y), ~0))
 # define swap(x) ((vec_t)B(pshufd, _mask, \
@@ -347,6 +423,14 @@ static inline bool _to_bool(byte_vec_t b
     t_; \
 })
 # endif
+# if VEC_SIZE >= 32 && defined(__AVX512DQ__)
+# define broadcast_pair(x) ((vec_t)B(broadcasti64x2_, _mask, (vdi_pair_t)(x), (vdi_t)undef(), ~0))
+# define insert_pair(x, y, p) ((vec_t)B(inserti64x2_, _mask, (vdi_t)(x), (vdi_pair_t)(y), p, (vdi_t)undef(), ~0))
+# endif
+# if VEC_SIZE == 64
+# define broadcast_quartet(x) ((vec_t)B(broadcasti64x4_, , (vdi_quartet_t)(x), (vdi_t)undef(), ~0))
+# define insert_quartet(x, y, p) ((vec_t)B(inserti64x4_, _mask, (vdi_t)(x), (vdi_quartet_t)(y), p, (vdi_t)undef(), ~0))
+# endif
 # if VEC_SIZE == 16
 # define interleave_hi(x, y) ((vec_t)B(punpckhqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
 # define interleave_lo(x, y) ((vec_t)B(punpcklqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
@@ -898,7 +982,7 @@ static inline eighth_t low_eighth(vec_t
     eighth_t y;
     unsigned int i;
 
-    for ( i = 0; i < ELEM_COUNT / 4; ++i )
+    for ( i = 0; i < ELEM_COUNT / 8; ++i )
         y[i] = x[i];
 
     return y;
@@ -910,6 +994,50 @@ static inline eighth_t low_eighth(vec_t
 
 #endif
 
+#ifdef broadcast_pair
+# if ELEM_COUNT == 4
+# define broadcast_half broadcast_pair
+# elif ELEM_COUNT == 8
+# define broadcast_quarter broadcast_pair
+# elif ELEM_COUNT == 16
+# define broadcast_eighth broadcast_pair
+# endif
+#endif
+
+#ifdef insert_pair
+# if ELEM_COUNT == 4
+# define insert_half insert_pair
+# elif ELEM_COUNT == 8
+# define insert_quarter insert_pair
+# elif ELEM_COUNT == 16
+# define insert_eighth insert_pair
+# endif
+#endif
+
+#ifdef broadcast_quartet
+# if ELEM_COUNT == 8
+# define broadcast_half broadcast_quartet
+# elif ELEM_COUNT == 16
+# define broadcast_quarter broadcast_quartet
+# endif
+#endif
+
+#ifdef insert_quartet
+# if ELEM_COUNT == 8
+# define insert_half insert_quartet
+# elif ELEM_COUNT == 16
+# define insert_quarter insert_quartet
+# endif
+#endif
+
+#if defined(broadcast_octet) && ELEM_COUNT == 16
+# define broadcast_half broadcast_octet
+#endif
+
+#if defined(insert_octet) && ELEM_COUNT == 16
+# define insert_half insert_octet
+#endif
+
 #if defined(__AVX512F__) && defined(FLOAT_SIZE)
 # include "simd-fma.c"
 #endif
@@ -1205,6 +1333,60 @@ int simd_test(void)
     if ( !eq(broadcast2(ELEM_COUNT + 1), src + inv) ) return __LINE__;
 #endif
 
+#if defined(broadcast_half) && defined(insert_half)
+    {
+        half_t aux = low_half(src);
+
+        touch(aux);
+        x = broadcast_half(aux);
+        touch(aux);
+        y = insert_half(src, aux, 1);
+        if ( !eq(x, y) ) return __LINE__;
+    }
+#endif
+
+#if defined(broadcast_quarter) && defined(insert_quarter)
+    {
+        quarter_t aux = low_quarter(src);
+
+        touch(aux);
+        x = broadcast_quarter(aux);
+        touch(aux);
+        y = insert_quarter(src, aux, 1);
+        touch(aux);
+        y = insert_quarter(y, aux, 2);
+        touch(aux);
+        y = insert_quarter(y, aux, 3);
+        if ( !eq(x, y) ) return __LINE__;
+    }
+#endif
+
+#if defined(broadcast_eighth) && defined(insert_eighth) && \
+    /* At least gcc 7.3 "optimizes" away all insert_eighth() calls below. */ \
+    __GNUC__ >= 8
+    {
+        eighth_t aux = low_eighth(src);
+
+        touch(aux);
+        x = broadcast_eighth(aux);
+        touch(aux);
+        y = insert_eighth(src, aux, 1);
+        touch(aux);
+        y = insert_eighth(y, aux, 2);
+        touch(aux);
+        y = insert_eighth(y, aux, 3);
+        touch(aux);
+        y = insert_eighth(y, aux, 4);
+        touch(aux);
+        y = insert_eighth(y, aux, 5);
+        touch(aux);
+        y = insert_eighth(y, aux, 6);
+        touch(aux);
+        y = insert_eighth(y, aux, 7);
+        if ( !eq(x, y) ) return __LINE__;
+    }
+#endif
+
 #if defined(interleave_lo) && defined(interleave_hi)
     touch(src);
     x = interleave_lo(inv, src);
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -23,6 +23,7 @@ asm ( ".pushsection .test, \"ax\", @prog
 #include "avx512bw-opmask.h"
 #include "avx512f.h"
 #include "avx512bw.h"
+#include "avx512dq.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -100,6 +101,11 @@ static bool simd_check_avx512dq(void)
 }
 #define simd_check_avx512dq_opmask simd_check_avx512dq
 
+static bool simd_check_avx512dq_vl(void)
+{
+    return cpu_has_avx512dq && cpu_has_avx512vl;
+}
+
 static bool simd_check_avx512bw(void)
 {
     return cpu_has_avx512bw;
@@ -267,9 +273,10 @@ static const struct {
     SIMD(XOP i32x8, xop, 32i4),
     SIMD(XOP i64x4, xop, 32i8),
     SIMD(OPMASK/w, avx512f_opmask, 2),
-    SIMD(OPMASK/b, avx512dq_opmask, 1),
-    SIMD(OPMASK/d, avx512bw_opmask, 4),
-    SIMD(OPMASK/q, avx512bw_opmask, 8),
+    SIMD(OPMASK+DQ/b, avx512dq_opmask, 1),
+    SIMD(OPMASK+DQ/w, avx512dq_opmask, 2),
+    SIMD(OPMASK+BW/d, avx512bw_opmask, 4),
+    SIMD(OPMASK+BW/q, avx512bw_opmask, 8),
     SIMD(AVX512F f32 scalar, avx512f, f4),
     SIMD(AVX512F f32x16, avx512f, 64f4),
     SIMD(AVX512F f64 scalar, avx512f, f8),
@@ -302,6 +309,24 @@ static const struct {
     AVX512VL(BW+VL u16x8, avx512bw, 16u2),
     AVX512VL(BW+VL s16x16, avx512bw, 32i2),
     AVX512VL(BW+VL u16x16, avx512bw, 32u2),
+    SIMD(AVX512DQ f32x16, avx512dq, 64f4),
+    SIMD(AVX512DQ f64x8, avx512dq, 64f8),
+    SIMD(AVX512DQ s32x16, avx512dq, 64i4),
+    SIMD(AVX512DQ u32x16, avx512dq, 64u4),
+    SIMD(AVX512DQ s64x8, avx512dq, 64i8),
+    SIMD(AVX512DQ u64x8, avx512dq, 64u8),
+    AVX512VL(DQ+VL f32x4, avx512dq, 16f4),
+    AVX512VL(DQ+VL f64x2, avx512dq, 16f8),
+    AVX512VL(DQ+VL f32x8, avx512dq, 32f4),
+    AVX512VL(DQ+VL f64x4, avx512dq, 32f8),
+    AVX512VL(DQ+VL s32x4, avx512dq, 16i4),
+    AVX512VL(DQ+VL u32x4, avx512dq, 16u4),
+    AVX512VL(DQ+VL s32x8, avx512dq, 32i4),
+    AVX512VL(DQ+VL u32x8, avx512dq, 32u4),
+    AVX512VL(DQ+VL s64x2, avx512dq, 16i8),
+    AVX512VL(DQ+VL u64x2, avx512dq, 16u8),
+    AVX512VL(DQ+VL s64x4, avx512dq, 32i8),
+    AVX512VL(DQ+VL u64x4, avx512dq, 32u8),
 #undef AVX512VL_
 #undef AVX512VL
 #undef SIMD_