From patchwork Fri Mar 15 10:38:35 2019
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Jan Beulich <JBeulich@suse.com>
X-Patchwork-Id: 10854453
Return-Path: <xen-devel-bounces@lists.xenproject.org>
Received: from mail.wl.linuxfoundation.org (pdx-wl-mail.web.codeaurora.org
 [172.30.200.125])
	by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id E77961575
	for <patchwork-xen-devel@patchwork.kernel.org>;
 Fri, 15 Mar 2019 10:40:25 +0000 (UTC)
Received: from mail.wl.linuxfoundation.org (localhost [127.0.0.1])
	by mail.wl.linuxfoundation.org (Postfix) with ESMTP id CBE812A934
	for <patchwork-xen-devel@patchwork.kernel.org>;
 Fri, 15 Mar 2019 10:40:25 +0000 (UTC)
Received: by mail.wl.linuxfoundation.org (Postfix, from userid 486)
	id BFC832A937; Fri, 15 Mar 2019 10:40:25 +0000 (UTC)
X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on
	pdx-wl-mail.web.codeaurora.org
X-Spam-Level: 
X-Spam-Status: No, score=-5.2 required=2.0 tests=BAYES_00,MAILING_LIST_MULTI,
	RCVD_IN_DNSWL_MED autolearn=ham version=3.3.1
Received: from lists.xenproject.org (lists.xenproject.org [192.237.175.120])
	(using TLSv1.2 with cipher AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by mail.wl.linuxfoundation.org (Postfix) with ESMTPS id D7BCD2A934
	for <patchwork-xen-devel@patchwork.kernel.org>;
 Fri, 15 Mar 2019 10:40:24 +0000 (UTC)
Received: from localhost ([127.0.0.1] helo=lists.xenproject.org)
	by lists.xenproject.org with esmtp (Exim 4.89)
	(envelope-from <xen-devel-bounces@lists.xenproject.org>)
	id 1h4kEb-0003EJ-I9; Fri, 15 Mar 2019 10:38:41 +0000
Received: from all-amaz-eas1.inumbo.com ([34.197.232.57]
 helo=us1-amaz-eas2.inumbo.com)
 by lists.xenproject.org with esmtp (Exim 4.89)
 (envelope-from
 <SRS0=64UD=RS=suse.com=jbeulich@srs-us1.protection.inumbo.net>)
 id 1h4kEa-0003E9-1H
 for xen-devel@lists.xenproject.org; Fri, 15 Mar 2019 10:38:40 +0000
X-Inumbo-ID: 7dfca9dc-470e-11e9-8d48-b7475793dce7
Received: from prv1-mh.provo.novell.com (unknown [137.65.248.33])
 by us1-amaz-eas2.inumbo.com (Halon) with ESMTPS
 id 7dfca9dc-470e-11e9-8d48-b7475793dce7;
 Fri, 15 Mar 2019 10:38:38 +0000 (UTC)
Received: from INET-PRV1-MTA by prv1-mh.provo.novell.com
 with Novell_GroupWise; Fri, 15 Mar 2019 04:38:37 -0600
Message-Id: <5C8B80AB020000780021F11F@prv1-mh.provo.novell.com>
X-Mailer: Novell GroupWise Internet Agent 18.1.0 
Date: Fri, 15 Mar 2019 04:38:35 -0600
From: "Jan Beulich" <JBeulich@suse.com>
To: "xen-devel" <xen-devel@lists.xenproject.org>
References: <5B6BF83602000078001DC548@prv1-mh.provo.novell.com>
 <5C8B7EC0020000780021F10B@prv1-mh.provo.novell.com>
In-Reply-To: <5C8B7EC0020000780021F10B@prv1-mh.provo.novell.com>
Mime-Version: 1.0
Content-Disposition: inline
Subject: [Xen-devel] [PATCH v8 04/50] x86emul: basic AVX512F testing
X-BeenThere: xen-devel@lists.xenproject.org
X-Mailman-Version: 2.1.23
Precedence: list
List-Id: Xen developer discussion <xen-devel.lists.xenproject.org>
List-Unsubscribe: <https://lists.xenproject.org/mailman/options/xen-devel>,
 <mailto:xen-devel-request@lists.xenproject.org?subject=unsubscribe>
List-Post: <mailto:xen-devel@lists.xenproject.org>
List-Help: <mailto:xen-devel-request@lists.xenproject.org?subject=help>
List-Subscribe: <https://lists.xenproject.org/mailman/listinfo/xen-devel>,
 <mailto:xen-devel-request@lists.xenproject.org?subject=subscribe>
Cc: George Dunlap <George.Dunlap@eu.citrix.com>,
 Andrew Cooper <andrew.cooper3@citrix.com>, Wei Liu <wei.liu2@citrix.com>,
 Roger Pau Monne <roger.pau@citrix.com>
Errors-To: xen-devel-bounces@lists.xenproject.org
Sender: "Xen-devel" <xen-devel-bounces@lists.xenproject.org>
X-Virus-Scanned: ClamAV using ClamSMTP

Test various of the insns which have been implemented already.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
v6: Fix formatting in simd.h.
v5: Add VSQRT* tests.
v4: Make eq() also work for 4- and 8-byte integer element sizes.
v3: New.

--- a/tools/tests/x86_emulator/Makefile
+++ b/tools/tests/x86_emulator/Makefile
@@ -16,7 +16,7 @@ vpath %.c $(XEN_ROOT)/xen/lib/x86
 
 CFLAGS += $(CFLAGS_xeninclude)
 
-SIMD := 3dnow sse sse2 sse4 avx avx2 xop
+SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f
 FMA := fma4 fma
 SG := avx2-sg
 TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
@@ -63,6 +63,9 @@ avx2-sg-flts := 4 8
 xop-vecs := $(avx-vecs)
 xop-ints := 1 2 4 8
 xop-flts := $(avx-flts)
+avx512f-vecs := 64
+avx512f-ints := 4 8
+avx512f-flts := 4 8
 
 avx512f-opmask-vecs := 2
 avx512dq-opmask-vecs := 1
@@ -170,7 +173,7 @@ $(addsuffix .c,$(SG)):
 
 $(addsuffix .h,$(SIMD) $(FMA) $(SG)): simd.h
 
-xop.h: simd-fma.c
+xop.h avx512f.h: simd-fma.c
 
 endif # 32-bit override
 
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -2,7 +2,41 @@
 
 ENTRY(simd_test);
 
-#if VEC_SIZE == 8 && defined(__SSE__)
+#if defined(__AVX512F__)
+# define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT))
+# if VEC_SIZE == 4
+#  define eq(x, y) ({ \
+    float x_ = (x)[0]; \
+    float __attribute__((vector_size(16))) y_ = { (y)[0] }; \
+    unsigned short r_; \
+    asm ( "vcmpss $0, %1, %2, %0"  : "=k" (r_) : "m" (x_), "v" (y_) ); \
+    r_ == 1; \
+})
+# elif VEC_SIZE == 8
+#  define eq(x, y) ({ \
+    double x_ = (x)[0]; \
+    double __attribute__((vector_size(16))) y_ = { (y)[0] }; \
+    unsigned short r_; \
+    asm ( "vcmpsd $0, %1, %2, %0"  : "=k" (r_) : "m" (x_), "v" (y_) ); \
+    r_ == 1; \
+})
+# elif FLOAT_SIZE == 4
+/*
+ * gcc's (up to at least 8.2) __builtin_ia32_cmpps256_mask() has an anomaly in
+ * that its return type is QI rather than UQI, and hence the value would get
+ * sign-extended before comapring to ALL_TRUE. The same oddity does not matter
+ * for __builtin_ia32_cmppd256_mask(), as there only 4 bits are significant.
+ * Hence the extra " & ALL_TRUE".
+ */
+#  define eq(x, y) ((BR(cmpps, _mask, x, y, 0, -1) & ALL_TRUE) == ALL_TRUE)
+# elif FLOAT_SIZE == 8
+#  define eq(x, y) (BR(cmppd, _mask, x, y, 0, -1) == ALL_TRUE)
+# elif INT_SIZE == 4 || UINT_SIZE == 4
+#  define eq(x, y) (B(pcmpeqd, _mask, (vsi_t)(x), (vsi_t)(y), -1) == ALL_TRUE)
+# elif INT_SIZE == 8 || UINT_SIZE == 8
+#  define eq(x, y) (B(pcmpeqq, _mask, (vdi_t)(x), (vdi_t)(y), -1) == ALL_TRUE)
+# endif
+#elif VEC_SIZE == 8 && defined(__SSE__)
 # define to_bool(cmp) (__builtin_ia32_pmovmskb(cmp) == 0xff)
 #elif VEC_SIZE == 16
 # if defined(__AVX__) && defined(FLOAT_SIZE)
@@ -93,6 +127,56 @@ static inline bool _to_bool(byte_vec_t b
     touch(x); \
     __builtin_ia32_pfrcpit2(__builtin_ia32_pfrsqit1(__builtin_ia32_pfmul(t_, t_), x), t_); \
 })
+#elif defined(FLOAT_SIZE) && VEC_SIZE == FLOAT_SIZE && defined(__AVX512F__)
+# if FLOAT_SIZE == 4
+#  define sqrt(x) scalar_1op(x, "vsqrtss %[in], %[out], %[out]")
+# elif FLOAT_SIZE == 8
+#  define sqrt(x) scalar_1op(x, "vsqrtsd %[in], %[out], %[out]")
+# endif
+#elif defined(FLOAT_SIZE) && defined(__AVX512F__) && \
+      (VEC_SIZE == 64 || defined(__AVX512VL__))
+# if FLOAT_SIZE == 4
+#  define broadcast(x) ({ \
+    vec_t t_; \
+    asm ( "%{evex%} vbroadcastss %1, %0" \
+          : "=v" (t_) : "m" (*(float[1]){ x }) ); \
+    t_; \
+})
+#  define max(x, y) BR_(maxps, _mask, x, y, undef(), ~0)
+#  define min(x, y) BR_(minps, _mask, x, y, undef(), ~0)
+#  define mix(x, y) B(movaps, _mask, x, y, (0b0101010101010101 & ALL_TRUE))
+#  define sqrt(x) BR(sqrtps, _mask, x, undef(), ~0)
+#  if VEC_SIZE == 16
+#   define interleave_hi(x, y) B(unpckhps, _mask, x, y, undef(), ~0)
+#   define interleave_lo(x, y) B(unpcklps, _mask, x, y, undef(), ~0)
+#   define swap(x) B(shufps, _mask, x, x, 0b00011011, undef(), ~0)
+#  endif
+# elif FLOAT_SIZE == 8
+#  if VEC_SIZE >= 32
+#   define broadcast(x) ({ \
+    vec_t t_; \
+    asm ( "%{evex%} vbroadcastsd %1, %0" : "=v" (t_) \
+          : "m" (*(double[1]){ x }) ); \
+    t_; \
+})
+#  else
+#   define broadcast(x) ({ \
+    vec_t t_; \
+    asm ( "%{evex%} vpbroadcastq %1, %0" \
+          : "=v" (t_) : "m" (*(double[1]){ x }) ); \
+    t_; \
+})
+#  endif
+#  define max(x, y) BR_(maxpd, _mask, x, y, undef(), ~0)
+#  define min(x, y) BR_(minpd, _mask, x, y, undef(), ~0)
+#  define mix(x, y) B(movapd, _mask, x, y, 0b01010101)
+#  define sqrt(x) BR(sqrtpd, _mask, x, undef(), ~0)
+#  if VEC_SIZE == 16
+#   define interleave_hi(x, y) B(unpckhpd, _mask, x, y, undef(), ~0)
+#   define interleave_lo(x, y) B(unpcklpd, _mask, x, y, undef(), ~0)
+#   define swap(x) B(shufpd, _mask, x, x, 0b01, undef(), ~0)
+#  endif
+# endif
 #elif FLOAT_SIZE == 4 && defined(__SSE__)
 # if VEC_SIZE == 32 && defined(__AVX__)
 #  if defined(__AVX2__)
@@ -191,7 +275,30 @@ static inline bool _to_bool(byte_vec_t b
 #  define sqrt(x) scalar_1op(x, "sqrtsd %[in], %[out]")
 # endif
 #endif
-#if VEC_SIZE == 16 && defined(__SSE2__)
+#if (INT_SIZE == 4 || UINT_SIZE == 4 || INT_SIZE == 8 || UINT_SIZE == 8) && \
+     defined(__AVX512F__) && (VEC_SIZE == 64 || defined(__AVX512VL__))
+# if INT_SIZE == 4 || UINT_SIZE == 4
+#  define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
+                              (0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
+# elif INT_SIZE == 8 || UINT_SIZE == 8
+#  define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101))
+# endif
+# if INT_SIZE == 4
+#  define max(x, y) B(pmaxsd, _mask, x, y, undef(), ~0)
+#  define min(x, y) B(pminsd, _mask, x, y, undef(), ~0)
+#  define mul_full(x, y) ((vec_t)B(pmuldq, _mask, x, y, (vdi_t)undef(), ~0))
+# elif UINT_SIZE == 4
+#  define max(x, y) ((vec_t)B(pmaxud, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
+#  define min(x, y) ((vec_t)B(pminud, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
+#  define mul_full(x, y) ((vec_t)B(pmuludq, _mask, (vsi_t)(x), (vsi_t)(y), (vdi_t)undef(), ~0))
+# elif INT_SIZE == 8
+#  define max(x, y) ((vec_t)B(pmaxsq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+#  define min(x, y) ((vec_t)B(pminsq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+# elif UINT_SIZE == 8
+#  define max(x, y) ((vec_t)B(pmaxuq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+#  define min(x, y) ((vec_t)B(pminuq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+# endif
+#elif VEC_SIZE == 16 && defined(__SSE2__)
 # if INT_SIZE == 1 || UINT_SIZE == 1
 #  define interleave_hi(x, y) ((vec_t)__builtin_ia32_punpckhbw128((vqi_t)(x), (vqi_t)(y)))
 #  define interleave_lo(x, y) ((vec_t)__builtin_ia32_punpcklbw128((vqi_t)(x), (vqi_t)(y)))
@@ -587,6 +694,10 @@ static inline bool _to_bool(byte_vec_t b
 # endif
 #endif
 
+#if defined(__AVX512F__) && defined(FLOAT_SIZE)
+# include "simd-fma.c"
+#endif
+
 int simd_test(void)
 {
     unsigned int i, j;
@@ -1034,7 +1145,8 @@ int simd_test(void)
 # endif
 #endif
 
-#if defined(__XOP__) && VEC_SIZE == 16 && (INT_SIZE == 2 || INT_SIZE == 4)
+#if (defined(__XOP__) && VEC_SIZE == 16 && (INT_SIZE == 2 || INT_SIZE == 4)) || \
+    (defined(__AVX512F__) && defined(FLOAT_SIZE))
     return -fma_test();
 #endif
 
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -70,9 +70,111 @@ typedef int __attribute__((vector_size(V
 typedef long long __attribute__((vector_size(VEC_SIZE))) vdi_t;
 #endif
 
+#if VEC_SIZE == 16
+# define B(n, s, a...)   __builtin_ia32_ ## n ## 128 ## s(a)
+# define B_(n, s, a...)  __builtin_ia32_ ## n ##        s(a)
+#elif VEC_SIZE == 32
+# define B(n, s, a...)   __builtin_ia32_ ## n ## 256 ## s(a)
+#elif VEC_SIZE == 64
+# define B(n, s, a...)   __builtin_ia32_ ## n ## 512 ## s(a)
+# define BR(n, s, a...)  __builtin_ia32_ ## n ## 512 ## s(a, 4)
+#endif
+#ifndef B_
+# define B_ B
+#endif
+#ifndef BR
+# define BR B
+# define BR_ B_
+#endif
+#ifndef BR_
+# define BR_ BR
+#endif
+
+#ifdef __AVX512F__
+
+/*
+ * The original plan was to effect use of EVEX encodings for scalar as well as
+ * 128- and 256-bit insn variants by restricting the compiler to use (on 64-bit
+ * only of course) XMM16-XMM31 only. All sorts of compiler errors result when
+ * doing this with gcc 8.2. Therefore resort to injecting {evex} prefixes,
+ * which has the benefit of also working for 32-bit. Granted, there is a lot of
+ * escaping to get right here.
+ */
+asm ( ".macro override insn    \n\t"
+      ".macro $\\insn o:vararg \n\t"
+      ".purgem \\insn          \n\t"
+      "{evex} \\insn \\(\\)o   \n\t"
+      ".macro \\insn o:vararg  \n\t"
+      "$\\insn \\(\\(\\))o     \n\t"
+      ".endm                   \n\t"
+      ".endm                   \n\t"
+      ".macro \\insn o:vararg  \n\t"
+      "$\\insn \\(\\)o         \n\t"
+      ".endm                   \n\t"
+      ".endm" );
+
+# define OVR(n) asm ( "override v" #n )
+# define OVR_SFP(n) OVR(n ## sd); OVR(n ## ss)
+
+# ifdef __AVX512VL__
+#  ifdef __AVX512BW__
+#   define OVR_BW(n) OVR(p ## n ## b); OVR(p ## n ## w)
+#  else
+#   define OVR_BW(n)
+#  endif
+#  define OVR_DQ(n) OVR(p ## n ## d); OVR(p ## n ## q)
+#  define OVR_VFP(n) OVR(n ## pd); OVR(n ## ps)
+# else
+#  define OVR_BW(n)
+#  define OVR_DQ(n)
+#  define OVR_VFP(n)
+# endif
+
+# define OVR_FMA(n, w) OVR_ ## w(n ## 132); OVR_ ## w(n ## 213); \
+                       OVR_ ## w(n ## 231)
+# define OVR_FP(n) OVR_VFP(n); OVR_SFP(n)
+# define OVR_INT(n) OVR_BW(n); OVR_DQ(n)
+
+OVR_SFP(broadcast);
+OVR_SFP(comi);
+OVR_FP(add);
+OVR_FP(div);
+OVR(extractps);
+OVR_FMA(fmadd, FP);
+OVR_FMA(fmsub, FP);
+OVR_FMA(fnmadd, FP);
+OVR_FMA(fnmsub, FP);
+OVR(insertps);
+OVR_FP(max);
+OVR_FP(min);
+OVR(movd);
+OVR(movq);
+OVR_SFP(mov);
+OVR_FP(mul);
+OVR_FP(sqrt);
+OVR_FP(sub);
+OVR_SFP(ucomi);
+
+# undef OVR_VFP
+# undef OVR_SFP
+# undef OVR_INT
+# undef OVR_FP
+# undef OVR_FMA
+# undef OVR_DQ
+# undef OVR_BW
+# undef OVR
+
+#endif /* __AVX512F__ */
+
 /*
  * Suppress value propagation by the compiler, preventing unwanted
  * optimization. This at once makes the compiler use memory operands
  * more often, which for our purposes is the more interesting case.
  */
 #define touch(var) asm volatile ( "" : "+m" (var) )
+
+static inline vec_t undef(void)
+{
+    vec_t v = v;
+    return v;
+}
--- a/tools/tests/x86_emulator/simd-fma.c
+++ b/tools/tests/x86_emulator/simd-fma.c
@@ -1,10 +1,9 @@
+#if !defined(__XOP__) && !defined(__AVX512F__)
 #include "simd.h"
-
-#ifndef __XOP__
 ENTRY(fma_test);
 #endif
 
-#if VEC_SIZE < 16
+#if VEC_SIZE < 16 && !defined(to_bool)
 # define to_bool(cmp) (!~(cmp)[0])
 #elif VEC_SIZE == 16
 # if FLOAT_SIZE == 4
@@ -24,7 +23,13 @@ ENTRY(fma_test);
 # define eq(x, y) to_bool((x) == (y))
 #endif
 
-#if VEC_SIZE == 16
+#if defined(__AVX512F__) && VEC_SIZE > FLOAT_SIZE
+# if FLOAT_SIZE == 4
+#  define fmaddsub(x, y, z) BR(vfmaddsubps, _mask, x, y, z, ~0)
+# elif FLOAT_SIZE == 8
+#  define fmaddsub(x, y, z) BR(vfmaddsubpd, _mask, x, y, z, ~0)
+# endif
+#elif VEC_SIZE == 16
 # if FLOAT_SIZE == 4
 #  define addsub(x, y) __builtin_ia32_addsubps(x, y)
 #  if defined(__FMA4__) || defined(__FMA__)
@@ -50,6 +55,10 @@ ENTRY(fma_test);
 # endif
 #endif
 
+#if defined(fmaddsub) && !defined(addsub)
+# define addsub(x, y) fmaddsub(x, broadcast(1), y)
+#endif
+
 int fma_test(void)
 {
     unsigned int i;
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -21,6 +21,7 @@ asm ( ".pushsection .test, \"ax\", @prog
 #include "avx512f-opmask.h"
 #include "avx512dq-opmask.h"
 #include "avx512bw-opmask.h"
+#include "avx512f.h"
 
 #define verbose false /* Switch to true for far more logging. */
 
@@ -248,6 +249,14 @@ static const struct {
     SIMD(OPMASK/b,    avx512dq_opmask,         1),
     SIMD(OPMASK/d,    avx512bw_opmask,         4),
     SIMD(OPMASK/q,    avx512bw_opmask,         8),
+    SIMD(AVX512F f32 scalar,  avx512f,        f4),
+    SIMD(AVX512F f32x16,      avx512f,      64f4),
+    SIMD(AVX512F f64 scalar,  avx512f,        f8),
+    SIMD(AVX512F f64x8,       avx512f,      64f8),
+    SIMD(AVX512F s32x16,      avx512f,      64i4),
+    SIMD(AVX512F u32x16,      avx512f,      64u4),
+    SIMD(AVX512F s64x8,       avx512f,      64i8),
+    SIMD(AVX512F u64x8,       avx512f,      64u8),
 #undef SIMD_
 #undef SIMD
 };