@@ -16,7 +16,7 @@ vpath %.c $(XEN_ROOT)/xen/lib/x86
CFLAGS += $(CFLAGS_xeninclude)
-SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw
+SIMD := 3dnow sse sse2 sse4 avx avx2 xop avx512f avx512bw avx512dq
FMA := fma4 fma
SG := avx2-sg
TESTCASES := blowfish $(SIMD) $(FMA) $(SG)
@@ -69,9 +69,12 @@ avx512f-flts := 4 8
avx512bw-vecs := $(avx512f-vecs)
avx512bw-ints := 1 2
avx512bw-flts :=
+avx512dq-vecs := $(avx512f-vecs)
+avx512dq-ints := $(avx512f-ints)
+avx512dq-flts := $(avx512f-flts)
avx512f-opmask-vecs := 2
-avx512dq-opmask-vecs := 1
+avx512dq-opmask-vecs := 1 2
avx512bw-opmask-vecs := 4 8
# Suppress building by default of the harness if the compiler can't deal
@@ -121,6 +121,34 @@ typedef int __attribute__((vector_size(E
typedef long long __attribute__((vector_size(EIGHTH_SIZE))) vdi_eighth_t;
# endif
+# define DECL_PAIR(w) \
+typedef w ## _t pair_t; \
+typedef vsi_ ## w ## _t vsi_pair_t; \
+typedef vdi_ ## w ## _t vdi_pair_t
+# define DECL_QUARTET(w) \
+typedef w ## _t quartet_t; \
+typedef vsi_ ## w ## _t vsi_quartet_t; \
+typedef vdi_ ## w ## _t vdi_quartet_t
+# define DECL_OCTET(w) \
+typedef w ## _t octet_t; \
+typedef vsi_ ## w ## _t vsi_octet_t; \
+typedef vdi_ ## w ## _t vdi_octet_t
+
+# if ELEM_COUNT == 4
+DECL_PAIR(half);
+# elif ELEM_COUNT == 8
+DECL_PAIR(quarter);
+DECL_QUARTET(half);
+# elif ELEM_COUNT == 16
+DECL_PAIR(eighth);
+DECL_QUARTET(quarter);
+DECL_OCTET(half);
+# endif
+
+# undef DECL_OCTET
+# undef DECL_QUARTET
+# undef DECL_PAIR
+
#endif
#if VEC_SIZE == 16
@@ -146,6 +174,14 @@ typedef long long __attribute__((vector_
#ifdef __AVX512F__
/* Sadly there are a few exceptions to the general naming rules. */
+# define __builtin_ia32_broadcastf32x4_512_mask __builtin_ia32_broadcastf32x4_512
+# define __builtin_ia32_broadcasti32x4_512_mask __builtin_ia32_broadcasti32x4_512
+# define __builtin_ia32_insertf32x4_512_mask __builtin_ia32_insertf32x4_mask
+# define __builtin_ia32_insertf32x8_512_mask __builtin_ia32_insertf32x8_mask
+# define __builtin_ia32_insertf64x4_512_mask __builtin_ia32_insertf64x4_mask
+# define __builtin_ia32_inserti32x4_512_mask __builtin_ia32_inserti32x4_mask
+# define __builtin_ia32_inserti32x8_512_mask __builtin_ia32_inserti32x8_mask
+# define __builtin_ia32_inserti64x4_512_mask __builtin_ia32_inserti64x4_mask
# define __builtin_ia32_shuf_f32x4_512_mask __builtin_ia32_shuf_f32x4_mask
# define __builtin_ia32_shuf_f64x2_512_mask __builtin_ia32_shuf_f64x2_mask
# define __builtin_ia32_shuf_i32x4_512_mask __builtin_ia32_shuf_i32x4_mask
@@ -331,6 +367,20 @@ OVR(punpcklwd);
# endif
# endif
+# ifdef __AVX512DQ__
+OVR_VFP(and);
+OVR_VFP(andn);
+OVR_VFP(or);
+OVR(pextrd);
+OVR(pextrq);
+OVR(pinsrd);
+OVR(pinsrq);
+# ifdef __AVX512VL__
+OVR(pmullq);
+# endif
+OVR_VFP(xor);
+# endif
+
# undef OVR_VFP
# undef OVR_SFP
# undef OVR_INT
@@ -139,6 +139,27 @@ static inline bool _to_bool(byte_vec_t b
# endif
#elif defined(FLOAT_SIZE) && defined(__AVX512F__) && \
(VEC_SIZE == 64 || defined(__AVX512VL__))
+# if ELEM_COUNT == 8 /* vextractf{32,64}x4 */ || \
+ (ELEM_COUNT == 16 && ELEM_SIZE == 4 && defined(__AVX512DQ__)) /* vextractf32x8 */ || \
+ (ELEM_COUNT == 4 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextractf64x2 */
+# define low_half(x) ({ \
+ half_t t_; \
+ asm ( "vextractf%c[w]x%c[n] $0, %[s], %[d]" \
+ : [d] "=m" (t_) \
+ : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 2) ); \
+ t_; \
+})
+# endif
+# if (ELEM_COUNT == 16 && ELEM_SIZE == 4) /* vextractf32x4 */ || \
+ (ELEM_COUNT == 8 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextractf64x2 */
+# define low_quarter(x) ({ \
+ quarter_t t_; \
+ asm ( "vextractf%c[w]x%c[n] $0, %[s], %[d]" \
+ : [d] "=m" (t_) \
+ : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 4) ); \
+ t_; \
+})
+# endif
# if FLOAT_SIZE == 4
# define broadcast(x) ({ \
vec_t t_; \
@@ -146,6 +167,17 @@ static inline bool _to_bool(byte_vec_t b
: "=v" (t_) : "m" (*(float[1]){ x }) ); \
t_; \
})
+# if VEC_SIZE >= 32 && defined(__AVX512DQ__)
+# define broadcast_pair(x) ({ \
+ vec_t t_; \
+ asm ( "vbroadcastf32x2 %1, %0" : "=v" (t_) : "m" (x) ); \
+ t_; \
+})
+# endif
+# if VEC_SIZE == 64 && defined(__AVX512DQ__)
+# define broadcast_octet(x) B(broadcastf32x8_, _mask, x, undef(), ~0)
+# define insert_octet(x, y, p) B(insertf32x8_, _mask, x, y, p, undef(), ~0)
+# endif
# define max(x, y) BR_(maxps, _mask, x, y, undef(), ~0)
# define min(x, y) BR_(minps, _mask, x, y, undef(), ~0)
# define mix(x, y) B(movaps, _mask, x, y, (0b0101010101010101 & ALL_TRUE))
@@ -155,6 +187,13 @@ static inline bool _to_bool(byte_vec_t b
# define interleave_lo(x, y) B(unpcklps, _mask, x, y, undef(), ~0)
# define swap(x) B(shufps, _mask, x, x, 0b00011011, undef(), ~0)
# else
+# define broadcast_quartet(x) B(broadcastf32x4_, _mask, x, undef(), ~0)
+# define insert_pair(x, y, p) \
+ B(insertf32x4_, _mask, x, \
+ /* Cast needed below to work around gcc 7.x quirk. */ \
+ (p) & 1 ? (typeof(y))__builtin_ia32_shufps(y, y, 0b01000100) : (y), \
+ (p) >> 1, x, 3 << ((p) * 2))
+# define insert_quartet(x, y, p) B(insertf32x4_, _mask, x, y, p, undef(), ~0)
# define interleave_hi(x, y) B(vpermi2varps, _mask, x, interleave_hi, y, ~0)
# define interleave_lo(x, y) B(vpermt2varps, _mask, interleave_lo, x, y, ~0)
# define swap(x) ({ \
@@ -178,6 +217,14 @@ static inline bool _to_bool(byte_vec_t b
t_; \
})
# endif
+# if VEC_SIZE >= 32 && defined(__AVX512DQ__)
+# define broadcast_pair(x) B(broadcastf64x2_, _mask, x, undef(), ~0)
+# define insert_pair(x, y, p) B(insertf64x2_, _mask, x, y, p, undef(), ~0)
+# endif
+# if VEC_SIZE == 64
+# define broadcast_quartet(x) B(broadcastf64x4_, , x, undef(), ~0)
+# define insert_quartet(x, y, p) B(insertf64x4_, _mask, x, y, p, undef(), ~0)
+# endif
# define max(x, y) BR_(maxpd, _mask, x, y, undef(), ~0)
# define min(x, y) BR_(minpd, _mask, x, y, undef(), ~0)
# define mix(x, y) B(movapd, _mask, x, y, 0b01010101)
@@ -306,6 +353,16 @@ static inline bool _to_bool(byte_vec_t b
t_; \
})
# endif
+# if (ELEM_COUNT == 16 && ELEM_SIZE == 4) /* vextracti32x4 */ || \
+ (ELEM_COUNT == 8 && ELEM_SIZE == 8 && defined(__AVX512DQ__)) /* vextracti64x2 */
+# define low_quarter(x) ({ \
+ quarter_t t_; \
+ asm ( "vextracti%c[w]x%c[n] $0, %[s], %[d]" \
+ : [d] "=m" (t_) \
+ : [s] "v" (x), [w] "i" (ELEM_SIZE * 8), [n] "i" (ELEM_COUNT / 4) ); \
+ t_; \
+})
+# endif
# if INT_SIZE == 4 || UINT_SIZE == 4
# define broadcast(x) ({ \
vec_t t_; \
@@ -318,11 +375,30 @@ static inline bool _to_bool(byte_vec_t b
asm ( "vpbroadcastd %k1, %0" : "=v" (t_) : "r" (x) ); \
t_; \
})
+# ifdef __AVX512DQ__
+# define broadcast_pair(x) ({ \
+ vec_t t_; \
+ asm ( "vbroadcasti32x2 %1, %0" : "=v" (t_) : "m" (x) ); \
+ t_; \
+})
+# endif
+# if VEC_SIZE == 64 && defined(__AVX512DQ__)
+# define broadcast_octet(x) ((vec_t)B(broadcasti32x8_, _mask, (vsi_octet_t)(x), (vsi_t)undef(), ~0))
+# define insert_octet(x, y, p) ((vec_t)B(inserti32x8_, _mask, (vsi_t)(x), (vsi_octet_t)(y), p, (vsi_t)undef(), ~0))
+# endif
# if VEC_SIZE == 16
# define interleave_hi(x, y) ((vec_t)B(punpckhdq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
# define interleave_lo(x, y) ((vec_t)B(punpckldq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
# define swap(x) ((vec_t)B(pshufd, _mask, (vsi_t)(x), 0b00011011, (vsi_t)undef(), ~0))
# else
+# define broadcast_quartet(x) ((vec_t)B(broadcasti32x4_, _mask, (vsi_quartet_t)(x), (vsi_t)undef(), ~0))
+# define insert_pair(x, y, p) \
+ (vec_t)(B(inserti32x4_, _mask, (vsi_t)(x), \
+ /* First cast needed below to work around gcc 7.x quirk. */ \
+ (p) & 1 ? (vsi_pair_t)__builtin_ia32_pshufd((vsi_pair_t)(y), 0b01000100) \
+ : (vsi_pair_t)(y), \
+ (p) >> 1, (vsi_t)(x), 3 << ((p) * 2)))
+# define insert_quartet(x, y, p) ((vec_t)B(inserti32x4_, _mask, (vsi_t)(x), (vsi_quartet_t)(y), p, (vsi_t)undef(), ~0))
# define interleave_hi(x, y) ((vec_t)B(vpermi2vard, _mask, (vsi_t)(x), interleave_hi, (vsi_t)(y), ~0))
# define interleave_lo(x, y) ((vec_t)B(vpermt2vard, _mask, interleave_lo, (vsi_t)(x), (vsi_t)(y), ~0))
# define swap(x) ((vec_t)B(pshufd, _mask, \
@@ -347,6 +423,14 @@ static inline bool _to_bool(byte_vec_t b
t_; \
})
# endif
+# if VEC_SIZE >= 32 && defined(__AVX512DQ__)
+# define broadcast_pair(x) ((vec_t)B(broadcasti64x2_, _mask, (vdi_pair_t)(x), (vdi_t)undef(), ~0))
+# define insert_pair(x, y, p) ((vec_t)B(inserti64x2_, _mask, (vdi_t)(x), (vdi_pair_t)(y), p, (vdi_t)undef(), ~0))
+# endif
+# if VEC_SIZE == 64
+# define broadcast_quartet(x) ((vec_t)B(broadcasti64x4_, , (vdi_quartet_t)(x), (vdi_t)undef(), ~0))
+# define insert_quartet(x, y, p) ((vec_t)B(inserti64x4_, _mask, (vdi_t)(x), (vdi_quartet_t)(y), p, (vdi_t)undef(), ~0))
+# endif
# if VEC_SIZE == 16
# define interleave_hi(x, y) ((vec_t)B(punpckhqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
# define interleave_lo(x, y) ((vec_t)B(punpcklqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
@@ -898,7 +982,7 @@ static inline eighth_t low_eighth(vec_t
eighth_t y;
unsigned int i;
- for ( i = 0; i < ELEM_COUNT / 4; ++i )
+ for ( i = 0; i < ELEM_COUNT / 8; ++i )
y[i] = x[i];
return y;
@@ -910,6 +994,50 @@ static inline eighth_t low_eighth(vec_t
#endif
+#ifdef broadcast_pair
+# if ELEM_COUNT == 4
+# define broadcast_half broadcast_pair
+# elif ELEM_COUNT == 8
+# define broadcast_quarter broadcast_pair
+# elif ELEM_COUNT == 16
+# define broadcast_eighth broadcast_pair
+# endif
+#endif
+
+#ifdef insert_pair
+# if ELEM_COUNT == 4
+# define insert_half insert_pair
+# elif ELEM_COUNT == 8
+# define insert_quarter insert_pair
+# elif ELEM_COUNT == 16
+# define insert_eighth insert_pair
+# endif
+#endif
+
+#ifdef broadcast_quartet
+# if ELEM_COUNT == 8
+# define broadcast_half broadcast_quartet
+# elif ELEM_COUNT == 16
+# define broadcast_quarter broadcast_quartet
+# endif
+#endif
+
+#ifdef insert_quartet
+# if ELEM_COUNT == 8
+# define insert_half insert_quartet
+# elif ELEM_COUNT == 16
+# define insert_quarter insert_quartet
+# endif
+#endif
+
+#if defined(broadcast_octet) && ELEM_COUNT == 16
+# define broadcast_half broadcast_octet
+#endif
+
+#if defined(insert_octet) && ELEM_COUNT == 16
+# define insert_half insert_octet
+#endif
+
#if defined(__AVX512F__) && defined(FLOAT_SIZE)
# include "simd-fma.c"
#endif
@@ -1205,6 +1333,60 @@ int simd_test(void)
if ( !eq(broadcast2(ELEM_COUNT + 1), src + inv) ) return __LINE__;
#endif
+#if defined(broadcast_half) && defined(insert_half)
+ {
+ half_t aux = low_half(src);
+
+ touch(aux);
+ x = broadcast_half(aux);
+ touch(aux);
+ y = insert_half(src, aux, 1);
+ if ( !eq(x, y) ) return __LINE__;
+ }
+#endif
+
+#if defined(broadcast_quarter) && defined(insert_quarter)
+ {
+ quarter_t aux = low_quarter(src);
+
+ touch(aux);
+ x = broadcast_quarter(aux);
+ touch(aux);
+ y = insert_quarter(src, aux, 1);
+ touch(aux);
+ y = insert_quarter(y, aux, 2);
+ touch(aux);
+ y = insert_quarter(y, aux, 3);
+ if ( !eq(x, y) ) return __LINE__;
+ }
+#endif
+
+#if defined(broadcast_eighth) && defined(insert_eighth) && \
+ /* At least gcc 7.3 "optimizes" away all insert_eighth() calls below. */ \
+ __GNUC__ >= 8
+ {
+ eighth_t aux = low_eighth(src);
+
+ touch(aux);
+ x = broadcast_eighth(aux);
+ touch(aux);
+ y = insert_eighth(src, aux, 1);
+ touch(aux);
+ y = insert_eighth(y, aux, 2);
+ touch(aux);
+ y = insert_eighth(y, aux, 3);
+ touch(aux);
+ y = insert_eighth(y, aux, 4);
+ touch(aux);
+ y = insert_eighth(y, aux, 5);
+ touch(aux);
+ y = insert_eighth(y, aux, 6);
+ touch(aux);
+ y = insert_eighth(y, aux, 7);
+ if ( !eq(x, y) ) return __LINE__;
+ }
+#endif
+
#if defined(interleave_lo) && defined(interleave_hi)
touch(src);
x = interleave_lo(inv, src);
@@ -23,6 +23,7 @@ asm ( ".pushsection .test, \"ax\", @prog
#include "avx512bw-opmask.h"
#include "avx512f.h"
#include "avx512bw.h"
+#include "avx512dq.h"
#define verbose false /* Switch to true for far more logging. */
@@ -100,6 +101,11 @@ static bool simd_check_avx512dq(void)
}
#define simd_check_avx512dq_opmask simd_check_avx512dq
+static bool simd_check_avx512dq_vl(void)
+{
+ return cpu_has_avx512dq && cpu_has_avx512vl; /* needs both DQ and VL */
+}
+
static bool simd_check_avx512bw(void)
{
return cpu_has_avx512bw;
@@ -267,9 +273,10 @@ static const struct {
SIMD(XOP i32x8, xop, 32i4),
SIMD(XOP i64x4, xop, 32i8),
SIMD(OPMASK/w, avx512f_opmask, 2),
- SIMD(OPMASK/b, avx512dq_opmask, 1),
- SIMD(OPMASK/d, avx512bw_opmask, 4),
- SIMD(OPMASK/q, avx512bw_opmask, 8),
+ SIMD(OPMASK+DQ/b, avx512dq_opmask, 1),
+ SIMD(OPMASK+DQ/w, avx512dq_opmask, 2),
+ SIMD(OPMASK+BW/d, avx512bw_opmask, 4),
+ SIMD(OPMASK+BW/q, avx512bw_opmask, 8),
SIMD(AVX512F f32 scalar, avx512f, f4),
SIMD(AVX512F f32x16, avx512f, 64f4),
SIMD(AVX512F f64 scalar, avx512f, f8),
@@ -302,6 +309,24 @@ static const struct {
AVX512VL(BW+VL u16x8, avx512bw, 16u2),
AVX512VL(BW+VL s16x16, avx512bw, 32i2),
AVX512VL(BW+VL u16x16, avx512bw, 32u2),
+ SIMD(AVX512DQ f32x16, avx512dq, 64f4),
+ SIMD(AVX512DQ f64x8, avx512dq, 64f8),
+ SIMD(AVX512DQ s32x16, avx512dq, 64i4),
+ SIMD(AVX512DQ u32x16, avx512dq, 64u4),
+ SIMD(AVX512DQ s64x8, avx512dq, 64i8),
+ SIMD(AVX512DQ u64x8, avx512dq, 64u8),
+ AVX512VL(DQ+VL f32x4, avx512dq, 16f4),
+ AVX512VL(DQ+VL f64x2, avx512dq, 16f8),
+ AVX512VL(DQ+VL f32x8, avx512dq, 32f4),
+ AVX512VL(DQ+VL f64x4, avx512dq, 32f8),
+ AVX512VL(DQ+VL s32x4, avx512dq, 16i4),
+ AVX512VL(DQ+VL u32x4, avx512dq, 16u4),
+ AVX512VL(DQ+VL s32x8, avx512dq, 32i4),
+ AVX512VL(DQ+VL u32x8, avx512dq, 32u4),
+ AVX512VL(DQ+VL s64x2, avx512dq, 16i8),
+ AVX512VL(DQ+VL u64x2, avx512dq, 16u8),
+ AVX512VL(DQ+VL s64x4, avx512dq, 32i8),
+ AVX512VL(DQ+VL u64x4, avx512dq, 32u8),
#undef AVX512VL_
#undef AVX512VL
#undef SIMD_
Test various of the insns which have been implemented already.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v6: Re-base.
v5: Re-base over changes earlier in the series.
v4: Wrap OVR(pmullq) in __AVX512VL__ conditional.
v3: New.