@@ -63,7 +63,7 @@ avx2-sg-flts := 4 8
xop-vecs := $(avx-vecs)
xop-ints := 1 2 4 8
xop-flts := $(avx-flts)
-avx512f-vecs := 64
+avx512f-vecs := 64 16 32
avx512f-ints := 4 8
avx512f-flts := 4 8
@@ -5,13 +5,13 @@ ENTRY(fma_test);
#if VEC_SIZE < 16 && !defined(to_bool)
# define to_bool(cmp) (!~(cmp)[0])
-#elif VEC_SIZE == 16
+#elif VEC_SIZE == 16 && !defined(__AVX512VL__)
# if FLOAT_SIZE == 4
# define to_bool(cmp) __builtin_ia32_vtestcps(cmp, (vec_t){} == 0)
# elif FLOAT_SIZE == 8
# define to_bool(cmp) __builtin_ia32_vtestcpd(cmp, (vec_t){} == 0)
# endif
-#elif VEC_SIZE == 32
+#elif VEC_SIZE == 32 && !defined(__AVX512VL__)
# if FLOAT_SIZE == 4
# define to_bool(cmp) __builtin_ia32_vtestcps256(cmp, (vec_t){} == 0)
# elif FLOAT_SIZE == 8
@@ -539,7 +539,7 @@ static inline bool _to_bool(byte_vec_t b
# define rotr(x, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(x), (n) * 64))
# endif
#endif
-#if VEC_SIZE == 16 && defined(__SSE4_1__)
+#if VEC_SIZE == 16 && defined(__SSE4_1__) && !defined(__AVX512VL__)
# if INT_SIZE == 1
# define max(x, y) ((vec_t)__builtin_ia32_pmaxsb128((vqi_t)(x), (vqi_t)(y)))
# define min(x, y) ((vec_t)__builtin_ia32_pminsb128((vqi_t)(x), (vqi_t)(y)))
@@ -593,7 +593,7 @@ static inline bool _to_bool(byte_vec_t b
# define mix(x, y) __builtin_ia32_blendpd(x, y, 0b10)
# endif
#endif
-#if VEC_SIZE == 32 && defined(__AVX__)
+#if VEC_SIZE == 32 && defined(__AVX__) && !defined(__AVX512VL__)
# if FLOAT_SIZE == 4
# define dot_product(x, y) ({ \
vec_t t_ = __builtin_ia32_dpps256(x, y, 0b11110001); \
@@ -92,6 +92,15 @@ typedef long long __attribute__((vector_
#ifdef __AVX512F__
+# if VEC_SIZE > ELEM_SIZE && (defined(VEC_MAX) ? VEC_MAX : VEC_SIZE) < 64
+# pragma GCC target ( "avx512vl" )
+# endif
+
+# define REN(insn, old, new) \
+ asm ( ".macro v" #insn #old " o:vararg \n\t" \
+ "v" #insn #new " \\o \n\t" \
+ ".endm" )
+
/*
* The original plan was to effect use of EVEX encodings for scalar as well as
* 128- and 256-bit insn variants by restricting the compiler to use (on 64-bit
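Two things happen in this hunk. When the vector size in use (or VEC_MAX, if defined) stays below 64 bytes, the target pragma switches on AVX512VL, which is the prerequisite for EVEX-encoding 128- and 256-bit operations at all. REN() then uses the assembler's .macro facility to transparently rename a mnemonic, so that every instance of the legacy name the compiler emits gets assembled as the EVEX-only form. As an illustration, REN(movdqa, , 32) (used further down) expands to no more than:

    asm ( ".macro vmovdqa o:vararg \n\t"  /* redefine the legacy mnemonic ... */
          "vmovdqa32 \\o \n\t"            /* ... to emit the EVEX-only,       */
          ".endm" );                      /* dword-granular form instead      */

From that point on gas substitutes vmovdqa32 wherever the compiler writes vmovdqa, without the compiler needing to know anything about it.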
@@ -135,25 +144,88 @@ asm ( ".macro override insn \n\t"
# define OVR_FP(n) OVR_VFP(n); OVR_SFP(n)
# define OVR_INT(n) OVR_BW(n); OVR_DQ(n)
+OVR_INT(broadcast);
OVR_SFP(broadcast);
OVR_SFP(comi);
OVR_FP(add);
+OVR_INT(add);
OVR_FP(div);
OVR(extractps);
OVR_FMA(fmadd, FP);
+OVR_FMA(fmaddsub, VFP);
OVR_FMA(fmsub, FP);
+OVR_FMA(fmsubadd, VFP);
OVR_FMA(fnmadd, FP);
OVR_FMA(fnmsub, FP);
OVR(insertps);
OVR_FP(max);
+OVR_INT(maxs);
+OVR_INT(maxu);
OVR_FP(min);
+OVR_INT(mins);
+OVR_INT(minu);
OVR(movd);
OVR(movq);
OVR_SFP(mov);
+OVR_VFP(mova);
+OVR_VFP(movnt);
+OVR_VFP(movu);
OVR_FP(mul);
+OVR_VFP(shuf);
+OVR_INT(sll);
+OVR_DQ(sllv);
OVR_FP(sqrt);
+OVR_INT(sra);
+OVR_DQ(srav);
+OVR_INT(srl);
+OVR_DQ(srlv);
OVR_FP(sub);
+OVR_INT(sub);
OVR_SFP(ucomi);
+OVR_VFP(unpckh);
+OVR_VFP(unpckl);
+
+# ifdef __AVX512VL__
+# if ELEM_SIZE == 8 && defined(__AVX512DQ__)
+REN(extract, f128, f64x2);
+REN(extract, i128, i64x2);
+REN(insert, f128, f64x2);
+REN(insert, i128, i64x2);
+# else
+REN(extract, f128, f32x4);
+REN(extract, i128, i32x4);
+REN(insert, f128, f32x4);
+REN(insert, i128, i32x4);
+# endif
+# if ELEM_SIZE == 8
+REN(movdqa, , 64);
+REN(movdqu, , 64);
+REN(pand, , q);
+REN(pandn, , q);
+REN(por, , q);
+REN(pxor, , q);
+# else
+# if ELEM_SIZE == 1 && defined(__AVX512BW__)
+REN(movdq, a, u8);
+REN(movdqu, , 8);
+# elif ELEM_SIZE == 2 && defined(__AVX512BW__)
+REN(movdq, a, u16);
+REN(movdqu, , 16);
+# else
+REN(movdqa, , 32);
+REN(movdqu, , 32);
+# endif
+REN(pand, , d);
+REN(pandn, , d);
+REN(por, , d);
+REN(pxor, , d);
+# endif
+OVR(movntdq);
+OVR(movntdqa);
+OVR(pmulld);
+OVR(pmuldq);
+OVR(pmuludq);
+# endif
# undef OVR_VFP
# undef OVR_SFP
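The ELEM_SIZE-keyed branches above merely pick whichever EVEX mnemonic actually exists for the element width in use. In particular, EVEX has no byte- or word-granular aligned move, which is why vmovdqa is re-pointed at the unaligned vmovdqu8/vmovdqu16 forms when AVX512BW is available. For example, with ELEM_SIZE == 1 the two renames boil down to:

    /* REN(movdq, a, u8): aligned legacy move -> EVEX byte-granular unaligned move */
    asm ( ".macro vmovdqa o:vararg \n\t" "vmovdqu8 \\o \n\t" ".endm" );

    /* REN(movdqu, , 8): unaligned legacy move -> EVEX byte-granular form */
    asm ( ".macro vmovdqu o:vararg \n\t" "vmovdqu8 \\o \n\t" ".endm" );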
@@ -88,6 +88,11 @@ static bool simd_check_avx512f(void)
}
#define simd_check_avx512f_opmask simd_check_avx512f
+static bool simd_check_avx512f_vl(void)
+{
+ return cpu_has_avx512f && cpu_has_avx512vl;
+}
+
static bool simd_check_avx512dq(void)
{
return cpu_has_avx512dq;
@@ -142,11 +147,21 @@ static const struct {
.check_cpu = simd_check_ ## feat, \
.set_regs = simd_set_regs, \
.check_regs = simd_check_regs }
+#define AVX512VL_(bits, desc, feat, form) \
+ { .code = feat ## _x86_ ## bits ## _D ## _ ## form, \
+ .size = sizeof(feat ## _x86_ ## bits ## _D ## _ ## form), \
+ .bitness = bits, .name = "AVX512" #desc, \
+ .check_cpu = simd_check_ ## feat ## _vl, \
+ .set_regs = simd_set_regs, \
+ .check_regs = simd_check_regs }
#ifdef __x86_64__
# define SIMD(desc, feat, form) SIMD_(64, desc, feat, form), \
SIMD_(32, desc, feat, form)
+# define AVX512VL(desc, feat, form) AVX512VL_(64, desc, feat, form), \
+ AVX512VL_(32, desc, feat, form)
#else
# define SIMD(desc, feat, form) SIMD_(32, desc, feat, form)
+# define AVX512VL(desc, feat, form) AVX512VL_(32, desc, feat, form)
#endif
SIMD(3DNow! single, _3dnow, 8f4),
SIMD(SSE scalar single, sse, f4),
@@ -257,6 +272,20 @@ static const struct {
SIMD(AVX512F u32x16, avx512f, 64u4),
SIMD(AVX512F s64x8, avx512f, 64i8),
SIMD(AVX512F u64x8, avx512f, 64u8),
+ AVX512VL(VL f32x4, avx512f, 16f4),
+ AVX512VL(VL f64x2, avx512f, 16f8),
+ AVX512VL(VL f32x8, avx512f, 32f4),
+ AVX512VL(VL f64x4, avx512f, 32f8),
+ AVX512VL(VL s32x4, avx512f, 16i4),
+ AVX512VL(VL u32x4, avx512f, 16u4),
+ AVX512VL(VL s32x8, avx512f, 32i4),
+ AVX512VL(VL u32x8, avx512f, 32u4),
+ AVX512VL(VL s64x2, avx512f, 16i8),
+ AVX512VL(VL u64x2, avx512f, 16u8),
+ AVX512VL(VL s64x4, avx512f, 32i8),
+ AVX512VL(VL u64x4, avx512f, 32u8),
+#undef AVX512VL_
+#undef AVX512VL
#undef SIMD_
#undef SIMD
};
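For reference, one of the new table entries, AVX512VL(VL f32x4, avx512f, 16f4), expands on a 64-bit build to two initializers, the first of which is (give or take whitespace; the 32-bit instance is analogous, referencing avx512f_x86_32_D_16f4 with .bitness = 32):

    { .code = avx512f_x86_64_D_16f4,
      .size = sizeof(avx512f_x86_64_D_16f4),
      .bitness = 64, .name = "AVX512VL f32x4",
      .check_cpu = simd_check_avx512f_vl,
      .set_regs = simd_set_regs,
      .check_regs = simd_check_regs },

so the new entries differ from the plain AVX512F ones only in the blob variants they reference (16- and 32-byte vectors, as enabled by the Makefile change above), their names, and the CPU check additionally requiring AVX512VL.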