@@ -12,7 +12,7 @@ run: $(TARGET)
./$(TARGET)
SIMD := sse sse2 sse4 avx
-FMA := fma4
+FMA := fma4 fma
TESTCASES := blowfish $(SIMD) sse2-avx sse4-avx $(FMA)
blowfish-cflags := ""
@@ -33,6 +33,9 @@ avx-flts := 4 8
fma4-vecs := $(avx-vecs)
fma4-ints :=
fma4-flts := $(avx-flts)
+fma-vecs := $(avx-vecs)
+fma-ints :=
+fma-flts := $(avx-flts)
# When converting SSE to AVX, have the compiler avoid XMM0 to widen
# coverage of the VEX.vvvv checks in the emulator. We must not do this,
@@ -21,24 +21,24 @@ ENTRY(fma_test);
#if VEC_SIZE == 16
# if FLOAT_SIZE == 4
# define addsub(x, y) __builtin_ia32_addsubps(x, y)
-# if defined(__FMA4__)
+# if defined(__FMA4__) || defined(__FMA__)
# define fmaddsub(x, y, z) __builtin_ia32_vfmaddsubps(x, y, z)
# endif
# elif FLOAT_SIZE == 8
# define addsub(x, y) __builtin_ia32_addsubpd(x, y)
-# if defined(__FMA4__)
+# if defined(__FMA4__) || defined(__FMA__)
# define fmaddsub(x, y, z) __builtin_ia32_vfmaddsubpd(x, y, z)
# endif
# endif
#elif VEC_SIZE == 32
# if FLOAT_SIZE == 4
# define addsub(x, y) __builtin_ia32_addsubps256(x, y)
-# if defined(__FMA4__)
+# if defined(__FMA4__) || defined(__FMA__)
# define fmaddsub(x, y, z) __builtin_ia32_vfmaddsubps256(x, y, z)
# endif
# elif FLOAT_SIZE == 8
# define addsub(x, y) __builtin_ia32_addsubpd256(x, y)
-# if defined(__FMA4__)
+# if defined(__FMA4__) || defined(__FMA__)
# define fmaddsub(x, y, z) __builtin_ia32_vfmaddsubpd256(x, y, z)
# endif
# endif
@@ -12,6 +12,7 @@
#include "sse4-avx.h"
#include "avx.h"
#include "fma4.h"
+#include "fma.h"
#define verbose false /* Switch to true for far more logging. */
@@ -53,6 +54,11 @@ static bool simd_check_fma4(void)
return cpu_has_fma4;
}
+static bool simd_check_fma(void)
+{
+ return cpu_has_fma;
+}
+
static void simd_set_regs(struct cpu_user_regs *regs)
{
if ( cpu_has_mmx )
@@ -155,6 +161,12 @@ static const struct {
SIMD(FMA4 scalar double, fma4, f8),
SIMD(FMA4 128bit double, fma4, 16f8),
SIMD(FMA4 256bit double, fma4, 32f8),
+ SIMD(FMA scalar single, fma, f4),
+ SIMD(FMA 128bit single, fma, 16f4),
+ SIMD(FMA 256bit single, fma, 32f4),
+ SIMD(FMA scalar double, fma, f8),
+ SIMD(FMA 128bit double, fma, 16f8),
+ SIMD(FMA 256bit double, fma, 32f8),
#undef SIMD_
#undef SIMD
};
@@ -94,6 +94,14 @@ static inline uint64_t xgetbv(uint32_t x
(res.c & (1U << 0)) != 0; \
})
+#define cpu_has_fma ({ \
+ struct cpuid_leaf res; \
+ emul_test_cpuid(1, 0, &res, NULL); \
+ if ( !(res.c & (1U << 27)) || ((xgetbv(0) & 6) != 6) ) \
+ res.c = 0; \
+ (res.c & (1U << 12)) != 0; \
+})
+
#define cpu_has_sse4_1 ({ \
struct cpuid_leaf res; \
emul_test_cpuid(1, 0, &res, NULL); \
@@ -385,6 +385,9 @@ static const struct {
[0x37 ... 0x3f] = { .simd_size = simd_packed_int },
[0x40] = { .simd_size = simd_packed_int },
[0x41] = { .simd_size = simd_packed_int, .two_op = 1 },
+ [0x96 ... 0x9f] = { .simd_size = simd_packed_fp },
+ [0xa6 ... 0xaf] = { .simd_size = simd_packed_fp },
+ [0xb6 ... 0xbf] = { .simd_size = simd_packed_fp },
[0xc8 ... 0xcd] = { .simd_size = simd_other },
[0xdb] = { .simd_size = simd_packed_int, .two_op = 1 },
[0xdc ... 0xdf] = { .simd_size = simd_packed_int },
@@ -1605,6 +1608,7 @@ static bool vcpu_has(
#define vcpu_has_sse3() vcpu_has( 1, ECX, 0, ctxt, ops)
#define vcpu_has_pclmulqdq() vcpu_has( 1, ECX, 1, ctxt, ops)
#define vcpu_has_ssse3() vcpu_has( 1, ECX, 9, ctxt, ops)
+#define vcpu_has_fma() vcpu_has( 1, ECX, 12, ctxt, ops)
#define vcpu_has_cx16() vcpu_has( 1, ECX, 13, ctxt, ops)
#define vcpu_has_sse4_1() vcpu_has( 1, ECX, 19, ctxt, ops)
#define vcpu_has_sse4_2() vcpu_has( 1, ECX, 20, ctxt, ops)
@@ -7352,6 +7356,39 @@ x86_emulate(
generate_exception_if(vex.l, EXC_UD);
goto simd_0f_avx;
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x96): /* vfmaddsub132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x97): /* vfmsubadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x98): /* vfmadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x99): /* vfmadd132s{s,d} xmm/mem,xmm,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x9a): /* vfmsub132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x9b): /* vfmsub132s{s,d} xmm/mem,xmm,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x9c): /* vfnmadd132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x9d): /* vfnmadd132s{s,d} xmm/mem,xmm,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x9e): /* vfnmsub132p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0x9f): /* vfnmsub132s{s,d} xmm/mem,xmm,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xa6): /* vfmaddsub213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xa7): /* vfmsubadd213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xa8): /* vfmadd213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xa9): /* vfmadd213s{s,d} xmm/mem,xmm,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xaa): /* vfmsub213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xab): /* vfmsub213s{s,d} xmm/mem,xmm,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xac): /* vfnmadd213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xad): /* vfnmadd213s{s,d} xmm/mem,xmm,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xae): /* vfnmsub213p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xaf): /* vfnmsub213s{s,d} xmm/mem,xmm,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xb6): /* vfmaddsub231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xb7): /* vfmsubadd231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xb8): /* vfmadd231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xb9): /* vfmadd231s{s,d} xmm/mem,xmm,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xba): /* vfmsub231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xbb): /* vfmsub231s{s,d} xmm/mem,xmm,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xbc): /* vfnmadd231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xbd): /* vfnmadd231s{s,d} xmm/mem,xmm,xmm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xbe): /* vfnmsub231p{s,d} {x,y}mm/mem,{x,y}mm,{x,y}mm */
+ case X86EMUL_OPC_VEX_66(0x0f38, 0xbf): /* vfnmsub231s{s,d} xmm/mem,xmm,xmm */
+ host_and_vcpu_must_have(fma);
+ goto simd_0f_ymm;
+
case X86EMUL_OPC(0x0f38, 0xc8): /* sha1nexte xmm/m128,xmm */
case X86EMUL_OPC(0x0f38, 0xc9): /* sha1msg1 xmm/m128,xmm */
case X86EMUL_OPC(0x0f38, 0xca): /* sha1msg2 xmm/m128,xmm */
@@ -50,6 +50,7 @@
#define cpu_has_vmx boot_cpu_has(X86_FEATURE_VMX)
#define cpu_has_eist boot_cpu_has(X86_FEATURE_EIST)
#define cpu_has_ssse3 boot_cpu_has(X86_FEATURE_SSSE3)
+#define cpu_has_fma boot_cpu_has(X86_FEATURE_FMA)
#define cpu_has_cx16 boot_cpu_has(X86_FEATURE_CX16)
#define cpu_has_pdcm boot_cpu_has(X86_FEATURE_PDCM)
#define cpu_has_pcid boot_cpu_has(X86_FEATURE_PCID)