diff mbox series

[3/3] tests/tcg/s390x: test signed vfmin/vfmax

Message ID 20220712015717.3602602-4-iii@linux.ibm.com (mailing list archive)
State New, archived
Headers show
Series target/s390x: vfmin/vfmax fixes | expand

Commit Message

Ilya Leoshkevich July 12, 2022, 1:57 a.m. UTC
Add a test to prevent regressions. Try all floating point value sizes
and all combinations of floating point value classes. Verify the results
against PoP tables, which are represented as close to the original as
possible - this produces a lot of checkpatch complaints, but it seems
to be justified in this case.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
---
 tests/tcg/s390x/Makefile.target |   7 +
 tests/tcg/s390x/vfminmax.c      | 426 ++++++++++++++++++++++++++++++++
 2 files changed, 433 insertions(+)
 create mode 100644 tests/tcg/s390x/vfminmax.c

Comments

Richard Henderson July 12, 2022, 4:49 a.m. UTC | #1
On 7/12/22 07:27, Ilya Leoshkevich wrote:
> +/*
> + * vfmin/vfmax code generation.
> + */
> +extern const char vfminmax_template[];
> +extern const int vfminmax_template_size;
> +extern const int vfminmax_offset;
> +asm(".globl vfminmax_template\n"
> +    "vfminmax_template:\n"
> +    "vl %v25,0(%r3)\n"
> +    "vl %v26,0(%r4)\n"
> +    "0: vfmax %v24,%v25,%v26,2,0,0\n"
> +    "vst %v24,0(%r2)\n"
> +    "br %r14\n"
> +    "1: .align 4\n"
> +    ".globl vfminmax_template_size\n"
> +    "vfminmax_template_size: .long 1b - vfminmax_template\n"
> +    ".globl vfminmax_offset\n"
> +    "vfminmax_offset: .long 0b - vfminmax_template\n");
...
> +
> +#define VFMIN 0xEE
> +#define VFMAX 0xEF
> +
> +static void vfminmax(unsigned char *buf, unsigned int op,
> +                     unsigned int m4, unsigned int m5, unsigned int m6,
> +                     void *v1, const void *v2, const void *v3)
> +{
> +    memcpy(buf, vfminmax_template, vfminmax_template_size);
> +    buf[vfminmax_offset + 3] = (m6 << 4) | m5;
> +    buf[vfminmax_offset + 4] &= 0x0F;
> +    buf[vfminmax_offset + 4] |= (m4 << 4);
> +    buf[vfminmax_offset + 5] = op;
> +    ((void (*)(void *, const void *, const void *))buf)(v1, v2, v3);
> +}

This works, of course.  It could be simpler using EXECUTE, to store just the one 
instruction and not worry about an executable mapped page, but I guess it doesn't matter.

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>


r~
Ilya Leoshkevich July 12, 2022, 12:32 p.m. UTC | #2
On Tue, 2022-07-12 at 10:19 +0530, Richard Henderson wrote:
> On 7/12/22 07:27, Ilya Leoshkevich wrote:
> > +/*
> > + * vfmin/vfmax code generation.
> > + */
> > +extern const char vfminmax_template[];
> > +extern const int vfminmax_template_size;
> > +extern const int vfminmax_offset;
> > +asm(".globl vfminmax_template\n"
> > +    "vfminmax_template:\n"
> > +    "vl %v25,0(%r3)\n"
> > +    "vl %v26,0(%r4)\n"
> > +    "0: vfmax %v24,%v25,%v26,2,0,0\n"
> > +    "vst %v24,0(%r2)\n"
> > +    "br %r14\n"
> > +    "1: .align 4\n"
> > +    ".globl vfminmax_template_size\n"
> > +    "vfminmax_template_size: .long 1b - vfminmax_template\n"
> > +    ".globl vfminmax_offset\n"
> > +    "vfminmax_offset: .long 0b - vfminmax_template\n");
> ...
> > +
> > +#define VFMIN 0xEE
> > +#define VFMAX 0xEF
> > +
> > +static void vfminmax(unsigned char *buf, unsigned int op,
> > +                     unsigned int m4, unsigned int m5, unsigned
> > int m6,
> > +                     void *v1, const void *v2, const void *v3)
> > +{
> > +    memcpy(buf, vfminmax_template, vfminmax_template_size);
> > +    buf[vfminmax_offset + 3] = (m6 << 4) | m5;
> > +    buf[vfminmax_offset + 4] &= 0x0F;
> > +    buf[vfminmax_offset + 4] |= (m4 << 4);
> > +    buf[vfminmax_offset + 5] = op;
> > +    ((void (*)(void *, const void *, const void *))buf)(v1, v2,
> > v3);
> > +}
> 
> This works, of course.  It could be simpler using EXECUTE, to store
> just the one 
> instruction and not worry about an executable mapped page, but I
> guess it doesn't matter.
> 
> Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
> 
> 
> r~

Thanks!

I thought about this too, but EX/EXRL operate only on the second byte,
and I need to modify bytes 3-5 here.
Richard Henderson July 13, 2022, 3:44 p.m. UTC | #3
On 7/12/22 18:02, Ilya Leoshkevich wrote:
>> This works, of course.  It could be simpler using EXECUTE, to store
>> just the one
>> instruction and not worry about an executable mapped page, but I
>> guess it doesn't matter.
> 
> I thought about this too, but EX/EXRL operate only on the second byte,
> and I need to modify bytes 3-5 here.

I didn't mean modify the instruction via EX, but something like

   static char minmax[6] __attribute__((aligned(2)))
     = { xx, yy, zz, 0, 0, 0 };

   minmax[3] = m6 ...
   minmax[4] = ...
   minmax[5] = op;

   asm("vl %%v25,0(%1)\n"
       "vl %%v26,0(%2)\n"
       "ex 0,0(%3)\n"
       "vst %%v24,0(%0)"
       : : "a"(v1), "a"(v2), "a"(v3), "a"(minmax)
       : "memory", "v24", "v25", "v26);


r~
Ilya Leoshkevich July 13, 2022, 4:14 p.m. UTC | #4
On Wed, 2022-07-13 at 21:14 +0530, Richard Henderson wrote:
> On 7/12/22 18:02, Ilya Leoshkevich wrote:
> > > This works, of course.  It could be simpler using EXECUTE, to
> > > store
> > > just the one
> > > instruction and not worry about an executable mapped page, but I
> > > guess it doesn't matter.
> > 
> > I thought about this too, but EX/EXRL operate only on the second
> > byte,
> > and I need to modify bytes 3-5 here.
> 
> I didn't mean modify the instruction via EX, but something like
> 
>    static char minmax[6] __attribute__((aligned(2)))
>      = { xx, yy, zz, 0, 0, 0 };
> 
>    minmax[3] = m6 ...
>    minmax[4] = ...
>    minmax[5] = op;
> 
>    asm("vl %%v25,0(%1)\n"
>        "vl %%v26,0(%2)\n"
>        "ex 0,0(%3)\n"
>        "vst %%v24,0(%0)"
>        : : "a"(v1), "a"(v2), "a"(v3), "a"(minmax)
>        : "memory", "v24", "v25", "v26);
> 
> 
> r~

Nice trick!

This works in qemu, but not natively: EX target must be executable.
I'd still like to try to find a way to establish an rwx section, and
send a v2 with this improvement.

I guess we'll need to fix the access check discrepancy some day.
diff mbox series

Patch

diff --git a/tests/tcg/s390x/Makefile.target b/tests/tcg/s390x/Makefile.target
index 3124172736..1a7a4a2f59 100644
--- a/tests/tcg/s390x/Makefile.target
+++ b/tests/tcg/s390x/Makefile.target
@@ -17,6 +17,13 @@  TESTS+=trap
 TESTS+=signals-s390x
 TESTS+=branch-relative-long
 
+Z14_TESTS=vfminmax
+vfminmax: LDFLAGS+=-lm
+$(Z14_TESTS): CFLAGS+=-march=z14 -O2
+
+TESTS+=$(if $(shell $(CC) -march=z14 -S -o /dev/null -xc /dev/null \
+			 >/dev/null 2>&1 && echo OK),$(Z14_TESTS))
+
 VECTOR_TESTS=vxeh2_vs
 VECTOR_TESTS+=vxeh2_vcvt
 VECTOR_TESTS+=vxeh2_vlstr
diff --git a/tests/tcg/s390x/vfminmax.c b/tests/tcg/s390x/vfminmax.c
new file mode 100644
index 0000000000..daf58b6b33
--- /dev/null
+++ b/tests/tcg/s390x/vfminmax.c
@@ -0,0 +1,426 @@ 
+#define _GNU_SOURCE
+#include <fenv.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/mman.h>
+
+/*
+ * vfmin/vfmax code generation.
+ */
+extern const char vfminmax_template[];
+extern const int vfminmax_template_size;
+extern const int vfminmax_offset;
+asm(".globl vfminmax_template\n"
+    "vfminmax_template:\n"
+    "vl %v25,0(%r3)\n"
+    "vl %v26,0(%r4)\n"
+    "0: vfmax %v24,%v25,%v26,2,0,0\n"
+    "vst %v24,0(%r2)\n"
+    "br %r14\n"
+    "1: .align 4\n"
+    ".globl vfminmax_template_size\n"
+    "vfminmax_template_size: .long 1b - vfminmax_template\n"
+    ".globl vfminmax_offset\n"
+    "vfminmax_offset: .long 0b - vfminmax_template\n");
+
+#define VFMIN 0xEE
+#define VFMAX 0xEF
+
+static void vfminmax(unsigned char *buf, unsigned int op,
+                     unsigned int m4, unsigned int m5, unsigned int m6,
+                     void *v1, const void *v2, const void *v3)
+{
+    memcpy(buf, vfminmax_template, vfminmax_template_size);
+    buf[vfminmax_offset + 3] = (m6 << 4) | m5;
+    buf[vfminmax_offset + 4] &= 0x0F;
+    buf[vfminmax_offset + 4] |= (m4 << 4);
+    buf[vfminmax_offset + 5] = op;
+    ((void (*)(void *, const void *, const void *))buf)(v1, v2, v3);
+}
+
+/*
+ * Floating-point value classes.
+ */
+#define N_FORMATS 3
+#define N_SIGNED_CLASSES 8
+static const size_t float_sizes[N_FORMATS] = {
+    /* M4 == 2: short    */ 4,
+    /* M4 == 3: long     */ 8,
+    /* M4 == 4: extended */ 16,
+};
+static const size_t e_bits[N_FORMATS] = {
+    /* M4 == 2: short    */ 8,
+    /* M4 == 3: long     */ 11,
+    /* M4 == 4: extended */ 15,
+};
+static const unsigned char signed_floats[N_FORMATS][N_SIGNED_CLASSES][2][16] = {
+    /* M4 == 2: short */
+    {
+        /* -inf */ {{0xff, 0x80, 0x00, 0x00},
+                    {0xff, 0x80, 0x00, 0x00}},
+        /* -Fn */  {{0xc2, 0x28, 0x00, 0x00},
+                    {0xc2, 0x29, 0x00, 0x00}},
+        /* -0 */   {{0x80, 0x00, 0x00, 0x00},
+                    {0x80, 0x00, 0x00, 0x00}},
+        /* +0 */   {{0x00, 0x00, 0x00, 0x00},
+                    {0x00, 0x00, 0x00, 0x00}},
+        /* +Fn */  {{0x42, 0x28, 0x00, 0x00},
+                    {0x42, 0x2a, 0x00, 0x00}},
+        /* +inf */ {{0x7f, 0x80, 0x00, 0x00},
+                    {0x7f, 0x80, 0x00, 0x00}},
+        /* QNaN */ {{0x7f, 0xff, 0xff, 0xff},
+                    {0x7f, 0xff, 0xff, 0xfe}},
+        /* SNaN */ {{0x7f, 0xbf, 0xff, 0xff},
+                    {0x7f, 0xbf, 0xff, 0xfd}},
+    },
+
+    /* M4 == 3: long */
+    {
+        /* -inf */ {{0xff, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+                    {0xff, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
+        /* -Fn */  {{0xc0, 0x45, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+                    {0xc0, 0x46, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
+        /* -0 */   {{0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+                    {0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
+        /* +0 */   {{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+                    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
+        /* +Fn */  {{0x40, 0x45, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+                    {0x40, 0x47, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
+        /* +inf */ {{0x7f, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+                    {0x7f, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
+        /* QNaN */ {{0x7f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+                    {0x7f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe}},
+        /* SNaN */ {{0x7f, 0xf7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+                    {0x7f, 0xf7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfd}},
+    },
+
+    /* M4 == 4: extended */
+    {
+        /* -inf */ {{0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+                    {0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
+        /* -Fn */  {{0xc0, 0x04, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+                    {0xc0, 0x04, 0x51, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
+        /* -0 */   {{0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+                    {0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
+        /* +0 */   {{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+                    {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
+        /* +Fn */  {{0x40, 0x04, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+                    {0x40, 0x04, 0x52, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
+        /* +inf */ {{0x7f, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+                    {0x7f, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}},
+        /* QNaN */ {{0x7f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+                    {0x7f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe}},
+        /* SNaN */ {{0x7f, 0xff, 0x7f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
+                    {0x7f, 0xff, 0x7f, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfd}},
+    },
+};
+
+/*
+ * PoP tables as close to the original as possible.
+ */
+struct signed_test {
+    int op;
+    int m6;
+    const char *m6_desc;
+    const char *table[N_SIGNED_CLASSES][N_SIGNED_CLASSES];
+} signed_tests[] = {
+    {
+        .op = VFMIN,
+        .m6 = 0,
+        .m6_desc = "IEEE MinNum",
+        .table = {
+             /*         -inf         -Fn          -0           +0           +Fn          +inf         QNaN         SNaN     */
+            {/* -inf */ "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "Xi: T(b*)"},
+            {/* -Fn  */ "T(b)",      "T(M(a,b))", "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "Xi: T(b*)"},
+            {/* -0   */ "T(b)",      "T(b)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "Xi: T(b*)"},
+            {/* +0   */ "T(b)",      "T(b)",      "T(b)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "Xi: T(b*)"},
+            {/* +Fn  */ "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(M(a,b))", "T(a)",      "T(a)",      "Xi: T(b*)"},
+            {/* +inf */ "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(a)",      "T(a)",      "Xi: T(b*)"},
+            {/* QNaN */ "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(a)",      "Xi: T(b*)"},
+            {/* SNaN */ "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)"},
+        },
+    },
+    {
+        .op = VFMIN,
+        .m6 = 1,
+        .m6_desc = "JAVA Math.Min()",
+        .table = {
+             /*         -inf         -Fn          -0           +0           +Fn          +inf         QNaN         SNaN     */
+            {/* -inf */ "T(b)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(b)",      "Xi: T(b*)"},
+            {/* -Fn  */ "T(b)",      "T(M(a,b))", "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(b)",      "Xi: T(b*)"},
+            {/* -0   */ "T(b)",      "T(b)",      "T(b)",      "T(a)",      "T(a)",      "T(a)",      "T(b)",      "Xi: T(b*)"},
+            {/* +0   */ "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(a)",      "T(a)",      "T(b)",      "Xi: T(b*)"},
+            {/* +Fn  */ "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(M(a,b))", "T(a)",      "T(b)",      "Xi: T(b*)"},
+            {/* +inf */ "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "Xi: T(b*)"},
+            {/* QNaN */ "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "Xi: T(b*)"},
+            {/* SNaN */ "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)"},
+        },
+    },
+    {
+        .op = VFMIN,
+        .m6 = 2,
+        .m6_desc = "C-style Min Macro",
+        .table = {
+             /*         -inf        -Fn          -0          +0          +Fn          +inf        QNaN        SNaN    */
+            {/* -inf */ "T(b)",     "T(a)",      "T(a)",     "T(a)",     "T(a)",      "T(a)",     "Xi: T(b)", "Xi: T(b)"},
+            {/* -Fn  */ "T(b)",     "T(M(a,b))", "T(a)",     "T(a)",     "T(a)",      "T(a)",     "Xi: T(b)", "Xi: T(b)"},
+            {/* -0   */ "T(b)",     "T(b)",      "T(b)",     "T(b)",     "T(a)",      "T(a)",     "Xi: T(b)", "Xi: T(b)"},
+            {/* +0   */ "T(b)",     "T(b)",      "T(b)",     "T(b)",     "T(a)",      "T(a)",     "Xi: T(b)", "Xi: T(b)"},
+            {/* +Fn  */ "T(b)",     "T(b)",      "T(b)",     "T(b)",     "T(M(a,b))", "T(a)",     "Xi: T(b)", "Xi: T(b)"},
+            {/* +inf */ "T(b)",     "T(b)",      "T(b)",     "T(b)",     "T(b)",      "T(a)",     "Xi: T(b)", "Xi: T(b)"},
+            {/* QNaN */ "Xi: T(b)", "Xi: T(b)",  "Xi: T(b)", "Xi: T(b)", "Xi: T(b)",  "Xi: T(b)", "Xi: T(b)", "Xi: T(b)"},
+            {/* SNaN */ "Xi: T(b)", "Xi: T(b)",  "Xi: T(b)", "Xi: T(b)", "Xi: T(b)",  "Xi: T(b)", "Xi: T(b)", "Xi: T(b)"},
+        },
+    },
+    {
+        .op = VFMIN,
+        .m6 = 3,
+        .m6_desc = "C++ algorithm.min()",
+        .table = {
+             /*         -inf        -Fn          -0          +0          +Fn          +inf        QNaN        SNaN    */
+            {/* -inf */ "T(b)",     "T(a)",      "T(a)",     "T(a)",     "T(a)",      "T(a)",     "Xi: T(a)", "Xi: T(a)"},
+            {/* -Fn  */ "T(b)",     "T(M(a,b))", "T(a)",     "T(a)",     "T(a)",      "T(a)",     "Xi: T(a)", "Xi: T(a)"},
+            {/* -0   */ "T(b)",     "T(b)",      "T(a)",     "T(a)",     "T(a)",      "T(a)",     "Xi: T(a)", "Xi: T(a)"},
+            {/* +0   */ "T(b)",     "T(b)",      "T(a)",     "T(a)",     "T(a)",      "T(a)",     "Xi: T(a)", "Xi: T(a)"},
+            {/* +Fn  */ "T(b)",     "T(b)",      "T(b)",     "T(b)",     "T(M(a,b))", "T(a)",     "Xi: T(a)", "Xi: T(a)"},
+            {/* +inf */ "T(b)",     "T(b)",      "T(b)",     "T(b)",     "T(b)",      "T(a)",     "Xi: T(a)", "Xi: T(a)"},
+            {/* QNaN */ "Xi: T(a)", "Xi: T(a)",  "Xi: T(a)", "Xi: T(a)", "Xi: T(a)",  "Xi: T(a)", "Xi: T(a)", "Xi: T(a)"},
+            {/* SNaN */ "Xi: T(a)", "Xi: T(a)",  "Xi: T(a)", "Xi: T(a)", "Xi: T(a)",  "Xi: T(a)", "Xi: T(a)", "Xi: T(a)"},
+        },
+    },
+    {
+        .op = VFMIN,
+        .m6 = 4,
+        .m6_desc = "fmin()",
+        .table = {
+             /*         -inf        -Fn          -0          +0          +Fn          +inf        QNaN        SNaN    */
+            {/* -inf */ "T(a)",     "T(a)",      "T(a)",     "T(a)",     "T(a)",      "T(a)",     "T(a)",     "Xi: T(a)"},
+            {/* -Fn  */ "T(b)",     "T(M(a,b))", "T(a)",     "T(a)",     "T(a)",      "T(a)",     "T(a)",     "Xi: T(a)"},
+            {/* -0   */ "T(b)",     "T(b)",      "T(a)",     "T(a)",     "T(a)",      "T(a)",     "T(a)",     "Xi: T(a)"},
+            {/* +0   */ "T(b)",     "T(b)",      "T(b)",     "T(a)",     "T(a)",      "T(a)",     "T(a)",     "Xi: T(a)"},
+            {/* +Fn  */ "T(b)",     "T(b)",      "T(b)",     "T(b)",     "T(M(a,b))", "T(a)",     "T(a)",     "Xi: T(a)"},
+            {/* +inf */ "T(b)",     "T(b)",      "T(b)",     "T(b)",     "T(b)",      "T(a)",     "T(a)",     "Xi: T(a)"},
+            {/* QNaN */ "T(b)",     "T(b)",      "T(b)",     "T(b)",     "T(b)",      "T(b)",     "T(a)",     "Xi: T(a)"},
+            {/* SNaN */ "Xi: T(b)", "Xi: T(b)",  "Xi: T(b)", "Xi: T(b)", "Xi: T(b)",  "Xi: T(b)", "Xi: T(a)", "Xi: T(a)"},
+        },
+    },
+
+    {
+        .op = VFMAX,
+        .m6 = 0,
+        .m6_desc = "IEEE MaxNum",
+        .table = {
+             /*         -inf         -Fn          -0           +0           +Fn          +inf         QNaN         SNaN     */
+            {/* -inf */ "T(a)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(a)",      "Xi: T(b*)"},
+            {/* -Fn  */ "T(a)",      "T(M(a,b))", "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(a)",      "Xi: T(b*)"},
+            {/* -0   */ "T(a)",      "T(a)",      "T(a)",      "T(b)",      "T(b)",      "T(b)",      "T(a)",      "Xi: T(b*)"},
+            {/* +0   */ "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(b)",      "T(b)",      "T(a)",      "Xi: T(b*)"},
+            {/* +Fn  */ "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(M(a,b))", "T(b)",      "T(a)",      "Xi: T(b*)"},
+            {/* +inf */ "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "Xi: T(b*)"},
+            {/* QNaN */ "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(a)",      "Xi: T(b*)"},
+            {/* SNaN */ "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)"},
+        },
+    },
+    {
+        .op = VFMAX,
+        .m6 = 1,
+        .m6_desc = "JAVA Math.Max()",
+        .table = {
+             /*         -inf         -Fn          -0           +0           +Fn          +inf         QNaN         SNaN     */
+            {/* -inf */ "T(a)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "Xi: T(b*)"},
+            {/* -Fn  */ "T(a)",      "T(M(a,b))", "T(b)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "Xi: T(b*)"},
+            {/* -0   */ "T(a)",      "T(a)",      "T(a)",      "T(b)",      "T(b)",      "T(b)",      "T(b)",      "Xi: T(b*)"},
+            {/* +0   */ "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(b)",      "T(b)",      "T(b)",      "Xi: T(b*)"},
+            {/* +Fn  */ "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(M(a,b))", "T(b)",      "T(b)",      "Xi: T(b*)"},
+            {/* +inf */ "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(b)",      "Xi: T(b*)"},
+            {/* QNaN */ "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "T(a)",      "Xi: T(b*)"},
+            {/* SNaN */ "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)", "Xi: T(a*)"},
+        },
+    },
+    {
+        .op = VFMAX,
+        .m6 = 2,
+        .m6_desc = "C-style Max Macro",
+        .table = {
+             /*         -inf        -Fn          -0          +0          +Fn          +inf        QNaN        SNaN    */
+            {/* -inf */ "T(b)",     "T(b)",      "T(b)",     "T(b)",     "T(b)",      "T(b)",     "Xi: T(b)", "Xi: T(b)"},
+            {/* -Fn  */ "T(a)",     "T(M(a,b))", "T(b)",     "T(b)",     "T(b)",      "T(b)",     "Xi: T(b)", "Xi: T(b)"},
+            {/* -0   */ "T(a)",     "T(a)",      "T(b)",     "T(b)",     "T(b)",      "T(b)",     "Xi: T(b)", "Xi: T(b)"},
+            {/* +0   */ "T(a)",     "T(a)",      "T(b)",     "T(b)",     "T(b)",      "T(b)",     "Xi: T(b)", "Xi: T(b)"},
+            {/* +Fn  */ "T(a)",     "T(a)",      "T(a)",     "T(a)",     "T(M(a,b))", "T(b)",     "Xi: T(b)", "Xi: T(b)"},
+            {/* +inf */ "T(a)",     "T(a)",      "T(a)",     "T(a)",     "T(a)",      "T(b)",     "Xi: T(b)", "Xi: T(b)"},
+            {/* QNaN */ "Xi: T(b)", "Xi: T(b)",  "Xi: T(b)", "Xi: T(b)", "Xi: T(b)",  "Xi: T(b)", "Xi: T(b)", "Xi: T(b)"},
+            {/* SNaN */ "Xi: T(b)", "Xi: T(b)",  "Xi: T(b)", "Xi: T(b)", "Xi: T(b)",  "Xi: T(b)", "Xi: T(b)", "Xi: T(b)"},
+        },
+    },
+    {
+        .op = VFMAX,
+        .m6 = 3,
+        .m6_desc = "C++ algorithm.max()",
+        .table = {
+             /*         -inf        -Fn          -0          +0          +Fn          +inf        QNaN        SNaN    */
+            {/* -inf */ "T(a)",     "T(b)",      "T(b)",     "T(b)",     "T(b)",      "T(b)",     "Xi: T(a)", "Xi: T(a)"},
+            {/* -Fn  */ "T(a)",     "T(M(a,b))", "T(b)",     "T(b)",     "T(b)",      "T(b)",     "Xi: T(a)", "Xi: T(a)"},
+            {/* -0   */ "T(a)",     "T(a)",      "T(a)",     "T(a)",     "T(b)",      "T(b)",     "Xi: T(a)", "Xi: T(a)"},
+            {/* +0   */ "T(a)",     "T(a)",      "T(a)",     "T(a)",     "T(b)",      "T(b)",     "Xi: T(a)", "Xi: T(a)"},
+            {/* +Fn  */ "T(a)",     "T(a)",      "T(a)",     "T(a)",     "T(M(a,b))", "T(b)",     "Xi: T(a)", "Xi: T(a)"},
+            {/* +inf */ "T(a)",     "T(a)",      "T(a)",     "T(a)",     "T(a)",      "T(a)",     "Xi: T(a)", "Xi: T(a)"},
+            {/* QNaN */ "Xi: T(a)", "Xi: T(a)",  "Xi: T(a)", "Xi: T(a)", "Xi: T(a)",  "Xi: T(a)", "Xi: T(a)", "Xi: T(a)"},
+            {/* SNaN */ "Xi: T(a)", "Xi: T(a)",  "Xi: T(a)", "Xi: T(a)", "Xi: T(a)",  "Xi: T(a)", "Xi: T(a)", "Xi: T(a)"},
+        },
+    },
+    {
+        .op = VFMAX,
+        .m6 = 4,
+        .m6_desc = "fmax()",
+        .table = {
+             /*         -inf        -Fn          -0          +0          +Fn          +inf        QNaN        SNaN    */
+            {/* -inf */ "T(a)",     "T(b)",      "T(b)",     "T(b)",     "T(b)",      "T(b)",     "T(a)",     "Xi: T(a)"},
+            {/* -Fn  */ "T(a)",     "T(M(a,b))", "T(b)",     "T(b)",     "T(b)",      "T(b)",     "T(a)",     "Xi: T(a)"},
+            {/* -0   */ "T(a)",     "T(a)",      "T(a)",     "T(b)",     "T(b)",      "T(b)",     "T(a)",     "Xi: T(a)"},
+            {/* +0   */ "T(a)",     "T(a)",      "T(a)",     "T(a)",     "T(b)",      "T(b)",     "T(a)",     "Xi: T(a)"},
+            {/* +Fn  */ "T(a)",     "T(a)",      "T(a)",     "T(a)",     "T(M(a,b))", "T(b)",     "T(a)",     "Xi: T(a)"},
+            {/* +inf */ "T(a)",     "T(a)",      "T(a)",     "T(a)",     "T(a)",      "T(a)",     "T(a)",     "Xi: T(a)"},
+            {/* QNaN */ "T(b)",     "T(b)",      "T(b)",     "T(b)",     "T(b)",      "T(b)",     "T(a)",     "Xi: T(a)"},
+            {/* SNaN */ "Xi: T(b)", "Xi: T(b)",  "Xi: T(b)", "Xi: T(b)", "Xi: T(b)",  "Xi: T(b)", "Xi: T(a)", "Xi: T(a)"},
+        },
+    },
+};
+
+static void dump_v(FILE *f, const void *v, size_t n)
+{
+    for (int i = 0; i < n; i++) {
+        fprintf(f, "%02x", ((const unsigned char *)v)[i]);
+    }
+}
+
+static int signed_test(unsigned char *buf, struct signed_test *test,
+                       int m4, int m5,
+                       const void *v1_exp, bool xi_exp,
+                       const void *v2, const void *v3)
+{
+    size_t n = (m5 & 8) ? float_sizes[m4 - 2] : 16;
+    char v1[16];
+    bool xi;
+
+    feclearexcept(FE_ALL_EXCEPT);
+    vfminmax(buf, test->op, m4, m5, test->m6, v1, v2, v3);
+    xi = fetestexcept(FE_ALL_EXCEPT) == FE_INVALID;
+
+    if (memcmp(v1, v1_exp, n) != 0 || xi != xi_exp) {
+        fprintf(stderr, "[  FAILED  ] %s ", test->m6_desc);
+        dump_v(stderr, v2, n);
+        fprintf(stderr, ", ");
+        dump_v(stderr, v3, n);
+        fprintf(stderr, ", %d, %d, %d: actual=", m4, m5, test->m6);
+        dump_v(stderr, v1, n);
+        fprintf(stderr, "/%d, expected=", (int)xi);
+        dump_v(stderr, v1_exp, n);
+        fprintf(stderr, "/%d\n", (int)xi_exp);
+        return 1;
+    }
+
+    return 0;
+}
+
+static void snan_to_qnan(char *v, int m4)
+{
+    size_t bit = 1 + e_bits[m4 - 2];
+    v[bit / 8] |= 1 << (7 - (bit % 8));
+}
+
+int main(void)
+{
+    unsigned char *buf;
+    int ret = 0;
+    size_t i;
+
+    buf = mmap(NULL, 0x1000, PROT_READ | PROT_WRITE | PROT_EXEC,
+               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (buf == MAP_FAILED) {
+        perror("mmap() failed");
+        return 1;
+    }
+
+    for (i = 0; i < sizeof(signed_tests) / sizeof(signed_tests[0]); i++) {
+        struct signed_test *test = &signed_tests[i];
+        int m4;
+
+        for (m4 = 2; m4 <= 4; m4++) {
+            const unsigned char (*floats)[2][16] = signed_floats[m4 - 2];
+            size_t float_size = float_sizes[m4 - 2];
+            int m5;
+
+            for (m5 = 0; m5 <= 8; m5 += 8) {
+                char v1_exp[16], v2[16], v3[16];
+                bool xi_exp = false;
+                int pos = 0;
+                int i2;
+
+                for (i2 = 0; i2 < N_SIGNED_CLASSES * 2; i2++) {
+                    int i3;
+
+                    for (i3 = 0; i3 < N_SIGNED_CLASSES * 2; i3++) {
+                        const char *spec = test->table[i2 / 2][i3 / 2];
+
+                        memcpy(&v2[pos], floats[i2 / 2][i2 % 2], float_size);
+                        memcpy(&v3[pos], floats[i3 / 2][i3 % 2], float_size);
+                        if (strcmp(spec, "T(a)") == 0 ||
+                            strcmp(spec, "Xi: T(a)") == 0) {
+                            memcpy(&v1_exp[pos], &v2[pos], float_size);
+                        } else if (strcmp(spec, "T(b)") == 0 ||
+                                   strcmp(spec, "Xi: T(b)") == 0) {
+                            memcpy(&v1_exp[pos], &v3[pos], float_size);
+                        } else if (strcmp(spec, "Xi: T(a*)") == 0) {
+                            memcpy(&v1_exp[pos], &v2[pos], float_size);
+                            snan_to_qnan(&v1_exp[pos], m4);
+                        } else if (strcmp(spec, "Xi: T(b*)") == 0) {
+                            memcpy(&v1_exp[pos], &v3[pos], float_size);
+                            snan_to_qnan(&v1_exp[pos], m4);
+                        } else if (strcmp(spec, "T(M(a,b))") == 0) {
+                            /*
+                             * Comparing floats is risky, since the compiler
+                             * might generate the same instruction that we are
+                             * testing. Compare ints instead. This works,
+                             * because we get here only for +-Fn, and the
+                             * corresponding test values have identical
+                             * exponents.
+                             */
+                            int v2_int = *(int *)&v2[pos];
+                            int v3_int = *(int *)&v3[pos];
+
+                            if ((v2_int < v3_int) ==
+                                ((test->op == VFMIN) != (v2_int < 0))) {
+                                memcpy(&v1_exp[pos], &v2[pos], float_size);
+                            } else {
+                                memcpy(&v1_exp[pos], &v3[pos], float_size);
+                            }
+                        } else {
+                            fprintf(stderr, "Unexpected spec: %s\n", spec);
+                            return 1;
+                        }
+                        xi_exp |= spec[0] == 'X';
+                        pos += float_size;
+
+                        if ((m5 & 8) || pos == 16) {
+                            ret |= signed_test(buf, test, m4, m5,
+                                               v1_exp, xi_exp, v2, v3);
+                            pos = 0;
+                            xi_exp = false;
+                        }
+                    }
+                }
+
+                if (pos != 0) {
+                    ret |= signed_test(buf, test, m4, m5,
+                                       v1_exp, xi_exp, v2, v3);
+                }
+            }
+        }
+    }
+
+    munmap(buf, 0x1000);
+
+    return ret;
+}