diff mbox series

[v1,04/20] s390x/tcg: Implement 32/128 bit for VECTOR FP ADD

Message ID 20200930145523.71087-5-david@redhat.com (mailing list archive)
State New, archived
Headers show
Series s390x/tcg: Implement Vector enhancements facility and switch to z14 | expand

Commit Message

David Hildenbrand Sept. 30, 2020, 2:55 p.m. UTC
In case of 128bit, we always have a single element.

Add some helpers that allow to generically read/write 32/64/128 bit
floats. Convert the existing implementation of vop64_3 into a macro that
deals with float* instead of uint* instead - the other users keep
working as
    typedef uint32_t float32;
    typedef uint64_t float64;

Most of them will get converted next.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 target/s390x/helper.h           |   3 +
 target/s390x/translate_vx.c.inc |  31 +++++++--
 target/s390x/vec_fpu_helper.c   | 119 +++++++++++++++++++++-----------
 3 files changed, 107 insertions(+), 46 deletions(-)

Comments

Richard Henderson Oct. 1, 2020, 3:45 p.m. UTC | #1
On 9/30/20 9:55 AM, David Hildenbrand wrote:
> -typedef uint64_t (*vop64_3_fn)(uint64_t a, uint64_t b, float_status *s);
> -static void vop64_3(S390Vector *v1, const S390Vector *v2, const S390Vector *v3,
> -                    CPUS390XState *env, bool s, vop64_3_fn fn,
> -                    uintptr_t retaddr)
> -{
> -    uint8_t vxc, vec_exc = 0;
> -    S390Vector tmp = {};
> -    int i;
> -
> -    for (i = 0; i < 2; i++) {
> -        const uint64_t a = s390_vec_read_element64(v2, i);
> -        const uint64_t b = s390_vec_read_element64(v3, i);
> -
> -        s390_vec_write_element64(&tmp, i, fn(a, b, &env->fpu_status));
> -        vxc = check_ieee_exc(env, i, false, &vec_exc);
> -        if (s || vxc) {
> -            break;
> -        }
> -    }
> -    handle_ieee_exc(env, vxc, vec_exc, retaddr);
> -    *v1 = tmp;
> -}
...
> +#define DEF_VOP_3(BITS)                                                        \
> +typedef float##BITS (*vop##BITS##_3_fn)(float##BITS a, float##BITS b,          \
> +                                        float_status *s);                      \
> +static void vop##BITS##_3(S390Vector *v1, const S390Vector *v2,                \
> +                          const S390Vector *v3, CPUS390XState *env, bool s,    \
> +                          vop##BITS##_3_fn fn, uintptr_t retaddr)              \
> +{                                                                              \
> +    uint8_t vxc, vec_exc = 0;                                                  \
> +    S390Vector tmp = {};                                                       \
> +    int i;                                                                     \
> +                                                                               \
> +    for (i = 0; i < (128 / BITS); i++) {                                       \
> +        const float##BITS a = s390_vec_read_float##BITS(v2, i);                \
> +        const float##BITS b = s390_vec_read_float##BITS(v3, i);                \
> +                                                                               \
> +        s390_vec_write_float##BITS(&tmp, i, fn(a, b, &env->fpu_status));       \
> +        vxc = check_ieee_exc(env, i, false, &vec_exc);                         \
> +        if (s || vxc) {                                                        \
> +            break;                                                             \
> +        }                                                                      \
> +    }                                                                          \
> +    handle_ieee_exc(env, vxc, vec_exc, retaddr);                               \
> +    *v1 = tmp;                                                                 \
> +}
> +DEF_VOP_3(32)
> +DEF_VOP_3(64)
> +DEF_VOP_3(128)

While this works, you won't be able to step through this function in the
debugger anymore, because it now has one source line: at the point of expansion.

We do have plenty of these around the code base, I know.  This is small enough
that I think it's reasonable to simply have three copies, one for each type.

> +#define DEF_GVEC_FVA(BITS)                                                     \
> +void HELPER(gvec_vfa##BITS)(void *v1, const void *v2, const void *v3,          \
> +                            CPUS390XState *env, uint32_t desc)                 \
> +{                                                                              \
> +    vop##BITS##_3(v1, v2, v3, env, false, float##BITS##_add, GETPC());         \
> +}
> +DEF_GVEC_FVA(32)
> +DEF_GVEC_FVA(64)
> +DEF_GVEC_FVA(128)
> +
> +#define DEF_GVEC_FVA_S(BITS)                                                   \
> +void HELPER(gvec_vfa##BITS##s)(void *v1, const void *v2, const void *v3,       \
> +                               CPUS390XState *env, uint32_t desc)              \
> +{                                                                              \
> +    vop##BITS##_3(v1, v2, v3, env, true, float##BITS##_add, GETPC());          \
> +}
> +DEF_GVEC_FVA_S(32)
> +DEF_GVEC_FVA_S(64)
I think you're defining these macros with the wrong parameters.  Think of how
to use the same macros for all of add/sub/etc.

E.g.

#define DEF_FOP3_B(NAME, OP, BITS) \...
  void HELPER(gvec_##NAME##BITS)(void *v1, const void *v2,
      const void *v3, CPUS390XState *env, uint32_t desc)
  {
    vop##BITS##_3(v1, v2, v3, env, false,
                  float##BITS##_##OP, GETPC());
  }
  void HELPER(gvec_##NAME##BITS##s)(void *v1, const void *v2,
      const void *v3, CPUS390XState *env, uint32_t desc)
  {
    vop##BITS##_3(v1, v2, v3, env, true,
                  float##BITS##_##OP, GETPC());
  }

#define DEF_FOP3(NAME, OP) \
  DEF_FOP3_B(NAME, OP, 32) \
  DEF_FOP3_B(NAME, OP, 64) \
  DEF_FOP3_B(NAME, OP, 128)

DEF_FOP3(vfa, add)
DEF_FOP3(vfd, div)
DEF_FOP3(vfm, mul)

etc.


r~
Richard Henderson Oct. 1, 2020, 4:08 p.m. UTC | #2
On 9/30/20 9:55 AM, David Hildenbrand wrote:
> +        case FPF_LONG:
> +            fn = se ? gen_helper_gvec_vfa64s : gen_helper_gvec_vfa64;
> +            break;

BTW, any reason not to pass SE as data, like you do later for SQ?  Or
potentially the entire M field as is?

Just wondering if it would help tidy up here...


r~
David Hildenbrand Oct. 1, 2020, 5:08 p.m. UTC | #3
On 01.10.20 18:08, Richard Henderson wrote:
> On 9/30/20 9:55 AM, David Hildenbrand wrote:
>> +        case FPF_LONG:
>> +            fn = se ? gen_helper_gvec_vfa64s : gen_helper_gvec_vfa64;
>> +            break;
> 
> BTW, any reason not to pass SE as data, like you do later for SQ?  Or
> potentially the entire M field as is?

Having a separate implementation for "se" is desirable, because the
compiler can optimize-out the complete loop. If we simply pass the M
field to the helper, I'm not sure how likely it is that the compiler
will specialize (would have to double check).

(if we decide to remove all "s" helpers here, we'd better do it for all
 helpers)

> 
> Just wondering if it would help tidy up here...
diff mbox series

Patch

diff --git a/target/s390x/helper.h b/target/s390x/helper.h
index f579fd38a7..3d59f143e0 100644
--- a/target/s390x/helper.h
+++ b/target/s390x/helper.h
@@ -247,8 +247,11 @@  DEF_HELPER_6(gvec_vstrc_cc_rt16, void, ptr, cptr, cptr, cptr, env, i32)
 DEF_HELPER_6(gvec_vstrc_cc_rt32, void, ptr, cptr, cptr, cptr, env, i32)
 
 /* === Vector Floating-Point Instructions */
+DEF_HELPER_FLAGS_5(gvec_vfa32, TCG_CALL_NO_WG, void, ptr, cptr, cptr, env, i32)
+DEF_HELPER_FLAGS_5(gvec_vfa32s, TCG_CALL_NO_WG, void, ptr, cptr, cptr, env, i32)
 DEF_HELPER_FLAGS_5(gvec_vfa64, TCG_CALL_NO_WG, void, ptr, cptr, cptr, env, i32)
 DEF_HELPER_FLAGS_5(gvec_vfa64s, TCG_CALL_NO_WG, void, ptr, cptr, cptr, env, i32)
+DEF_HELPER_FLAGS_5(gvec_vfa128, TCG_CALL_NO_WG, void, ptr, cptr, cptr, env, i32)
 DEF_HELPER_4(gvec_wfc64, void, cptr, cptr, env, i32)
 DEF_HELPER_4(gvec_wfk64, void, cptr, cptr, env, i32)
 DEF_HELPER_FLAGS_5(gvec_vfce64, TCG_CALL_NO_WG, void, ptr, cptr, cptr, env, i32)
diff --git a/target/s390x/translate_vx.c.inc b/target/s390x/translate_vx.c.inc
index 4c1b430013..2ba2170b16 100644
--- a/target/s390x/translate_vx.c.inc
+++ b/target/s390x/translate_vx.c.inc
@@ -2504,16 +2504,27 @@  static DisasJumpType op_vfa(DisasContext *s, DisasOps *o)
     const uint8_t fpf = get_field(s, m4);
     const uint8_t m5 = get_field(s, m5);
     const bool se = extract32(m5, 3, 1);
-    gen_helper_gvec_3_ptr *fn;
-
-    if (fpf != FPF_LONG || extract32(m5, 0, 3)) {
-        gen_program_exception(s, PGM_SPECIFICATION);
-        return DISAS_NORETURN;
-    }
+    gen_helper_gvec_3_ptr *fn = NULL;
 
     switch (s->fields.op2) {
     case 0xe3:
-        fn = se ? gen_helper_gvec_vfa64s : gen_helper_gvec_vfa64;
+        switch (fpf) {
+        case FPF_SHORT:
+            if (s390_has_feat(S390_FEAT_VECTOR_ENH)) {
+                fn = se ? gen_helper_gvec_vfa32s : gen_helper_gvec_vfa32;
+            }
+            break;
+        case FPF_LONG:
+            fn = se ? gen_helper_gvec_vfa64s : gen_helper_gvec_vfa64;
+            break;
+        case FPF_EXT:
+            if (s390_has_feat(S390_FEAT_VECTOR_ENH)) {
+                fn = gen_helper_gvec_vfa128;
+            }
+            break;
+        default:
+            break;
+        }
         break;
     case 0xe5:
         fn = se ? gen_helper_gvec_vfd64s : gen_helper_gvec_vfd64;
@@ -2527,6 +2538,12 @@  static DisasJumpType op_vfa(DisasContext *s, DisasOps *o)
     default:
         g_assert_not_reached();
     }
+
+    if (!fn || extract32(m5, 0, 3)) {
+        gen_program_exception(s, PGM_SPECIFICATION);
+        return DISAS_NORETURN;
+    }
+
     gen_gvec_3_ptr(get_field(s, v1), get_field(s, v2),
                    get_field(s, v3), cpu_env, 0, fn);
     return DISAS_NEXT;
diff --git a/target/s390x/vec_fpu_helper.c b/target/s390x/vec_fpu_helper.c
index c1564e819b..ae803ba602 100644
--- a/target/s390x/vec_fpu_helper.c
+++ b/target/s390x/vec_fpu_helper.c
@@ -78,6 +78,40 @@  static void handle_ieee_exc(CPUS390XState *env, uint8_t vxc, uint8_t vec_exc,
     }
 }
 
+static float32 s390_vec_read_float32(const S390Vector *v, uint8_t enr)
+{
+    return make_float32(s390_vec_read_element32(v, enr));
+}
+
+static float64 s390_vec_read_float64(const S390Vector *v, uint8_t enr)
+{
+    return make_float64(s390_vec_read_element64(v, enr));
+}
+
+static float128 s390_vec_read_float128(const S390Vector *v, uint8_t enr)
+{
+    g_assert(enr == 0);
+    return make_float128(s390_vec_read_element64(v, 0),
+                         s390_vec_read_element64(v, 1));
+}
+
+static void s390_vec_write_float32(S390Vector *v, uint8_t enr, float32 data)
+{
+    return s390_vec_write_element32(v, enr, data);
+}
+
+static void s390_vec_write_float64(S390Vector *v, uint8_t enr, float64 data)
+{
+    return s390_vec_write_element64(v, enr, data);
+}
+
+static void s390_vec_write_float128(S390Vector *v, uint8_t enr, float128 data)
+{
+    g_assert(enr == 0);
+    s390_vec_write_element64(v, 0, data.high);
+    s390_vec_write_element64(v, 1, data.low);
+}
+
 typedef uint64_t (*vop64_2_fn)(uint64_t a, float_status *s);
 static void vop64_2(S390Vector *v1, const S390Vector *v2, CPUS390XState *env,
                     bool s, bool XxC, uint8_t erm, vop64_2_fn fn,
@@ -102,45 +136,52 @@  static void vop64_2(S390Vector *v1, const S390Vector *v2, CPUS390XState *env,
     *v1 = tmp;
 }
 
-typedef uint64_t (*vop64_3_fn)(uint64_t a, uint64_t b, float_status *s);
-static void vop64_3(S390Vector *v1, const S390Vector *v2, const S390Vector *v3,
-                    CPUS390XState *env, bool s, vop64_3_fn fn,
-                    uintptr_t retaddr)
-{
-    uint8_t vxc, vec_exc = 0;
-    S390Vector tmp = {};
-    int i;
-
-    for (i = 0; i < 2; i++) {
-        const uint64_t a = s390_vec_read_element64(v2, i);
-        const uint64_t b = s390_vec_read_element64(v3, i);
-
-        s390_vec_write_element64(&tmp, i, fn(a, b, &env->fpu_status));
-        vxc = check_ieee_exc(env, i, false, &vec_exc);
-        if (s || vxc) {
-            break;
-        }
-    }
-    handle_ieee_exc(env, vxc, vec_exc, retaddr);
-    *v1 = tmp;
-}
-
-static uint64_t vfa64(uint64_t a, uint64_t b, float_status *s)
-{
-    return float64_add(a, b, s);
-}
-
-void HELPER(gvec_vfa64)(void *v1, const void *v2, const void *v3,
-                        CPUS390XState *env, uint32_t desc)
-{
-    vop64_3(v1, v2, v3, env, false, vfa64, GETPC());
-}
-
-void HELPER(gvec_vfa64s)(void *v1, const void *v2, const void *v3,
-                         CPUS390XState *env, uint32_t desc)
-{
-    vop64_3(v1, v2, v3, env, true, vfa64, GETPC());
-}
+#define DEF_VOP_3(BITS)                                                        \
+typedef float##BITS (*vop##BITS##_3_fn)(float##BITS a, float##BITS b,          \
+                                        float_status *s);                      \
+static void vop##BITS##_3(S390Vector *v1, const S390Vector *v2,                \
+                          const S390Vector *v3, CPUS390XState *env, bool s,    \
+                          vop##BITS##_3_fn fn, uintptr_t retaddr)              \
+{                                                                              \
+    uint8_t vxc, vec_exc = 0;                                                  \
+    S390Vector tmp = {};                                                       \
+    int i;                                                                     \
+                                                                               \
+    for (i = 0; i < (128 / BITS); i++) {                                       \
+        const float##BITS a = s390_vec_read_float##BITS(v2, i);                \
+        const float##BITS b = s390_vec_read_float##BITS(v3, i);                \
+                                                                               \
+        s390_vec_write_float##BITS(&tmp, i, fn(a, b, &env->fpu_status));       \
+        vxc = check_ieee_exc(env, i, false, &vec_exc);                         \
+        if (s || vxc) {                                                        \
+            break;                                                             \
+        }                                                                      \
+    }                                                                          \
+    handle_ieee_exc(env, vxc, vec_exc, retaddr);                               \
+    *v1 = tmp;                                                                 \
+}
+DEF_VOP_3(32)
+DEF_VOP_3(64)
+DEF_VOP_3(128)
+
+#define DEF_GVEC_FVA(BITS)                                                     \
+void HELPER(gvec_vfa##BITS)(void *v1, const void *v2, const void *v3,          \
+                            CPUS390XState *env, uint32_t desc)                 \
+{                                                                              \
+    vop##BITS##_3(v1, v2, v3, env, false, float##BITS##_add, GETPC());         \
+}
+DEF_GVEC_FVA(32)
+DEF_GVEC_FVA(64)
+DEF_GVEC_FVA(128)
+
+#define DEF_GVEC_FVA_S(BITS)                                                   \
+void HELPER(gvec_vfa##BITS##s)(void *v1, const void *v2, const void *v3,       \
+                               CPUS390XState *env, uint32_t desc)              \
+{                                                                              \
+    vop##BITS##_3(v1, v2, v3, env, true, float##BITS##_add, GETPC());          \
+}
+DEF_GVEC_FVA_S(32)
+DEF_GVEC_FVA_S(64)
 
 static int wfc64(const S390Vector *v1, const S390Vector *v2,
                  CPUS390XState *env, bool signal, uintptr_t retaddr)