diff mbox series

[v4,04/47] target/ppc: vmulh* instructions without helpers

Message ID 20220222143646.1268606-5-matheus.ferst@eldorado.org.br (mailing list archive)
State New, archived
Headers show
Series target/ppc: PowerISA Vector/VSX instruction batch | expand

Commit Message

Matheus K. Ferst Feb. 22, 2022, 2:36 p.m. UTC
From: "Lucas Mateus Castro (alqotel)" <lucas.castro@eldorado.org.br>

Changed vmulhuw, vmulhud, vmulhsw, vmulhsd to not
use helpers.

Signed-off-by: Lucas Mateus Castro (alqotel) <lucas.araujo@eldorado.org.br>
Signed-off-by: Matheus Ferst <matheus.ferst@eldorado.org.br>
---
Changes in v4:
Changed from gvec to i64; this resulted in better performance on
a Power host for all 4 instructions, and better performance for
vmulhsw and vmulhuw on x86, but worse performance for vmulhsd and
vmulhud on an x86 host.
---
 target/ppc/helper.h                 |   4 -
 target/ppc/int_helper.c             |  35 --------
 target/ppc/translate/vmx-impl.c.inc | 123 +++++++++++++++++++++++++++-
 3 files changed, 119 insertions(+), 43 deletions(-)

Comments

Richard Henderson Feb. 22, 2022, 6:23 p.m. UTC | #1
On 2/22/22 04:36, matheus.ferst@eldorado.org.br wrote:
> From: "Lucas Mateus Castro (alqotel)" <lucas.castro@eldorado.org.br>
> 
> Changed vmulhuw, vmulhud, vmulhsw, vmulhsd to not
> use helpers.
> 
> Signed-off-by: Lucas Mateus Castro (alqotel) <lucas.araujo@eldorado.org.br>
> Signed-off-by: Matheus Ferst <matheus.ferst@eldorado.org.br>
> ---
> Changes in v4:
> Changed from gvec to i64, this resulted in a better performance on
> a Power host for all 4 instructions and a better performance for
> vmulhsw and vmulhuw in x86, but a worse performance for vmulhsd and
> vmulhud in a x86 host.

Unsurprising.

> +static void do_vx_vmulhd_i64(TCGv_i64 t, TCGv_i64 a, TCGv_i64 b, bool sign)
> +{
> +    TCGv_i64 a1, b1, mask, w, k;
> +    void (*tcg_gen_shift_imm)(TCGv_i64, TCGv_i64, int64_t);
> +
> +    a1 = tcg_temp_new_i64();
> +    b1 = tcg_temp_new_i64();
> +    w  = tcg_temp_new_i64();
> +    k  = tcg_temp_new_i64();
> +    mask = tcg_temp_new_i64();
> +    if (sign) {
> +        tcg_gen_shift_imm = tcg_gen_sari_i64;
> +    } else {
> +        tcg_gen_shift_imm = tcg_gen_shri_i64;
> +    }
> +
> +    tcg_gen_movi_i64(mask, 0xFFFFFFFF);
> +    tcg_gen_and_i64(a1, a, mask);
> +    tcg_gen_and_i64(b1, b, mask);
> +    tcg_gen_mul_i64(t, a1, b1);
> +    tcg_gen_shri_i64(k, t, 32);
> +
> +    tcg_gen_shift_imm(a1, a, 32);
> +    tcg_gen_mul_i64(t, a1, b1);
> +    tcg_gen_add_i64(t, t, k);
> +    tcg_gen_and_i64(k, t, mask);
> +    tcg_gen_shift_imm(w, t, 32);
> +
> +    tcg_gen_and_i64(a1, a, mask);
> +    tcg_gen_shift_imm(b1, b, 32);
> +    tcg_gen_mul_i64(t, a1, b1);
> +    tcg_gen_add_i64(t, t, k);
> +    tcg_gen_shift_imm(k, t, 32);
> +
> +    tcg_gen_shift_imm(a1, a, 32);
> +    tcg_gen_mul_i64(t, a1, b1);
> +    tcg_gen_add_i64(t, t, w);
> +    tcg_gen_add_i64(t, t, k);

You should be using tcg_gen_mul{s,u}2_i64 instead of open-coding the high-part multiplication.

r~
diff mbox series

Patch

diff --git a/target/ppc/helper.h b/target/ppc/helper.h
index 5d11158f1f..d0c5a3fef1 100644
--- a/target/ppc/helper.h
+++ b/target/ppc/helper.h
@@ -206,10 +206,6 @@  DEF_HELPER_3(VMULOUB, void, avr, avr, avr)
 DEF_HELPER_3(VMULOUH, void, avr, avr, avr)
 DEF_HELPER_3(VMULOUW, void, avr, avr, avr)
 DEF_HELPER_3(VMULOUD, void, avr, avr, avr)
-DEF_HELPER_3(VMULHSW, void, avr, avr, avr)
-DEF_HELPER_3(VMULHUW, void, avr, avr, avr)
-DEF_HELPER_3(VMULHSD, void, avr, avr, avr)
-DEF_HELPER_3(VMULHUD, void, avr, avr, avr)
 DEF_HELPER_3(vslo, void, avr, avr, avr)
 DEF_HELPER_3(vsro, void, avr, avr, avr)
 DEF_HELPER_3(vsrv, void, avr, avr, avr)
diff --git a/target/ppc/int_helper.c b/target/ppc/int_helper.c
index 8ddeccef12..64c87d9418 100644
--- a/target/ppc/int_helper.c
+++ b/target/ppc/int_helper.c
@@ -1113,41 +1113,6 @@  void helper_VMULOUD(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
     mulu64(&r->VsrD(1), &r->VsrD(0), a->VsrD(1), b->VsrD(1));
 }
 
-void helper_VMULHSW(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
-{
-    int i;
-
-    for (i = 0; i < 4; i++) {
-        r->s32[i] = (int32_t)(((int64_t)a->s32[i] * (int64_t)b->s32[i]) >> 32);
-    }
-}
-
-void helper_VMULHUW(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
-{
-    int i;
-
-    for (i = 0; i < 4; i++) {
-        r->u32[i] = (uint32_t)(((uint64_t)a->u32[i] *
-                               (uint64_t)b->u32[i]) >> 32);
-    }
-}
-
-void helper_VMULHSD(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
-{
-    uint64_t discard;
-
-    muls64(&discard, &r->u64[0], a->s64[0], b->s64[0]);
-    muls64(&discard, &r->u64[1], a->s64[1], b->s64[1]);
-}
-
-void helper_VMULHUD(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
-{
-    uint64_t discard;
-
-    mulu64(&discard, &r->u64[0], a->u64[0], b->u64[0]);
-    mulu64(&discard, &r->u64[1], a->u64[1], b->u64[1]);
-}
-
 void helper_vperm(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b,
                   ppc_avr_t *c)
 {
diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc
index 62d0642226..3951ae124a 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -2126,10 +2126,125 @@  TRANS_FLAGS2(ISA310, VMULOSD, do_vx_helper, gen_helper_VMULOSD)
 TRANS_FLAGS2(ISA310, VMULEUD, do_vx_helper, gen_helper_VMULEUD)
 TRANS_FLAGS2(ISA310, VMULOUD, do_vx_helper, gen_helper_VMULOUD)
 
-TRANS_FLAGS2(ISA310, VMULHSW, do_vx_helper, gen_helper_VMULHSW)
-TRANS_FLAGS2(ISA310, VMULHSD, do_vx_helper, gen_helper_VMULHSD)
-TRANS_FLAGS2(ISA310, VMULHUW, do_vx_helper, gen_helper_VMULHUW)
-TRANS_FLAGS2(ISA310, VMULHUD, do_vx_helper, gen_helper_VMULHUD)
+/*
+ * Compute the high 32 bits of each 32x32->64 product for the two word
+ * lanes packed in one 64-bit vector element (word-sized vmulh).
+ * t = high halves of a[63:32]*b[63:32] and a[31:0]*b[31:0], packed.
+ * sign selects signed (vmulhsw) vs unsigned (vmulhuw) multiplication.
+ */
+static void do_vx_vmulhw_i64(TCGv_i64 t, TCGv_i64 a, TCGv_i64 b, bool sign)
+{
+    TCGv_i64 hh, lh, temp;
+
+    hh = tcg_temp_new_i64();
+    lh = tcg_temp_new_i64();
+    temp = tcg_temp_new_i64();
+
+    /* low words: sign- or zero-extend to 64 bits, then full multiply */
+    if (sign) {
+        tcg_gen_ext32s_i64(lh, a);
+        tcg_gen_ext32s_i64(temp, b);
+    } else {
+        tcg_gen_ext32u_i64(lh, a);
+        tcg_gen_ext32u_i64(temp, b);
+    }
+    tcg_gen_mul_i64(lh, lh, temp);
+
+    /* high words: the shift itself performs the extension */
+    if (sign) {
+        tcg_gen_sari_i64(hh, a, 32);
+        tcg_gen_sari_i64(temp, b, 32);
+    } else {
+        tcg_gen_shri_i64(hh, a, 32);
+        tcg_gen_shri_i64(temp, b, 32);
+    }
+    tcg_gen_mul_i64(hh, hh, temp);
+
+    /*
+     * Pack the results: the high product's top 32 bits stay in place,
+     * the low product's top 32 bits go into the low half of t.
+     */
+    tcg_gen_shri_i64(lh, lh, 32);
+    tcg_gen_deposit_i64(t, hh, lh, 0, 32);
+
+    tcg_temp_free_i64(hh);
+    tcg_temp_free_i64(lh);
+    tcg_temp_free_i64(temp);
+}
+
+/*
+ * Compute the high 64 bits of the 64x64->128 product of two doubleword
+ * elements (vmulhsd/vmulhud). Uses the TCG double-width multiply
+ * directly instead of open-coding the high-part multiplication, as
+ * requested in review.
+ */
+static void do_vx_vmulhd_i64(TCGv_i64 t, TCGv_i64 a, TCGv_i64 b, bool sign)
+{
+    TCGv_i64 tlow = tcg_temp_new_i64();
+
+    if (sign) {
+        tcg_gen_muls2_i64(tlow, t, a, b);
+    } else {
+        tcg_gen_mulu2_i64(tlow, t, a, b);
+    }
+
+    /* Only the high half of the product is needed; discard the low half. */
+    tcg_temp_free_i64(tlow);
+}
+
+/*
+ * Common translation for the vector multiply-high instructions.
+ * Applies func (word- or doubleword-sized high multiply) to each
+ * 64-bit half of VRA/VRB, writing the result to VRT.
+ * sign selects signed vs unsigned multiplication.
+ */
+static bool do_vx_mulh(DisasContext *ctx, arg_VX *a, bool sign,
+                       void (*func)(TCGv_i64, TCGv_i64, TCGv_i64, bool))
+{
+    REQUIRE_INSNS_FLAGS2(ctx, ISA310);
+    REQUIRE_VECTOR(ctx);
+
+    TCGv_i64 vra, vrb, vrt;
+    int i;
+
+    vra = tcg_temp_new_i64();
+    vrb = tcg_temp_new_i64();
+    vrt = tcg_temp_new_i64();
+
+    /* Operate independently on the two 64-bit halves of the vector. */
+    for (i = 0; i < 2; i++) {
+        get_avr64(vra, a->vra, i);
+        get_avr64(vrb, a->vrb, i);
+
+        /* vrt is written by func; no need to load its old value. */
+        func(vrt, vra, vrb, sign);
+
+        set_avr64(a->vrt, vrt, i);
+    }
+
+    tcg_temp_free_i64(vra);
+    tcg_temp_free_i64(vrb);
+    tcg_temp_free_i64(vrt);
+
+    return true;
+}
+
+/* Plain TRANS: the ISA 3.1 gate is inside do_vx_mulh (REQUIRE_INSNS_FLAGS2). */
+TRANS(VMULHSW, do_vx_mulh, true , do_vx_vmulhw_i64)
+TRANS(VMULHSD, do_vx_mulh, true , do_vx_vmulhd_i64)
+TRANS(VMULHUW, do_vx_mulh, false, do_vx_vmulhw_i64)
+TRANS(VMULHUD, do_vx_mulh, false, do_vx_vmulhd_i64)
 
 #undef GEN_VR_LDX
 #undef GEN_VR_STX