diff mbox series

[2/2] target/mips: Optimize ILVEV.<B|H|W|D> MSA instructions

Message ID 1552651368-7422-3-git-send-email-mateja.marjanovic@rt-rk.com (mailing list archive)
State New, archived
Headers show
Series target/mips: Optimize ILVEV and ILVOD MSA instructions | expand

Commit Message

Mateja Marjanovic March 15, 2019, 12:02 p.m. UTC
From: Mateja Marjanovic <Mateja.Marjanovic@rt-rk.com>

Optimize the ILVEV set of MSA instructions by operating
directly on TCG registers and performing logic on them
instead of using helpers.
Performance measurement is done by executing the
instructions a large number of times on a computer
with an Intel Core i7-3770 CPU @ 3.40GHz×8.

 instruction ||    before    ||    after   ||

Comments

Richard Henderson March 15, 2019, 4:44 p.m. UTC | #1
On 3/15/19 5:02 AM, Mateja Marjanovic wrote:
> +static inline void gen_ilvev_b(CPUMIPSState *env, uint32_t wd,
> +                               uint32_t ws, uint32_t wt) {
> +    TCGv_i64 t0 = tcg_temp_new_i64();
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +
> +    uint64_t mask = (1ULL << 8) - 1;
> +    mask |= mask << 16;
> +    mask |= mask << 32;
> +    tcg_gen_movi_i64(t1, 0);

Identical comments as for the previous patch.


r~
diff mbox series

Patch

==============================================
 ilvev.b     ||    74.38 ms  ||  38.85 ms  ||
 ilvev.h     ||    46.78 ms  ||  33.98 ms  ||
 ilvev.w     ||    45.50 ms  ||  28.93 ms  ||
 ilvev.d     ||    37.67 ms  ||  23.09 ms  ||

Signed-off-by: Mateja Marjanovic <mateja.marjanovic@rt-rk.com>
---
 target/mips/helper.h     |   1 -
 target/mips/msa_helper.c |  52 ---------------------
 target/mips/translate.c  | 117 ++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 116 insertions(+), 54 deletions(-)

diff --git a/target/mips/helper.h b/target/mips/helper.h
index d162836..2f23b0d 100644
--- a/target/mips/helper.h
+++ b/target/mips/helper.h
@@ -864,7 +864,6 @@  DEF_HELPER_5(msa_pckev_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_pckod_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_ilvl_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_ilvr_df, void, env, i32, i32, i32, i32)
-DEF_HELPER_5(msa_ilvev_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_vshf_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_srar_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_srlr_df, void, env, i32, i32, i32, i32)
diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
index cbcfd57..421dced 100644
--- a/target/mips/msa_helper.c
+++ b/target/mips/msa_helper.c
@@ -1311,58 +1311,6 @@  void helper_msa_pckev_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
     }
 }
 
-
-void helper_msa_ilvev_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
-                         uint32_t ws, uint32_t wt)
-{
-    wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
-    wr_t *pws = &(env->active_fpu.fpr[ws].wr);
-    wr_t *pwt = &(env->active_fpu.fpr[wt].wr);
-
-    switch (df) {
-    case DF_BYTE:
-        pwd->b[15] = pws->b[14];
-        pwd->b[14] = pwt->b[14];
-        pwd->b[13] = pws->b[12];
-        pwd->b[12] = pwt->b[12];
-        pwd->b[11] = pws->b[10];
-        pwd->b[10] = pwt->b[10];
-        pwd->b[9]  = pws->b[8];
-        pwd->b[8]  = pwt->b[8];
-        pwd->b[7]  = pws->b[6];
-        pwd->b[6]  = pwt->b[6];
-        pwd->b[5]  = pws->b[4];
-        pwd->b[4]  = pwt->b[4];
-        pwd->b[3]  = pws->b[2];
-        pwd->b[2]  = pwt->b[2];
-        pwd->b[1]  = pws->b[0];
-        pwd->b[0]  = pwt->b[0];
-        break;
-    case DF_HALF:
-        pwd->h[7] = pws->h[6];
-        pwd->h[6] = pwt->h[6];
-        pwd->h[5] = pws->h[4];
-        pwd->h[4] = pwt->h[4];
-        pwd->h[3] = pws->h[2];
-        pwd->h[2] = pwt->h[2];
-        pwd->h[1] = pws->h[0];
-        pwd->h[0] = pwt->h[0];
-        break;
-    case DF_WORD:
-        pwd->w[3] = pws->w[2];
-        pwd->w[2] = pwt->w[2];
-        pwd->w[1] = pws->w[0];
-        pwd->w[0] = pwt->w[0];
-        break;
-    case DF_DOUBLE:
-        pwd->d[1] = pws->d[0];
-        pwd->d[0] = pwt->d[0];
-        break;
-    default:
-        assert(0);
-    }
-}
-
 void helper_msa_ilvl_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
                         uint32_t ws, uint32_t wt)
 {
diff --git a/target/mips/translate.c b/target/mips/translate.c
index 101d2de..1526d24 100644
--- a/target/mips/translate.c
+++ b/target/mips/translate.c
@@ -28991,6 +28991,106 @@  static inline void gen_ilvod_d(CPUMIPSState *env, uint32_t wd,
     tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2 + 1]);
 }
 
+/*
+ * ILVEV.B: interleave the even-indexed bytes of ws and wt into wd.
+ *
+ * In each 64-bit half the even bytes of wt stay in place and the even
+ * bytes of ws are shifted up into the odd byte positions.  Both sources
+ * are read into temporaries before wd is written, so wd may be the same
+ * register as ws or wt.  env is unused.
+ */
+static inline void gen_ilvev_b(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt)
+{
+    /* Bytes 0, 2, 4 and 6, counted from the least significant one. */
+    const uint64_t mask = 0x00ff00ff00ff00ffULL;
+    TCGv_i64 t0 = tcg_temp_new_i64();
+    TCGv_i64 t1 = tcg_temp_new_i64();
+
+    /* Doubleword 0. */
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2], mask);
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
+    tcg_gen_shli_i64(t1, t1, 8);
+    tcg_gen_or_i64(msa_wr_d[wd * 2], t0, t1);
+
+    /* Doubleword 1. */
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2 + 1], mask);
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2 + 1], mask);
+    tcg_gen_shli_i64(t1, t1, 8);
+    tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t0, t1);
+
+    tcg_temp_free_i64(t0);
+    tcg_temp_free_i64(t1);
+}
+
+/*
+ * ILVEV.H: interleave the even-indexed halfwords of ws and wt into wd.
+ *
+ * In each 64-bit half the even halfwords of wt stay in place and the
+ * even halfwords of ws are shifted up into the odd halfword positions.
+ * Both sources are read into temporaries before wd is written, so wd
+ * may be the same register as ws or wt.  env is unused.
+ */
+static inline void gen_ilvev_h(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt)
+{
+    /* Halfwords 0 and 2, counted from the least significant one. */
+    const uint64_t mask = 0x0000ffff0000ffffULL;
+    TCGv_i64 t0 = tcg_temp_new_i64();
+    TCGv_i64 t1 = tcg_temp_new_i64();
+
+    /* Doubleword 0. */
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2], mask);
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
+    tcg_gen_shli_i64(t1, t1, 16);
+    tcg_gen_or_i64(msa_wr_d[wd * 2], t0, t1);
+
+    /* Doubleword 1. */
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2 + 1], mask);
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2 + 1], mask);
+    tcg_gen_shli_i64(t1, t1, 16);
+    tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t0, t1);
+
+    tcg_temp_free_i64(t0);
+    tcg_temp_free_i64(t1);
+}
+
+/*
+ * ILVEV.W: interleave the even-indexed words of ws and wt into wd.
+ * In each 64-bit half the low word of wt stays in place and the low
+ * word of ws is shifted up into the high word.  Both sources are read
+ * into temporaries before wd is written, so wd may alias ws or wt.
+ */
+static inline void gen_ilvev_w(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt)
+{
+    /* Low 32 bits of a doubleword. */
+    const uint64_t mask = 0xffffffffULL;
+    TCGv_i64 t0 = tcg_temp_new_i64();
+    TCGv_i64 t1 = tcg_temp_new_i64();
+
+    /* Doubleword 0. */
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2], mask);
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
+    tcg_gen_shli_i64(t1, t1, 32);
+    tcg_gen_or_i64(msa_wr_d[wd * 2], t0, t1);
+
+    /* Doubleword 1. */
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2 + 1], mask);
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2 + 1], mask);
+    tcg_gen_shli_i64(t1, t1, 32);
+    tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t0, t1);
+
+    tcg_temp_free_i64(t0);
+    tcg_temp_free_i64(t1);
+}
+
+static inline void gen_ilvev_d(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt) {
+    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2]); /* wd may be ws */
+    tcg_gen_mov_i64(msa_wr_d[wd * 2], msa_wr_d[wt * 2]);     /* so d[0] last */
+}
+
 static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx)
 {
 #define MASK_MSA_3R(op)    (MASK_MSA_MINOR(op) | (op & (0x7 << 23)))
@@ -29147,7 +29247,22 @@  static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx)
         gen_helper_msa_mod_s_df(cpu_env, tdf, twd, tws, twt);
         break;
     case OPC_ILVEV_df:
-        gen_helper_msa_ilvev_df(cpu_env, tdf, twd, tws, twt);
+        switch (df) {
+        case DF_BYTE:
+            gen_ilvev_b(env, wd, ws, wt);
+            break;
+        case DF_HALF:
+            gen_ilvev_h(env, wd, ws, wt);
+            break;
+        case DF_WORD:
+            gen_ilvev_w(env, wd, ws, wt);
+            break;
+        case DF_DOUBLE:
+            gen_ilvev_d(env, wd, ws, wt);
+            break;
+        default:
+            assert(0);
+        }
         break;
     case OPC_BINSR_df:
         gen_helper_msa_binsr_df(cpu_env, tdf, twd, tws, twt);