==============================================
ilvev.b || 74.38 ms || 38.85 ms ||
ilvev.h || 46.78 ms || 33.98 ms ||
ilvev.w || 45.50 ms || 28.93 ms ||
ilvev.d || 37.67 ms || 23.09 ms ||
Signed-off-by: Mateja Marjanovic <mateja.marjanovic@rt-rk.com>
---
target/mips/helper.h | 1 -
target/mips/msa_helper.c | 52 ---------------------
target/mips/translate.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++-
3 files changed, 116 insertions(+), 54 deletions(-)
@@ -864,7 +864,6 @@ DEF_HELPER_5(msa_pckev_df, void, env, i32, i32, i32, i32)
DEF_HELPER_5(msa_pckod_df, void, env, i32, i32, i32, i32)
DEF_HELPER_5(msa_ilvl_df, void, env, i32, i32, i32, i32)
DEF_HELPER_5(msa_ilvr_df, void, env, i32, i32, i32, i32)
-DEF_HELPER_5(msa_ilvev_df, void, env, i32, i32, i32, i32)
DEF_HELPER_5(msa_vshf_df, void, env, i32, i32, i32, i32)
DEF_HELPER_5(msa_srar_df, void, env, i32, i32, i32, i32)
DEF_HELPER_5(msa_srlr_df, void, env, i32, i32, i32, i32)
@@ -1311,58 +1311,6 @@ void helper_msa_pckev_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
}
}
-
-void helper_msa_ilvev_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
- uint32_t ws, uint32_t wt)
-{
- wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
- wr_t *pws = &(env->active_fpu.fpr[ws].wr);
- wr_t *pwt = &(env->active_fpu.fpr[wt].wr);
-
- switch (df) {
- case DF_BYTE:
- pwd->b[15] = pws->b[14];
- pwd->b[14] = pwt->b[14];
- pwd->b[13] = pws->b[12];
- pwd->b[12] = pwt->b[12];
- pwd->b[11] = pws->b[10];
- pwd->b[10] = pwt->b[10];
- pwd->b[9] = pws->b[8];
- pwd->b[8] = pwt->b[8];
- pwd->b[7] = pws->b[6];
- pwd->b[6] = pwt->b[6];
- pwd->b[5] = pws->b[4];
- pwd->b[4] = pwt->b[4];
- pwd->b[3] = pws->b[2];
- pwd->b[2] = pwt->b[2];
- pwd->b[1] = pws->b[0];
- pwd->b[0] = pwt->b[0];
- break;
- case DF_HALF:
- pwd->h[7] = pws->h[6];
- pwd->h[6] = pwt->h[6];
- pwd->h[5] = pws->h[4];
- pwd->h[4] = pwt->h[4];
- pwd->h[3] = pws->h[2];
- pwd->h[2] = pwt->h[2];
- pwd->h[1] = pws->h[0];
- pwd->h[0] = pwt->h[0];
- break;
- case DF_WORD:
- pwd->w[3] = pws->w[2];
- pwd->w[2] = pwt->w[2];
- pwd->w[1] = pws->w[0];
- pwd->w[0] = pwt->w[0];
- break;
- case DF_DOUBLE:
- pwd->d[1] = pws->d[0];
- pwd->d[0] = pwt->d[0];
- break;
- default:
- assert(0);
- }
-}
-
void helper_msa_ilvl_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
uint32_t ws, uint32_t wt)
{
@@ -28991,6 +28991,106 @@ static inline void gen_ilvod_d(CPUMIPSState *env, uint32_t wd,
tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2 + 1]);
}
+/*
+ * ILVEV.B: emit TCG ops that interleave the even-indexed bytes of wt and ws
+ * into wd.  Within each 64-bit half of the vector, the even bytes of wt keep
+ * their byte position and the even bytes of ws are shifted up by one byte
+ * into the odd positions.  Each half is assembled in a temporary and only
+ * then stored to wd.  The env argument is not used.
+ */
+static inline void gen_ilvev_b(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt)
+{
+    /* Selects the least-significant byte of every 16-bit group. */
+    const uint64_t even_bytes = 0x00FF00FF00FF00FFULL;
+    TCGv_i64 lane = tcg_temp_new_i64();
+    TCGv_i64 acc = tcg_temp_new_i64();
+    unsigned half;
+
+    /* half selects msa_wr_d[reg * 2] (0) or msa_wr_d[reg * 2 + 1] (1). */
+    for (half = 0; half < 2; half++) {
+        tcg_gen_movi_i64(acc, 0);
+
+        /* Even bytes of wt stay where they are. */
+        tcg_gen_andi_i64(lane, msa_wr_d[wt * 2 + half], even_bytes);
+        tcg_gen_or_i64(acc, acc, lane);
+
+        /* Even bytes of ws move one byte up, into the odd positions. */
+        tcg_gen_andi_i64(lane, msa_wr_d[ws * 2 + half], even_bytes);
+        tcg_gen_shli_i64(lane, lane, 8);
+        tcg_gen_or_i64(acc, acc, lane);
+
+        tcg_gen_mov_i64(msa_wr_d[wd * 2 + half], acc);
+    }
+
+    tcg_temp_free_i64(lane);
+    tcg_temp_free_i64(acc);
+}
+
+/*
+ * ILVEV.H: emit TCG ops that interleave the even-indexed halfwords of wt and
+ * ws into wd.  Within each 64-bit half of the vector, the even halfwords of
+ * wt keep their position and the even halfwords of ws are shifted up by 16
+ * bits into the odd positions.  Each half is assembled in a temporary and
+ * only then stored to wd.  The env argument is not used.
+ */
+static inline void gen_ilvev_h(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt)
+{
+    /* Selects the least-significant halfword of every 32-bit group. */
+    const uint64_t even_halves = 0x0000FFFF0000FFFFULL;
+    TCGv_i64 lane = tcg_temp_new_i64();
+    TCGv_i64 acc = tcg_temp_new_i64();
+    unsigned half;
+
+    /* half selects msa_wr_d[reg * 2] (0) or msa_wr_d[reg * 2 + 1] (1). */
+    for (half = 0; half < 2; half++) {
+        tcg_gen_movi_i64(acc, 0);
+
+        /* Even halfwords of wt stay where they are. */
+        tcg_gen_andi_i64(lane, msa_wr_d[wt * 2 + half], even_halves);
+        tcg_gen_or_i64(acc, acc, lane);
+
+        /* Even halfwords of ws move up into the odd positions. */
+        tcg_gen_andi_i64(lane, msa_wr_d[ws * 2 + half], even_halves);
+        tcg_gen_shli_i64(lane, lane, 16);
+        tcg_gen_or_i64(acc, acc, lane);
+
+        tcg_gen_mov_i64(msa_wr_d[wd * 2 + half], acc);
+    }
+
+    tcg_temp_free_i64(lane);
+    tcg_temp_free_i64(acc);
+}
+
+/*
+ * ILVEV.W: emit TCG ops that interleave the even-indexed words of wt and ws
+ * into wd.  Within each 64-bit half of the vector, the low word of wt keeps
+ * its position and the low word of ws is shifted up by 32 bits into the high
+ * word.  Each half is assembled in a temporary and only then stored to wd.
+ * The env argument is not used.
+ */
+static inline void gen_ilvev_w(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt)
+{
+    /* Selects the low 32 bits of a 64-bit value. */
+    const uint64_t low_word = 0x00000000FFFFFFFFULL;
+    TCGv_i64 lane = tcg_temp_new_i64();
+    TCGv_i64 acc = tcg_temp_new_i64();
+    unsigned half;
+
+    /* half selects msa_wr_d[reg * 2] (0) or msa_wr_d[reg * 2 + 1] (1). */
+    for (half = 0; half < 2; half++) {
+        tcg_gen_movi_i64(acc, 0);
+
+        /* Low word of wt stays where it is. */
+        tcg_gen_andi_i64(lane, msa_wr_d[wt * 2 + half], low_word);
+        tcg_gen_or_i64(acc, acc, lane);
+
+        /* Low word of ws moves up into the high word. */
+        tcg_gen_andi_i64(lane, msa_wr_d[ws * 2 + half], low_word);
+        tcg_gen_shli_i64(lane, lane, 32);
+        tcg_gen_or_i64(acc, acc, lane);
+
+        tcg_gen_mov_i64(msa_wr_d[wd * 2 + half], acc);
+    }
+
+    tcg_temp_free_i64(lane);
+    tcg_temp_free_i64(acc);
+}
+
+/*
+ * ILVEV.D: emit TCG ops that copy msa_wr_d[wt * 2] into msa_wr_d[wd * 2] and
+ * msa_wr_d[ws * 2] into msa_wr_d[wd * 2 + 1].  The env argument is not used.
+ */
+static inline void gen_ilvev_d(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt)
+{
+    /*
+     * Write the second doubleword of wd first.  Its source, msa_wr_d[ws * 2],
+     * is the very same register as msa_wr_d[wd * 2] when wd == ws, so it has
+     * to be read before the first doubleword of wd is overwritten; otherwise
+     * both halves of wd would end up holding the value taken from wt.
+     * When wd == wt the second move below is a self-copy, which is harmless.
+     */
+    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2]);
+    tcg_gen_mov_i64(msa_wr_d[wd * 2], msa_wr_d[wt * 2]);
+}
+
static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx)
{
#define MASK_MSA_3R(op) (MASK_MSA_MINOR(op) | (op & (0x7 << 23)))
@@ -29147,7 +29247,22 @@ static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx)
gen_helper_msa_mod_s_df(cpu_env, tdf, twd, tws, twt);
break;
case OPC_ILVEV_df:
- gen_helper_msa_ilvev_df(cpu_env, tdf, twd, tws, twt);
+ switch (df) {
+ case DF_BYTE:
+ gen_ilvev_b(env, wd, ws, wt);
+ break;
+ case DF_HALF:
+ gen_ilvev_h(env, wd, ws, wt);
+ break;
+ case DF_WORD:
+ gen_ilvev_w(env, wd, ws, wt);
+ break;
+ case DF_DOUBLE:
+ gen_ilvev_d(env, wd, ws, wt);
+ break;
+ default:
+ assert(0);
+ }
break;
case OPC_BINSR_df:
gen_helper_msa_binsr_df(cpu_env, tdf, twd, tws, twt);
From: Mateja Marjanovic <Mateja.Marjanovic@rt-rk.com> Optimize the ILVEV set of MSA instructions by operating directly on TCG registers and performing the logic on them instead of using helpers. Performance was measured by executing each instruction a large number of times on a computer with an Intel Core i7-3770 CPU @ 3.40GHz×8. instruction || before || after ||