diff mbox series

[1/2] target/mips: Optimize ILVOD.<B|H|W|D> MSA instructions

Message ID 1552651368-7422-2-git-send-email-mateja.marjanovic@rt-rk.com (mailing list archive)
State New, archived
Headers show
Series target/mips: Optimize ILVEV and ILVOD MSA instructions | expand

Commit Message

Mateja Marjanovic March 15, 2019, 12:02 p.m. UTC
From: Mateja Marjanovic <Mateja.Marjanovic@rt-rk.com>

Optimize set of MSA instructions ILVOD, using directly
tcg registers and performing logic on them insted of
using helpers.
Performance measurement is done by executing the
instructions large number of times on a computer
with Intel Core i7-3770 CPU @ 3.40GHz×8.

 instruction ||    before    ||    after   ||

Comments

Richard Henderson March 15, 2019, 4:43 p.m. UTC | #1
On 3/15/19 5:02 AM, Mateja Marjanovic wrote:
> From: Mateja Marjanovic <Mateja.Marjanovic@rt-rk.com>
> 
> Optimize set of MSA instructions ILVOD, using directly
> tcg registers and performing logic on them insted of
> using helpers.
> Performance measurement is done by executing the
> instructions large number of times on a computer
> with Intel Core i7-3770 CPU @ 3.40GHz×8.
> 
>  instruction ||    before    ||    after   ||
> ==============================================
>  ilvod.b:	 ||   66.97 ms   ||  26.34 ms  ||
>  ilvod.h:	 ||   44.75 ms   ||  25.17 ms  ||
>  ilvod.w:	 ||   41.27 ms   ||  24.37 ms  ||
>  ilvod.d:	 ||   41.75 ms   ||  20.50 ms  ||
> 
> Signed-off-by: Mateja Marjanovic <mateja.marjanovic@rt-rk.com>
> ---
>  target/mips/helper.h     |   1 -
>  target/mips/msa_helper.c |  51 --------------------
>  target/mips/translate.c  | 119 ++++++++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 118 insertions(+), 53 deletions(-)
> 
> diff --git a/target/mips/helper.h b/target/mips/helper.h
> index a6d687e..d162836 100644
> --- a/target/mips/helper.h
> +++ b/target/mips/helper.h
> @@ -865,7 +865,6 @@ DEF_HELPER_5(msa_pckod_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_ilvl_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_ilvr_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_ilvev_df, void, env, i32, i32, i32, i32)
> -DEF_HELPER_5(msa_ilvod_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_vshf_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_srar_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_srlr_df, void, env, i32, i32, i32, i32)
> diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
> index 9d9dafe..cbcfd57 100644
> --- a/target/mips/msa_helper.c
> +++ b/target/mips/msa_helper.c
> @@ -1363,57 +1363,6 @@ void helper_msa_ilvev_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
>      }
>  }
>  
> -void helper_msa_ilvod_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
> -                         uint32_t ws, uint32_t wt)
> -{
> -    wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
> -    wr_t *pws = &(env->active_fpu.fpr[ws].wr);
> -    wr_t *pwt = &(env->active_fpu.fpr[wt].wr);
> -
> -    switch (df) {
> -    case DF_BYTE:
> -        pwd->b[0]  = pwt->b[1];
> -        pwd->b[1]  = pws->b[1];
> -        pwd->b[2]  = pwt->b[3];
> -        pwd->b[3]  = pws->b[3];
> -        pwd->b[4]  = pwt->b[5];
> -        pwd->b[5]  = pws->b[5];
> -        pwd->b[6]  = pwt->b[7];
> -        pwd->b[7]  = pws->b[7];
> -        pwd->b[8]  = pwt->b[9];
> -        pwd->b[9]  = pws->b[9];
> -        pwd->b[10] = pwt->b[11];
> -        pwd->b[11] = pws->b[11];
> -        pwd->b[12] = pwt->b[13];
> -        pwd->b[13] = pws->b[13];
> -        pwd->b[14] = pwt->b[15];
> -        pwd->b[15] = pws->b[15];
> -        break;
> -    case DF_HALF:
> -        pwd->h[0] = pwt->h[1];
> -        pwd->h[1] = pws->h[1];
> -        pwd->h[2] = pwt->h[3];
> -        pwd->h[3] = pws->h[3];
> -        pwd->h[4] = pwt->h[5];
> -        pwd->h[5] = pws->h[5];
> -        pwd->h[6] = pwt->h[7];
> -        pwd->h[7] = pws->h[7];
> -        break;
> -    case DF_WORD:
> -        pwd->w[0] = pwt->w[1];
> -        pwd->w[1] = pws->w[1];
> -        pwd->w[2] = pwt->w[3];
> -        pwd->w[3] = pws->w[3];
> -        break;
> -    case DF_DOUBLE:
> -        pwd->d[0] = pwt->d[1];
> -        pwd->d[1] = pws->d[1];
> -        break;
> -    default:
> -        assert(0);
> -    }
> -}
> -
>  void helper_msa_ilvl_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
>                          uint32_t ws, uint32_t wt)
>  {
> diff --git a/target/mips/translate.c b/target/mips/translate.c
> index b4a1103..101d2de 100644
> --- a/target/mips/translate.c
> +++ b/target/mips/translate.c
> @@ -28889,6 +28889,108 @@ static void gen_msa_bit(CPUMIPSState *env, DisasContext *ctx)
>      tcg_temp_free_i32(tws);
>  }
>  
> +static inline void gen_ilvod_b(CPUMIPSState *env, uint32_t wd,
> +                               uint32_t ws, uint32_t wt) {

{ on next line.

> +    TCGv_i64 t0 = tcg_temp_new_i64();
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +
> +    uint64_t mask = (1ULL << 8) - 1;
> +    mask |= mask << 16;
> +    mask |= mask << 32;
> +    mask <<= 8;

This is a constant.  Clearer to just write 0xff00ff00ff00ff00ull;

> +static inline void gen_ilvod_h(CPUMIPSState *env, uint32_t wd,
> +                               uint32_t ws, uint32_t wt) {

Likewise.

> +    uint64_t mask = (1ULL << 16) - 1;
> +    mask |= mask << 32;
> +    mask <<= 16;

0xffff0000ffff0000ull

> +static inline void gen_ilvod_w(CPUMIPSState *env, uint32_t wd,
> +                               uint32_t ws, uint32_t wt) {

Likewise.

> +    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2], mask);
> +    tcg_gen_shri_i64(t0, t0, 32);
> +    tcg_gen_or_i64(t1, t1, t0);
> +    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
> +    tcg_gen_or_i64(t1, t1, t0);

This can fold down to deposit.

    tcg_gen_shri_i64(t0, msa_wr_d[wt * 2], 32);
    tcg_gen_deposit_i64(msa_wr_d[wd * 2], msa_wr_d[ws * 2], t0, 0, 32);

> +static inline void gen_ilvod_d(CPUMIPSState *env, uint32_t wd,
> +                               uint32_t ws, uint32_t wt) {

Likewise.


r~
Aleksandar Markovic March 15, 2019, 4:46 p.m. UTC | #2
> From: Mateja Marjanovic <mateja.marjanovic@rt-rk.com>
> Subject: [PATCH 1/2] target/mips: Optimize ILVOD.<B|H|W|D> MSA instructions
> 
> From: Mateja Marjanovic <Mateja.Marjanovic@rt-rk.com>
> 
> Optimize set of MSA instructions ILVOD, using directly
> tcg registers and performing logic on them insted of
> using helpers.

insted -> instead

> Performance measurement is done by executing the
> instructions large number of times on a computer
> with Intel Core i7-3770 CPU @ 3.40GHz×8.
> 
>  instruction ||    before    ||    after   ||
> ==============================================
>  ilvod.b:        ||   66.97 ms   ||  26.34 ms  ||
>  ilvod.h:        ||   44.75 ms   ||  25.17 ms  ||
>  ilvod.w:        ||   41.27 ms   ||  24.37 ms  ||
>  ilvod.d:        ||   41.75 ms   ||  20.50 ms  ||
> 

Alignment looks wrong.

> Signed-off-by: Mateja Marjanovic <mateja.marjanovic@rt-rk.com>
> ---
>  target/mips/helper.h     |   1 -
>  target/mips/msa_helper.c |  51 --------------------
>  target/mips/translate.c  | 119 ++++++++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 118 insertions(+), 53 deletions(-)
> 
> diff --git a/target/mips/helper.h b/target/mips/helper.h
> index a6d687e..d162836 100644
> --- a/target/mips/helper.h
> +++ b/target/mips/helper.h
> @@ -865,7 +865,6 @@ DEF_HELPER_5(msa_pckod_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_ilvl_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_ilvr_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_ilvev_df, void, env, i32, i32, i32, i32)
> -DEF_HELPER_5(msa_ilvod_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_vshf_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_srar_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_srlr_df, void, env, i32, i32, i32, i32)
> diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
> index 9d9dafe..cbcfd57 100644
> --- a/target/mips/msa_helper.c
> +++ b/target/mips/msa_helper.c
> @@ -1363,57 +1363,6 @@ void helper_msa_ilvev_df(CPUMIPSState *env, uint32_t df, uint32_t > wd,
>      }
>  }
> 
> -void helper_msa_ilvod_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
> -                         uint32_t ws, uint32_t wt)
> -{
> -    wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
> -    wr_t *pws = &(env->active_fpu.fpr[ws].wr);
> -    wr_t *pwt = &(env->active_fpu.fpr[wt].wr);
> -
> -    switch (df) {
> -    case DF_BYTE:
> -        pwd->b[0]  = pwt->b[1];
> -        pwd->b[1]  = pws->b[1];
> -        pwd->b[2]  = pwt->b[3];
> -        pwd->b[3]  = pws->b[3];
> -        pwd->b[4]  = pwt->b[5];
> -        pwd->b[5]  = pws->b[5];
> -        pwd->b[6]  = pwt->b[7];
> -        pwd->b[7]  = pws->b[7];
> -        pwd->b[8]  = pwt->b[9];
> -        pwd->b[9]  = pws->b[9];
> -        pwd->b[10] = pwt->b[11];
> -        pwd->b[11] = pws->b[11];
> -        pwd->b[12] = pwt->b[13];
> -        pwd->b[13] = pws->b[13];
> -        pwd->b[14] = pwt->b[15];
> -        pwd->b[15] = pws->b[15];
> -        break;
> -    case DF_HALF:
> -        pwd->h[0] = pwt->h[1];
> -        pwd->h[1] = pws->h[1];
> -        pwd->h[2] = pwt->h[3];
> -        pwd->h[3] = pws->h[3];
> -        pwd->h[4] = pwt->h[5];
> -        pwd->h[5] = pws->h[5];
> -        pwd->h[6] = pwt->h[7];
> -        pwd->h[7] = pws->h[7];
> -        break;
> -    case DF_WORD:
> -        pwd->w[0] = pwt->w[1];
> -        pwd->w[1] = pws->w[1];
> -        pwd->w[2] = pwt->w[3];
> -        pwd->w[3] = pws->w[3];
> -        break;
> -    case DF_DOUBLE:
> -        pwd->d[0] = pwt->d[1];
> -        pwd->d[1] = pws->d[1];
> -        break;
> -    default:
> -        assert(0);
> -    }
> -}
> -
>  void helper_msa_ilvl_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
>                          uint32_t ws, uint32_t wt)
>  {
> diff --git a/target/mips/translate.c b/target/mips/translate.c
> index b4a1103..101d2de 100644
> --- a/target/mips/translate.c
> +++ b/target/mips/translate.c
> @@ -28889,6 +28889,108 @@ static void gen_msa_bit(CPUMIPSState *env, DisasContext *ctx)
>      tcg_temp_free_i32(tws);
>  }
> 
> +static inline void gen_ilvod_b(CPUMIPSState *env, uint32_t wd,
> +                               uint32_t ws, uint32_t wt) {

Please insert a brief comment before each handler, like in this example:

/*
 * [MSA] ILVOD.B wd, ws, wt
 *
 *   Vector Interleave Odd (byte data elements)
 *
 */

> +    TCGv_i64 t0 = tcg_temp_new_i64();
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +
> +    uint64_t mask = (1ULL << 8) - 1;

There shouldn't be a blank line between declarations. The blank line
should be after the last declaration.

> +    mask |= mask << 16;
> +    mask |= mask << 32;
> +    mask <<= 8;
> +    tcg_gen_movi_i64(t1, 0);
> +
> +    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2], mask);
> +    tcg_gen_shri_i64(t0, t0, 8);
> +    tcg_gen_or_i64(t1, t1, t0);

If t1 is already initialized to 0, what is the purpose of OR-ing?

> +    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
> +    tcg_gen_or_i64(t1, t1, t0);
> +
> +    tcg_gen_mov_i64(msa_wr_d[wd * 2], t1);
> +
> +    tcg_gen_movi_i64(t1, 0);
> +
> +    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2 + 1], mask);
> +    tcg_gen_shri_i64(t0, t0, 8);
> +    tcg_gen_or_i64(t1, t1, t0);
> +    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], mask);
> +    tcg_gen_or_i64(t1, t1, t0);
> +
> +    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], t1);
> +
> +    tcg_temp_free_i64(t0);
> +    tcg_temp_free_i64(t1);
> +}
> +
> +static inline void gen_ilvod_h(CPUMIPSState *env, uint32_t wd,
> +                               uint32_t ws, uint32_t wt) {
> +    TCGv_i64 t0 = tcg_temp_new_i64();
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +
> +    uint64_t mask = (1ULL << 16) - 1;
> +    mask |= mask << 32;
> +    mask <<= 16;
> +    tcg_gen_movi_i64(t1, 0);
> +
> +    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2], mask);
> +    tcg_gen_shri_i64(t0, t0, 16);
> +    tcg_gen_or_i64(t1, t1, t0);
> +    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
> +    tcg_gen_or_i64(t1, t1, t0);
> +
> +    tcg_gen_mov_i64(msa_wr_d[wd * 2], t1);
> +
> +    tcg_gen_movi_i64(t1, 0);
> +
> +    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2 + 1], mask);
> +    tcg_gen_shri_i64(t0, t0, 16);
> +    tcg_gen_or_i64(t1, t1, t0);
> +    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], mask);
> +    tcg_gen_or_i64(t1, t1, t0);
> +
> +    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], t1);
> +
> +    tcg_temp_free_i64(t0);
> +    tcg_temp_free_i64(t1);
> +}
> +
> +static inline void gen_ilvod_w(CPUMIPSState *env, uint32_t wd,
> +                               uint32_t ws, uint32_t wt) {
> +    TCGv_i64 t0 = tcg_temp_new_i64();
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +
> +    uint64_t mask = (1ULL << 32) - 1;
> +    tcg_gen_movi_i64(t1, 0);
> +
> +    mask <<= 32;
> +    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2], mask);
> +    tcg_gen_shri_i64(t0, t0, 32);
> +    tcg_gen_or_i64(t1, t1, t0);
> +    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
> +    tcg_gen_or_i64(t1, t1, t0);
> +
> +    tcg_gen_mov_i64(msa_wr_d[wd * 2], t1);
> +
> +    tcg_gen_movi_i64(t1, 0);
> +
> +    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2 + 1], mask);
> +    tcg_gen_shri_i64(t0, t0, 32);
> +    tcg_gen_or_i64(t1, t1, t0);
> +    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], mask);
> +    tcg_gen_or_i64(t1, t1, t0);
> +
> +    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], t1);
> +
> +    tcg_temp_free_i64(t0);
> +    tcg_temp_free_i64(t1);
> +}
> +
> +static inline void gen_ilvod_d(CPUMIPSState *env, uint32_t wd,
> +                               uint32_t ws, uint32_t wt) {
> +    tcg_gen_mov_i64(msa_wr_d[wd * 2], msa_wr_d[wt * 2 + 1]);
> +    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2 + 1]);
> +}
> +
>  static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx)
>  {
>  #define MASK_MSA_3R(op)    (MASK_MSA_MINOR(op) | (op & (0x7 << 23)))
> @@ -29060,7 +29162,22 @@ static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx)
>          gen_helper_msa_mod_u_df(cpu_env, tdf, twd, tws, twt);
>          break;
>      case OPC_ILVOD_df:
> -        gen_helper_msa_ilvod_df(cpu_env, tdf, twd, tws, twt);
> +        switch (df) {
> +        case DF_BYTE:
> +            gen_ilvod_b(env, wd, ws, wt);
> +            break;
> +        case DF_HALF:
> +            gen_ilvod_h(env, wd, ws, wt);
> +            break;
> +        case DF_WORD:
> +            gen_ilvod_w(env, wd, ws, wt);
> +            break;
> +        case DF_DOUBLE:
> +            gen_ilvod_d(env, wd, ws, wt);
> +            break;
> +        default:
> +            assert(0);
> +        }
>          break;
> 
>      case OPC_DOTP_S_df:
> --
> 2.7.4
> 
> 

Good idea, but the patches needs improvement.

Also, I noticed you based your patches assuming that your previous patches
are applied. You don't do that, unless your previous patches are accepted
to the upstream tree, and they are not. Please base your patches on the
main upstream tree. Besides, the performance numbers you mentioned in the
commit message should use upstream performance as the reference, not the
performance of the your previous version of related patches.

The same comments as above applies to all versions og ILVEV and ILVOD.

Thanks for these efforts! I expect v2 soon.

Aleksandar
Mateja Marjanovic March 18, 2019, 5:21 p.m. UTC | #3
On 15.3.19. 17:46, Aleksandar Markovic wrote:
>> From: Mateja Marjanovic <mateja.marjanovic@rt-rk.com>
>> Subject: [PATCH 1/2] target/mips: Optimize ILVOD.<B|H|W|D> MSA instructions
>>
>> From: Mateja Marjanovic <Mateja.Marjanovic@rt-rk.com>
>>
>> Optimize set of MSA instructions ILVOD, using directly
>> tcg registers and performing logic on them insted of
>> using helpers.
> insted -> instead
Fix in v2.
>> Performance measurement is done by executing the
>> instructions large number of times on a computer
>> with Intel Core i7-3770 CPU @ 3.40GHz×8.
>>
>>   instruction ||    before    ||    after   ||
>> ==============================================
>>   ilvod.b:        ||   66.97 ms   ||  26.34 ms  ||
>>   ilvod.h:        ||   44.75 ms   ||  25.17 ms  ||
>>   ilvod.w:        ||   41.27 ms   ||  24.37 ms  ||
>>   ilvod.d:        ||   41.75 ms   ||  20.50 ms  ||
>>
> Alignment looks wrong.
Fix in v2.
>> Signed-off-by: Mateja Marjanovic <mateja.marjanovic@rt-rk.com>
>> ---
>>   target/mips/helper.h     |   1 -
>>   target/mips/msa_helper.c |  51 --------------------
>>   target/mips/translate.c  | 119 ++++++++++++++++++++++++++++++++++++++++++++++-
>>   3 files changed, 118 insertions(+), 53 deletions(-)
>>
>> diff --git a/target/mips/helper.h b/target/mips/helper.h
>> index a6d687e..d162836 100644
>> --- a/target/mips/helper.h
>> +++ b/target/mips/helper.h
>> @@ -865,7 +865,6 @@ DEF_HELPER_5(msa_pckod_df, void, env, i32, i32, i32, i32)
>>   DEF_HELPER_5(msa_ilvl_df, void, env, i32, i32, i32, i32)
>>   DEF_HELPER_5(msa_ilvr_df, void, env, i32, i32, i32, i32)
>>   DEF_HELPER_5(msa_ilvev_df, void, env, i32, i32, i32, i32)
>> -DEF_HELPER_5(msa_ilvod_df, void, env, i32, i32, i32, i32)
>>   DEF_HELPER_5(msa_vshf_df, void, env, i32, i32, i32, i32)
>>   DEF_HELPER_5(msa_srar_df, void, env, i32, i32, i32, i32)
>>   DEF_HELPER_5(msa_srlr_df, void, env, i32, i32, i32, i32)
>> diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
>> index 9d9dafe..cbcfd57 100644
>> --- a/target/mips/msa_helper.c
>> +++ b/target/mips/msa_helper.c
>> @@ -1363,57 +1363,6 @@ void helper_msa_ilvev_df(CPUMIPSState *env, uint32_t df, uint32_t > wd,
>>       }
>>   }
>>
>> -void helper_msa_ilvod_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
>> -                         uint32_t ws, uint32_t wt)
>> -{
>> -    wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
>> -    wr_t *pws = &(env->active_fpu.fpr[ws].wr);
>> -    wr_t *pwt = &(env->active_fpu.fpr[wt].wr);
>> -
>> -    switch (df) {
>> -    case DF_BYTE:
>> -        pwd->b[0]  = pwt->b[1];
>> -        pwd->b[1]  = pws->b[1];
>> -        pwd->b[2]  = pwt->b[3];
>> -        pwd->b[3]  = pws->b[3];
>> -        pwd->b[4]  = pwt->b[5];
>> -        pwd->b[5]  = pws->b[5];
>> -        pwd->b[6]  = pwt->b[7];
>> -        pwd->b[7]  = pws->b[7];
>> -        pwd->b[8]  = pwt->b[9];
>> -        pwd->b[9]  = pws->b[9];
>> -        pwd->b[10] = pwt->b[11];
>> -        pwd->b[11] = pws->b[11];
>> -        pwd->b[12] = pwt->b[13];
>> -        pwd->b[13] = pws->b[13];
>> -        pwd->b[14] = pwt->b[15];
>> -        pwd->b[15] = pws->b[15];
>> -        break;
>> -    case DF_HALF:
>> -        pwd->h[0] = pwt->h[1];
>> -        pwd->h[1] = pws->h[1];
>> -        pwd->h[2] = pwt->h[3];
>> -        pwd->h[3] = pws->h[3];
>> -        pwd->h[4] = pwt->h[5];
>> -        pwd->h[5] = pws->h[5];
>> -        pwd->h[6] = pwt->h[7];
>> -        pwd->h[7] = pws->h[7];
>> -        break;
>> -    case DF_WORD:
>> -        pwd->w[0] = pwt->w[1];
>> -        pwd->w[1] = pws->w[1];
>> -        pwd->w[2] = pwt->w[3];
>> -        pwd->w[3] = pws->w[3];
>> -        break;
>> -    case DF_DOUBLE:
>> -        pwd->d[0] = pwt->d[1];
>> -        pwd->d[1] = pws->d[1];
>> -        break;
>> -    default:
>> -        assert(0);
>> -    }
>> -}
>> -
>>   void helper_msa_ilvl_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
>>                           uint32_t ws, uint32_t wt)
>>   {
>> diff --git a/target/mips/translate.c b/target/mips/translate.c
>> index b4a1103..101d2de 100644
>> --- a/target/mips/translate.c
>> +++ b/target/mips/translate.c
>> @@ -28889,6 +28889,108 @@ static void gen_msa_bit(CPUMIPSState *env, DisasContext *ctx)
>>       tcg_temp_free_i32(tws);
>>   }
>>
>> +static inline void gen_ilvod_b(CPUMIPSState *env, uint32_t wd,
>> +                               uint32_t ws, uint32_t wt) {
> Please insert a brief comment before each handler, like in this example:
>
> /*
>   * [MSA] ILVOD.B wd, ws, wt
>   *
>   *   Vector Interleave Odd (byte data elements)
>   *
>   */
Will do in v2.
>> +    TCGv_i64 t0 = tcg_temp_new_i64();
>> +    TCGv_i64 t1 = tcg_temp_new_i64();
>> +
>> +    uint64_t mask = (1ULL << 8) - 1;
> There shouldn't be a blank line between declarations. The blank line
> should be after the last declaration.
You are right.
>> +    mask |= mask << 16;
>> +    mask |= mask << 32;
>> +    mask <<= 8;
>> +    tcg_gen_movi_i64(t1, 0);
>> +
>> +    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2], mask);
>> +    tcg_gen_shri_i64(t0, t0, 8);
>> +    tcg_gen_or_i64(t1, t1, t0);
> If t1 is already initialized to 0, what is the purpose of OR-ing?
>
>> +    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
>> +    tcg_gen_or_i64(t1, t1, t0);
>> +
>> +    tcg_gen_mov_i64(msa_wr_d[wd * 2], t1);
>> +
>> +    tcg_gen_movi_i64(t1, 0);
>> +
>> +    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2 + 1], mask);
>> +    tcg_gen_shri_i64(t0, t0, 8);
>> +    tcg_gen_or_i64(t1, t1, t0);
>> +    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], mask);
>> +    tcg_gen_or_i64(t1, t1, t0);
>> +
>> +    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], t1);
>> +
>> +    tcg_temp_free_i64(t0);
>> +    tcg_temp_free_i64(t1);
>> +}
>> +
>> +static inline void gen_ilvod_h(CPUMIPSState *env, uint32_t wd,
>> +                               uint32_t ws, uint32_t wt) {
>> +    TCGv_i64 t0 = tcg_temp_new_i64();
>> +    TCGv_i64 t1 = tcg_temp_new_i64();
>> +
>> +    uint64_t mask = (1ULL << 16) - 1;
>> +    mask |= mask << 32;
>> +    mask <<= 16;
>> +    tcg_gen_movi_i64(t1, 0);
>> +
>> +    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2], mask);
>> +    tcg_gen_shri_i64(t0, t0, 16);
>> +    tcg_gen_or_i64(t1, t1, t0);
>> +    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
>> +    tcg_gen_or_i64(t1, t1, t0);
>> +
>> +    tcg_gen_mov_i64(msa_wr_d[wd * 2], t1);
>> +
>> +    tcg_gen_movi_i64(t1, 0);
>> +
>> +    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2 + 1], mask);
>> +    tcg_gen_shri_i64(t0, t0, 16);
>> +    tcg_gen_or_i64(t1, t1, t0);
>> +    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], mask);
>> +    tcg_gen_or_i64(t1, t1, t0);
>> +
>> +    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], t1);
>> +
>> +    tcg_temp_free_i64(t0);
>> +    tcg_temp_free_i64(t1);
>> +}
>> +
>> +static inline void gen_ilvod_w(CPUMIPSState *env, uint32_t wd,
>> +                               uint32_t ws, uint32_t wt) {
>> +    TCGv_i64 t0 = tcg_temp_new_i64();
>> +    TCGv_i64 t1 = tcg_temp_new_i64();
>> +
>> +    uint64_t mask = (1ULL << 32) - 1;
>> +    tcg_gen_movi_i64(t1, 0);
>> +
>> +    mask <<= 32;
>> +    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2], mask);
>> +    tcg_gen_shri_i64(t0, t0, 32);
>> +    tcg_gen_or_i64(t1, t1, t0);
>> +    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
>> +    tcg_gen_or_i64(t1, t1, t0);
>> +
>> +    tcg_gen_mov_i64(msa_wr_d[wd * 2], t1);
>> +
>> +    tcg_gen_movi_i64(t1, 0);
>> +
>> +    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2 + 1], mask);
>> +    tcg_gen_shri_i64(t0, t0, 32);
>> +    tcg_gen_or_i64(t1, t1, t0);
>> +    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], mask);
>> +    tcg_gen_or_i64(t1, t1, t0);
>> +
>> +    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], t1);
>> +
>> +    tcg_temp_free_i64(t0);
>> +    tcg_temp_free_i64(t1);
>> +}
>> +
>> +static inline void gen_ilvod_d(CPUMIPSState *env, uint32_t wd,
>> +                               uint32_t ws, uint32_t wt) {
>> +    tcg_gen_mov_i64(msa_wr_d[wd * 2], msa_wr_d[wt * 2 + 1]);
>> +    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2 + 1]);
>> +}
>> +
>>   static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx)
>>   {
>>   #define MASK_MSA_3R(op)    (MASK_MSA_MINOR(op) | (op & (0x7 << 23)))
>> @@ -29060,7 +29162,22 @@ static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx)
>>           gen_helper_msa_mod_u_df(cpu_env, tdf, twd, tws, twt);
>>           break;
>>       case OPC_ILVOD_df:
>> -        gen_helper_msa_ilvod_df(cpu_env, tdf, twd, tws, twt);
>> +        switch (df) {
>> +        case DF_BYTE:
>> +            gen_ilvod_b(env, wd, ws, wt);
>> +            break;
>> +        case DF_HALF:
>> +            gen_ilvod_h(env, wd, ws, wt);
>> +            break;
>> +        case DF_WORD:
>> +            gen_ilvod_w(env, wd, ws, wt);
>> +            break;
>> +        case DF_DOUBLE:
>> +            gen_ilvod_d(env, wd, ws, wt);
>> +            break;
>> +        default:
>> +            assert(0);
>> +        }
>>           break;
>>
>>       case OPC_DOTP_S_df:
>> --
>> 2.7.4
>>
>>
> Good idea, but the patches needs improvement.
v2 has much better perfomances than v1, thanks to your advice about 
unnecessary OR-ing and MOV-ing.
> Also, I noticed you based your patches assuming that your previous patches
> are applied. You don't do that, unless your previous patches are accepted
> to the upstream tree, and they are not. Please base your patches on the
> main upstream tree. Besides, the performance numbers you mentioned in the
> commit message should use upstream performance as the reference, not the
> performance of the your previous version of related patches.
>
> The same comments as above applies to all versions og ILVEV and ILVOD.
My mistake.
> Thanks for these efforts! I expect v2 soon.
>
> Aleksandar

Thanks for all the comments.

Mateja
diff mbox series

Patch

==============================================
 ilvod.b:	 ||   66.97 ms   ||  26.34 ms  ||
 ilvod.h:	 ||   44.75 ms   ||  25.17 ms  ||
 ilvod.w:	 ||   41.27 ms   ||  24.37 ms  ||
 ilvod.d:	 ||   41.75 ms   ||  20.50 ms  ||

Signed-off-by: Mateja Marjanovic <mateja.marjanovic@rt-rk.com>
---
 target/mips/helper.h     |   1 -
 target/mips/msa_helper.c |  51 --------------------
 target/mips/translate.c  | 119 ++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 118 insertions(+), 53 deletions(-)

diff --git a/target/mips/helper.h b/target/mips/helper.h
index a6d687e..d162836 100644
--- a/target/mips/helper.h
+++ b/target/mips/helper.h
@@ -865,7 +865,6 @@  DEF_HELPER_5(msa_pckod_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_ilvl_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_ilvr_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_ilvev_df, void, env, i32, i32, i32, i32)
-DEF_HELPER_5(msa_ilvod_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_vshf_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_srar_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_srlr_df, void, env, i32, i32, i32, i32)
diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
index 9d9dafe..cbcfd57 100644
--- a/target/mips/msa_helper.c
+++ b/target/mips/msa_helper.c
@@ -1363,57 +1363,6 @@  void helper_msa_ilvev_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
     }
 }
 
-void helper_msa_ilvod_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
-                         uint32_t ws, uint32_t wt)
-{
-    wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
-    wr_t *pws = &(env->active_fpu.fpr[ws].wr);
-    wr_t *pwt = &(env->active_fpu.fpr[wt].wr);
-
-    switch (df) {
-    case DF_BYTE:
-        pwd->b[0]  = pwt->b[1];
-        pwd->b[1]  = pws->b[1];
-        pwd->b[2]  = pwt->b[3];
-        pwd->b[3]  = pws->b[3];
-        pwd->b[4]  = pwt->b[5];
-        pwd->b[5]  = pws->b[5];
-        pwd->b[6]  = pwt->b[7];
-        pwd->b[7]  = pws->b[7];
-        pwd->b[8]  = pwt->b[9];
-        pwd->b[9]  = pws->b[9];
-        pwd->b[10] = pwt->b[11];
-        pwd->b[11] = pws->b[11];
-        pwd->b[12] = pwt->b[13];
-        pwd->b[13] = pws->b[13];
-        pwd->b[14] = pwt->b[15];
-        pwd->b[15] = pws->b[15];
-        break;
-    case DF_HALF:
-        pwd->h[0] = pwt->h[1];
-        pwd->h[1] = pws->h[1];
-        pwd->h[2] = pwt->h[3];
-        pwd->h[3] = pws->h[3];
-        pwd->h[4] = pwt->h[5];
-        pwd->h[5] = pws->h[5];
-        pwd->h[6] = pwt->h[7];
-        pwd->h[7] = pws->h[7];
-        break;
-    case DF_WORD:
-        pwd->w[0] = pwt->w[1];
-        pwd->w[1] = pws->w[1];
-        pwd->w[2] = pwt->w[3];
-        pwd->w[3] = pws->w[3];
-        break;
-    case DF_DOUBLE:
-        pwd->d[0] = pwt->d[1];
-        pwd->d[1] = pws->d[1];
-        break;
-    default:
-        assert(0);
-    }
-}
-
 void helper_msa_ilvl_df(CPUMIPSState *env, uint32_t df, uint32_t wd,
                         uint32_t ws, uint32_t wt)
 {
diff --git a/target/mips/translate.c b/target/mips/translate.c
index b4a1103..101d2de 100644
--- a/target/mips/translate.c
+++ b/target/mips/translate.c
@@ -28889,6 +28889,108 @@  static void gen_msa_bit(CPUMIPSState *env, DisasContext *ctx)
     tcg_temp_free_i32(tws);
 }
 
+static inline void gen_ilvod_b(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt) {
+    TCGv_i64 t0 = tcg_temp_new_i64();
+    TCGv_i64 t1 = tcg_temp_new_i64();
+
+    uint64_t mask = (1ULL << 8) - 1;
+    mask |= mask << 16;
+    mask |= mask << 32;
+    mask <<= 8;
+    tcg_gen_movi_i64(t1, 0);
+
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2], mask);
+    tcg_gen_shri_i64(t0, t0, 8);
+    tcg_gen_or_i64(t1, t1, t0);
+    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
+    tcg_gen_or_i64(t1, t1, t0);
+
+    tcg_gen_mov_i64(msa_wr_d[wd * 2], t1);
+
+    tcg_gen_movi_i64(t1, 0);
+
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2 + 1], mask);
+    tcg_gen_shri_i64(t0, t0, 8);
+    tcg_gen_or_i64(t1, t1, t0);
+    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], mask);
+    tcg_gen_or_i64(t1, t1, t0);
+
+    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], t1);
+
+    tcg_temp_free_i64(t0);
+    tcg_temp_free_i64(t1);
+}
+
+static inline void gen_ilvod_h(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt) {
+    TCGv_i64 t0 = tcg_temp_new_i64();
+    TCGv_i64 t1 = tcg_temp_new_i64();
+
+    uint64_t mask = (1ULL << 16) - 1;
+    mask |= mask << 32;
+    mask <<= 16;
+    tcg_gen_movi_i64(t1, 0);
+
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2], mask);
+    tcg_gen_shri_i64(t0, t0, 16);
+    tcg_gen_or_i64(t1, t1, t0);
+    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
+    tcg_gen_or_i64(t1, t1, t0);
+
+    tcg_gen_mov_i64(msa_wr_d[wd * 2], t1);
+
+    tcg_gen_movi_i64(t1, 0);
+
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2 + 1], mask);
+    tcg_gen_shri_i64(t0, t0, 16);
+    tcg_gen_or_i64(t1, t1, t0);
+    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], mask);
+    tcg_gen_or_i64(t1, t1, t0);
+
+    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], t1);
+
+    tcg_temp_free_i64(t0);
+    tcg_temp_free_i64(t1);
+}
+
+static inline void gen_ilvod_w(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt) {
+    TCGv_i64 t0 = tcg_temp_new_i64();
+    TCGv_i64 t1 = tcg_temp_new_i64();
+
+    uint64_t mask = (1ULL << 32) - 1;
+    tcg_gen_movi_i64(t1, 0);
+
+    mask <<= 32;
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2], mask);
+    tcg_gen_shri_i64(t0, t0, 32);
+    tcg_gen_or_i64(t1, t1, t0);
+    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2], mask);
+    tcg_gen_or_i64(t1, t1, t0);
+
+    tcg_gen_mov_i64(msa_wr_d[wd * 2], t1);
+
+    tcg_gen_movi_i64(t1, 0);
+
+    tcg_gen_andi_i64(t0, msa_wr_d[wt * 2 + 1], mask);
+    tcg_gen_shri_i64(t0, t0, 32);
+    tcg_gen_or_i64(t1, t1, t0);
+    tcg_gen_andi_i64(t0, msa_wr_d[ws * 2 + 1], mask);
+    tcg_gen_or_i64(t1, t1, t0);
+
+    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], t1);
+
+    tcg_temp_free_i64(t0);
+    tcg_temp_free_i64(t1);
+}
+
+static inline void gen_ilvod_d(CPUMIPSState *env, uint32_t wd,
+                               uint32_t ws, uint32_t wt) {
+    tcg_gen_mov_i64(msa_wr_d[wd * 2], msa_wr_d[wt * 2 + 1]);
+    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2 + 1]);
+}
+
 static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx)
 {
 #define MASK_MSA_3R(op)    (MASK_MSA_MINOR(op) | (op & (0x7 << 23)))
@@ -29060,7 +29162,22 @@  static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx)
         gen_helper_msa_mod_u_df(cpu_env, tdf, twd, tws, twt);
         break;
     case OPC_ILVOD_df:
-        gen_helper_msa_ilvod_df(cpu_env, tdf, twd, tws, twt);
+        switch (df) {
+        case DF_BYTE:
+            gen_ilvod_b(env, wd, ws, wt);
+            break;
+        case DF_HALF:
+            gen_ilvod_h(env, wd, ws, wt);
+            break;
+        case DF_WORD:
+            gen_ilvod_w(env, wd, ws, wt);
+            break;
+        case DF_DOUBLE:
+            gen_ilvod_d(env, wd, ws, wt);
+            break;
+        default:
+            assert(0);
+        }
         break;
 
     case OPC_DOTP_S_df: