diff mbox series

[v6,4/4] target/mips: Optimize ILVR.<B|H|W|D> MSA instructions

Message ID 1554383690-28338-5-git-send-email-mateja.marjanovic@rt-rk.com (mailing list archive)
State New, archived
Headers show
Series target/mips: Optimize MSA interleave instructions | expand

Commit Message

Mateja Marjanovic April 4, 2019, 1:14 p.m. UTC
From: Mateja Marjanovic <Mateja.Marjanovic@rt-rk.com>

Optimized ILVR.<B|H|W|D> instructions, using a hybrid
approach. For byte data elements, use a helper with an
unrolled loop (much better performance), for halfword,
word and doubleword data elements use directly tcg
registers and logic performed on them.

Performance measurement is done by executing the
instructions a large number of times on a computer
with Intel Core i7-3770 CPU @ 3.40GHz×8.

Comments

Aleksandar Markovic April 13, 2019, 4:05 p.m. UTC | #1
On Thu, Apr 4, 2019 at 3:16 PM Mateja Marjanovic
<mateja.marjanovic@rt-rk.com> wrote:
>
> From: Mateja Marjanovic <Mateja.Marjanovic@rt-rk.com>
>
> Optimized ILVR.<B|H|W|D> instructions, using a hybrid

Optimized -> Optimize

> approach. For byte data elements, use a helper with an
> unrolled loop (much better performance), for halfword,

(much better performance) -> (having much better performance
than direct tcg translation)

> word and doubleword data elements use directly tcg
> registers and logic performed on them.
>
> Performance measurement is done by executing the
> instructions a large number of times on a computer
> with Intel Core i7-3770 CPU @ 3.40GHz×8.
>
> ===================================================
> ||  instr  ||  helper  ||    tcg    ||   hybrid  ||
> ===================================================
> || ilvr.b: || 62.87 ms ||  74.76 ms ||  61.52 ms || <-- helper
> || ilvr.h: || 44.11 ms ||  33.00 ms ||  33.55 ms || <-- tcg
> || ilvr.w: || 34.97 ms ||  23.06 ms ||  22.67 ms || <-- tcg
> || ilvr.d: || 27.33 ms ||  19.87 ms ||  20.02 ms || <-- tcg
> ===================================================
>

instr -> instruction

||  61.52 ms || <-- helper  ->  ||  61.52 ms (helper) ||

and similarly for the other three rows.

> Signed-off-by: Mateja Marjanovic <mateja.marjanovic@rt-rk.com>
> ---
>  target/mips/helper.h     |   2 +-
>  target/mips/msa_helper.c |  33 +++++++++++----
>  target/mips/translate.c  | 107 ++++++++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 132 insertions(+), 10 deletions(-)
>
> diff --git a/target/mips/helper.h b/target/mips/helper.h
> index cd73723..d4755ef 100644
> --- a/target/mips/helper.h
> +++ b/target/mips/helper.h
> @@ -862,7 +862,6 @@ DEF_HELPER_5(msa_sld_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_splat_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_pckev_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_pckod_df, void, env, i32, i32, i32, i32)
> -DEF_HELPER_5(msa_ilvr_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_vshf_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_srar_df, void, env, i32, i32, i32, i32)
>  DEF_HELPER_5(msa_srlr_df, void, env, i32, i32, i32, i32)
> @@ -946,6 +945,7 @@ DEF_HELPER_4(msa_insert_w, void, env, i32, i32, i32)
>  DEF_HELPER_4(msa_insert_d, void, env, i32, i32, i32)
>
>  DEF_HELPER_4(msa_ilvl_b, void, env, i32, i32, i32)
> +DEF_HELPER_4(msa_ilvr_b, void, env, i32, i32, i32)
>
>  DEF_HELPER_4(msa_fclass_df, void, env, i32, i32, i32)
>  DEF_HELPER_4(msa_ftrunc_s_df, void, env, i32, i32, i32)
> diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
> index 84bbe6f..2470cef 100644
> --- a/target/mips/msa_helper.c
> +++ b/target/mips/msa_helper.c
> @@ -1181,14 +1181,6 @@ MSA_FN_DF(pckev_df)
>      } while (0)
>  MSA_FN_DF(pckod_df)
>  #undef MSA_DO
> -
> -#define MSA_DO(DF)                      \
> -    do {                                \
> -        pwx->DF[2*i]   = R##DF(pwt, i); \
> -        pwx->DF[2*i+1] = R##DF(pws, i); \
> -    } while (0)
> -MSA_FN_DF(ilvr_df)
> -#undef MSA_DO
>  #undef MSA_LOOP_COND
>
>  #define MSA_LOOP_COND(DF) \
> @@ -1249,6 +1241,31 @@ void helper_msa_ilvl_b(CPUMIPSState *env, uint32_t wd,
>      pwd->b[15] = pws->b[15];
>  }
>
> +void helper_msa_ilvr_b(CPUMIPSState *env, uint32_t wd,
> +                       uint32_t ws, uint32_t wt)
> +{
> +    wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
> +    wr_t *pws = &(env->active_fpu.fpr[ws].wr);
> +    wr_t *pwt = &(env->active_fpu.fpr[wt].wr);
> +

Why do we use env->active_fpu.fpr[wd].wr here, while for the other
instructions in this patch we access msa_wr_d[] directly?

> +    pwd->b[15] = pws->b[7];
> +    pwd->b[14] = pwt->b[7];
> +    pwd->b[13] = pws->b[6];
> +    pwd->b[12] = pwt->b[6];
> +    pwd->b[11] = pws->b[5];
> +    pwd->b[10] = pwt->b[5];
> +    pwd->b[9]  = pws->b[4];
> +    pwd->b[8]  = pwt->b[4];
> +    pwd->b[7]  = pws->b[3];
> +    pwd->b[6]  = pwt->b[3];
> +    pwd->b[5]  = pws->b[2];
> +    pwd->b[4]  = pwt->b[2];
> +    pwd->b[3]  = pws->b[1];
> +    pwd->b[2]  = pwt->b[1];
> +    pwd->b[1]  = pws->b[0];
> +    pwd->b[0]  = pwt->b[0];
> +}
> +
>  void helper_msa_copy_s_b(CPUMIPSState *env, uint32_t rd,
>                           uint32_t ws, uint32_t n)
>  {
> diff --git a/target/mips/translate.c b/target/mips/translate.c
> index 6c6811e..90332fb 100644
> --- a/target/mips/translate.c
> +++ b/target/mips/translate.c
> @@ -28885,6 +28885,96 @@ static void gen_msa_bit(CPUMIPSState *env, DisasContext *ctx)
>  }
>
>  /*
> + * [MSA] ILVR.H wd, ws, wt
> + *
> + *   Vector Interleave Right (halfword data elements)
> + *
> + */
> +static inline void gen_ilvr_h(CPUMIPSState *env, uint32_t wd,
> +                              uint32_t ws, uint32_t wt)
> +{
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +    TCGv_i64 t2 = tcg_temp_new_i64();
> +    uint64_t mask = 0x000000000000ffffULL;
> +
> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
> +    tcg_gen_mov_i64(t2, t1);
> +    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
> +    tcg_gen_shli_i64(t1, t1, 16);
> +    tcg_gen_or_i64(t2, t2, t1);
> +
> +    mask <<= 16;
> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
> +    tcg_gen_shli_i64(t1, t1, 16);
> +    tcg_gen_or_i64(t2, t2, t1);
> +    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
> +    tcg_gen_shli_i64(t1, t1, 32);
> +    tcg_gen_or_i64(msa_wr_d[wd * 2], t2, t1);
> +
> +    mask <<= 16;
> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
> +    tcg_gen_shri_i64(t1, t1, 32);
> +    tcg_gen_mov_i64(t2, t1);
> +    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
> +    tcg_gen_shri_i64(t1, t1, 16);
> +    tcg_gen_or_i64(t2, t2, t1);
> +
> +    mask <<= 16;
> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
> +    tcg_gen_shri_i64(t1, t1, 16);
> +    tcg_gen_or_i64(t2, t2, t1);
> +    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
> +    tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t2, t1);
> +
> +    tcg_temp_free_i64(t1);
> +    tcg_temp_free_i64(t2);
> +}
> +
> +/*
> + * [MSA] ILVR.W wd, ws, wt
> + *
> + *   Vector Interleave Right (word data elements)
> + *
> + */
> +static inline void gen_ilvr_w(CPUMIPSState *env, uint32_t wd,
> +                              uint32_t ws, uint32_t wt)
> +{
> +    TCGv_i64 t1 = tcg_temp_new_i64();
> +    TCGv_i64 t2 = tcg_temp_new_i64();
> +    uint64_t mask = 0x00000000ffffffffULL;

Use tcg_const_i64(). The same for the previous function.

> +
> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
> +    tcg_gen_mov_i64(t2, t1);
> +    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
> +    tcg_gen_shli_i64(t1, t1, 32);
> +    tcg_gen_or_i64(msa_wr_d[wd * 2], t2, t1);
> +
> +    mask <<= 32;

Just assign the constant value to the mask, no need for shift operation.
The same applies for other similar cases in this patch.

> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
> +    tcg_gen_shri_i64(t1, t1, 32);
> +    tcg_gen_mov_i64(t2, t1);
> +    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
> +    tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t2, t1);
> +
> +    tcg_temp_free_i64(t1);
> +    tcg_temp_free_i64(t2);
> +}
> +
> +/*
> + * [MSA] ILVR.D wd, ws, wt
> + *
> + *   Vector Interleave Right (doubleword data elements)
> + *
> + */
> +static inline void gen_ilvr_d(CPUMIPSState *env, uint32_t wd,
> +                              uint32_t ws, uint32_t wt)
> +{
> +    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2]);
> +    tcg_gen_mov_i64(msa_wr_d[wd * 2], msa_wr_d[wt * 2]);
> +}
> +

This function seems to be identical to gen_ilvev_d(). If that is the
case, please rename gen_ilvev_d() to gen_ilvev_ilvr_d() in this patch,
and use it for handling both ILVEV.D and ILVR.D.

> +
> +/*
>   * [MSA] ILVL.B wd, ws, wt
>   *
>   *   Vector Interleave Left (byte data elements)
> @@ -29380,7 +29470,22 @@ static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx)
>          gen_helper_msa_div_u_df(cpu_env, tdf, twd, tws, twt);
>          break;
>      case OPC_ILVR_df:
> -        gen_helper_msa_ilvr_df(cpu_env, tdf, twd, tws, twt);
> +        switch (df) {
> +        case DF_BYTE:
> +            gen_helper_msa_ilvr_b(cpu_env, twd, tws, twt);
> +            break;
> +        case DF_HALF:
> +            gen_ilvr_h(env, wd, ws, wt);
> +            break;
> +        case DF_WORD:
> +            gen_ilvr_w(env, wd, ws, wt);
> +            break;
> +        case DF_DOUBLE:
> +            gen_ilvr_d(env, wd, ws, wt);
> +            break;
> +        default:
> +            assert(0);
> +        }
>          break;
>      case OPC_BINSL_df:
>          gen_helper_msa_binsl_df(cpu_env, tdf, twd, tws, twt);
> --
> 2.7.4
>
>

Thanks,
Aleksandar
Mateja Marjanovic April 15, 2019, 11:24 a.m. UTC | #2
On 13.4.19. 18:05, Aleksandar Markovic wrote:
> On Thu, Apr 4, 2019 at 3:16 PM Mateja Marjanovic
> <mateja.marjanovic@rt-rk.com> wrote:
>> From: Mateja Marjanovic <Mateja.Marjanovic@rt-rk.com>
>>
>> Optimized ILVR.<B|H|W|D> instructions, using a hybrid
> Optimized -> Optimize
>
>> approach. For byte data elements, use a helper with an
>> unrolled loop (much better performance), for halfword,
> (much better performance) -> (having much better performance
> than direct tcg translation)
>
>> word and doubleword data elements use directly tcg
>> registers and logic performed on them.
>>
>> Performance measurement is done by executing the
>> instructions a large number of times on a computer
>> with Intel Core i7-3770 CPU @ 3.40GHz×8.
>>
>> ===================================================
>> ||  instr  ||  helper  ||    tcg    ||   hybrid  ||
>> ===================================================
>> || ilvr.b: || 62.87 ms ||  74.76 ms ||  61.52 ms || <-- helper
>> || ilvr.h: || 44.11 ms ||  33.00 ms ||  33.55 ms || <-- tcg
>> || ilvr.w: || 34.97 ms ||  23.06 ms ||  22.67 ms || <-- tcg
>> || ilvr.d: || 27.33 ms ||  19.87 ms ||  20.02 ms || <-- tcg
>> ===================================================
>>
> instr -> instruction
>
> ||  61.52 ms || <-- helper  ->  ||  61.52 ms (helper) ||
>
> and similarly for the other three rows.
I will change those three in v7.
>
>> Signed-off-by: Mateja Marjanovic <mateja.marjanovic@rt-rk.com>
>> ---
>>   target/mips/helper.h     |   2 +-
>>   target/mips/msa_helper.c |  33 +++++++++++----
>>   target/mips/translate.c  | 107 ++++++++++++++++++++++++++++++++++++++++++++++-
>>   3 files changed, 132 insertions(+), 10 deletions(-)
>>
>> diff --git a/target/mips/helper.h b/target/mips/helper.h
>> index cd73723..d4755ef 100644
>> --- a/target/mips/helper.h
>> +++ b/target/mips/helper.h
>> @@ -862,7 +862,6 @@ DEF_HELPER_5(msa_sld_df, void, env, i32, i32, i32, i32)
>>   DEF_HELPER_5(msa_splat_df, void, env, i32, i32, i32, i32)
>>   DEF_HELPER_5(msa_pckev_df, void, env, i32, i32, i32, i32)
>>   DEF_HELPER_5(msa_pckod_df, void, env, i32, i32, i32, i32)
>> -DEF_HELPER_5(msa_ilvr_df, void, env, i32, i32, i32, i32)
>>   DEF_HELPER_5(msa_vshf_df, void, env, i32, i32, i32, i32)
>>   DEF_HELPER_5(msa_srar_df, void, env, i32, i32, i32, i32)
>>   DEF_HELPER_5(msa_srlr_df, void, env, i32, i32, i32, i32)
>> @@ -946,6 +945,7 @@ DEF_HELPER_4(msa_insert_w, void, env, i32, i32, i32)
>>   DEF_HELPER_4(msa_insert_d, void, env, i32, i32, i32)
>>
>>   DEF_HELPER_4(msa_ilvl_b, void, env, i32, i32, i32)
>> +DEF_HELPER_4(msa_ilvr_b, void, env, i32, i32, i32)
>>
>>   DEF_HELPER_4(msa_fclass_df, void, env, i32, i32, i32)
>>   DEF_HELPER_4(msa_ftrunc_s_df, void, env, i32, i32, i32)
>> diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
>> index 84bbe6f..2470cef 100644
>> --- a/target/mips/msa_helper.c
>> +++ b/target/mips/msa_helper.c
>> @@ -1181,14 +1181,6 @@ MSA_FN_DF(pckev_df)
>>       } while (0)
>>   MSA_FN_DF(pckod_df)
>>   #undef MSA_DO
>> -
>> -#define MSA_DO(DF)                      \
>> -    do {                                \
>> -        pwx->DF[2*i]   = R##DF(pwt, i); \
>> -        pwx->DF[2*i+1] = R##DF(pws, i); \
>> -    } while (0)
>> -MSA_FN_DF(ilvr_df)
>> -#undef MSA_DO
>>   #undef MSA_LOOP_COND
>>
>>   #define MSA_LOOP_COND(DF) \
>> @@ -1249,6 +1241,31 @@ void helper_msa_ilvl_b(CPUMIPSState *env, uint32_t wd,
>>       pwd->b[15] = pws->b[15];
>>   }
>>
>> +void helper_msa_ilvr_b(CPUMIPSState *env, uint32_t wd,
>> +                       uint32_t ws, uint32_t wt)
>> +{
>> +    wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
>> +    wr_t *pws = &(env->active_fpu.fpr[ws].wr);
>> +    wr_t *pwt = &(env->active_fpu.fpr[wt].wr);
>> +
> Why do we use here env->active_fpu.fpr[wd].wr, while for other instructions in
> this patch, we access msa_wr_d<b|h|w|d[] directly?
With a pointer to wr_t we have an array of bytes, halfwords, words or
doublewords, and can read and modify the elements like an ordinary array.
In the other cases we use a TCGv_i64 variable and would have to use tcg_gen
functions to modify the value of the register. Before my changes, the ilvr
instruction helpers used env->active_fpu.fpr[wd].wr, so I just copy-pasted that.
>
>> +    pwd->b[15] = pws->b[7];
>> +    pwd->b[14] = pwt->b[7];
>> +    pwd->b[13] = pws->b[6];
>> +    pwd->b[12] = pwt->b[6];
>> +    pwd->b[11] = pws->b[5];
>> +    pwd->b[10] = pwt->b[5];
>> +    pwd->b[9]  = pws->b[4];
>> +    pwd->b[8]  = pwt->b[4];
>> +    pwd->b[7]  = pws->b[3];
>> +    pwd->b[6]  = pwt->b[3];
>> +    pwd->b[5]  = pws->b[2];
>> +    pwd->b[4]  = pwt->b[2];
>> +    pwd->b[3]  = pws->b[1];
>> +    pwd->b[2]  = pwt->b[1];
>> +    pwd->b[1]  = pws->b[0];
>> +    pwd->b[0]  = pwt->b[0];
>> +}
>> +
>>   void helper_msa_copy_s_b(CPUMIPSState *env, uint32_t rd,
>>                            uint32_t ws, uint32_t n)
>>   {
>> diff --git a/target/mips/translate.c b/target/mips/translate.c
>> index 6c6811e..90332fb 100644
>> --- a/target/mips/translate.c
>> +++ b/target/mips/translate.c
>> @@ -28885,6 +28885,96 @@ static void gen_msa_bit(CPUMIPSState *env, DisasContext *ctx)
>>   }
>>
>>   /*
>> + * [MSA] ILVR.H wd, ws, wt
>> + *
>> + *   Vector Interleave Right (halfword data elements)
>> + *
>> + */
>> +static inline void gen_ilvr_h(CPUMIPSState *env, uint32_t wd,
>> +                              uint32_t ws, uint32_t wt)
>> +{
>> +    TCGv_i64 t1 = tcg_temp_new_i64();
>> +    TCGv_i64 t2 = tcg_temp_new_i64();
>> +    uint64_t mask = 0x000000000000ffffULL;
>> +
>> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
>> +    tcg_gen_mov_i64(t2, t1);
>> +    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
>> +    tcg_gen_shli_i64(t1, t1, 16);
>> +    tcg_gen_or_i64(t2, t2, t1);
>> +
>> +    mask <<= 16;
>> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
>> +    tcg_gen_shli_i64(t1, t1, 16);
>> +    tcg_gen_or_i64(t2, t2, t1);
>> +    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
>> +    tcg_gen_shli_i64(t1, t1, 32);
>> +    tcg_gen_or_i64(msa_wr_d[wd * 2], t2, t1);
>> +
>> +    mask <<= 16;
>> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
>> +    tcg_gen_shri_i64(t1, t1, 32);
>> +    tcg_gen_mov_i64(t2, t1);
>> +    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
>> +    tcg_gen_shri_i64(t1, t1, 16);
>> +    tcg_gen_or_i64(t2, t2, t1);
>> +
>> +    mask <<= 16;
>> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
>> +    tcg_gen_shri_i64(t1, t1, 16);
>> +    tcg_gen_or_i64(t2, t2, t1);
>> +    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
>> +    tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t2, t1);
>> +
>> +    tcg_temp_free_i64(t1);
>> +    tcg_temp_free_i64(t2);
>> +}
>> +
>> +/*
>> + * [MSA] ILVR.W wd, ws, wt
>> + *
>> + *   Vector Interleave Right (word data elements)
>> + *
>> + */
>> +static inline void gen_ilvr_w(CPUMIPSState *env, uint32_t wd,
>> +                              uint32_t ws, uint32_t wt)
>> +{
>> +    TCGv_i64 t1 = tcg_temp_new_i64();
>> +    TCGv_i64 t2 = tcg_temp_new_i64();
>> +    uint64_t mask = 0x00000000ffffffffULL;
> Use tcg_const_i64(). The same for the previous function.
Will do in v7.
>
>> +
>> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
>> +    tcg_gen_mov_i64(t2, t1);
>> +    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
>> +    tcg_gen_shli_i64(t1, t1, 32);
>> +    tcg_gen_or_i64(msa_wr_d[wd * 2], t2, t1);
>> +
>> +    mask <<= 32;
> Just assign the constant value to the mask, no need for shift operation.
> The same applies for other similar cases in this patch.
I was not sure which would perform better, so I went with shifting the
mask. I will also try assigning the constant directly and compare the
performance of the two.
>> +    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], mask);
>> +    tcg_gen_shri_i64(t1, t1, 32);
>> +    tcg_gen_mov_i64(t2, t1);
>> +    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], mask);
>> +    tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t2, t1);
>> +
>> +    tcg_temp_free_i64(t1);
>> +    tcg_temp_free_i64(t2);
>> +}
>> +
>> +/*
>> + * [MSA] ILVR.D wd, ws, wt
>> + *
>> + *   Vector Interleave Right (doubleword data elements)
>> + *
>> + */
>> +static inline void gen_ilvr_d(CPUMIPSState *env, uint32_t wd,
>> +                              uint32_t ws, uint32_t wt)
>> +{
>> +    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2]);
>> +    tcg_gen_mov_i64(msa_wr_d[wd * 2], msa_wr_d[wt * 2]);
>> +}
>> +
> This function seems to be identical to gen_ilvev_d(). If that is the
> case, please rename gen_ilvev_d() to gen_ilvev_ilvr_d() in this patch,
> and use it for handling both ILVEV.D and ILVR.D.
I didn't notice that. I will check, and if you are right, I will do that in v7.
>> +
>> +/*
>>    * [MSA] ILVL.B wd, ws, wt
>>    *
>>    *   Vector Interleave Left (byte data elements)
>> @@ -29380,7 +29470,22 @@ static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx)
>>           gen_helper_msa_div_u_df(cpu_env, tdf, twd, tws, twt);
>>           break;
>>       case OPC_ILVR_df:
>> -        gen_helper_msa_ilvr_df(cpu_env, tdf, twd, tws, twt);
>> +        switch (df) {
>> +        case DF_BYTE:
>> +            gen_helper_msa_ilvr_b(cpu_env, twd, tws, twt);
>> +            break;
>> +        case DF_HALF:
>> +            gen_ilvr_h(env, wd, ws, wt);
>> +            break;
>> +        case DF_WORD:
>> +            gen_ilvr_w(env, wd, ws, wt);
>> +            break;
>> +        case DF_DOUBLE:
>> +            gen_ilvr_d(env, wd, ws, wt);
>> +            break;
>> +        default:
>> +            assert(0);
>> +        }
>>           break;
>>       case OPC_BINSL_df:
>>           gen_helper_msa_binsl_df(cpu_env, tdf, twd, tws, twt);
>> --
>> 2.7.4
>>
>>
> Thanks,
> Aleksandar
Thanks,
Mateja
Aleksandar Markovic April 16, 2019, 9:20 p.m. UTC | #3
> From: Mateja Marjanovic <Mateja.Marjanovic@rt-rk.com>
> >>
> >> +void helper_msa_ilvr_b(CPUMIPSState *env, uint32_t wd,
> >> +                       uint32_t ws, uint32_t wt)
> >> +{
> >> +    wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
> >> +    wr_t *pws = &(env->active_fpu.fpr[ws].wr);
> >> +    wr_t *pwt = &(env->active_fpu.fpr[wt].wr);
> >> +
> > Why do we use here env->active_fpu.fpr[wd].wr, while for other instructions in
> > this patch, we access msa_wr_d<b|h|w|d[] directly?
> With a pointer to wr_t we have an array of bytes, halfwords, words or
> doublewords
> and can read from them and change them like an ordinary array. In other
> cases
> we use a variable that is TCGv_i64 and would have to use tcg_gen
> functions to
> modify the value of the register. Before my changes in ilvr instruction
> helpers
> env->active_fpu.fpr[wd].wr was used, so I just copy-pasted that.
>

Your answer just touches the surface, and doesn't fully answer my question.
I would like you to show a deeper understanding of the code you are working
with. You can't just copy/paste without thinking.

Why do the majority of MSA helpers use env->active_fpu.fpr[<index>].wr, while
your code mostly references the MSA registers directly? Is this the same
thing? If yes, why doesn't all MSA code use the registers directly, which
would certainly be simpler than referencing active_fpu? What is the role
of "active_fpu"? Can it be changed? Can you analyze the underlying
reasons for referencing "active_fpu", and can you claim that it is safe
to circumvent it and reference the MSA registers directly?

Thanks,
Aleksandar
Mateja Marjanovic April 17, 2019, 8:16 a.m. UTC | #4
On 16.4.19. 23:20, Aleksandar Markovic wrote:
>> From: Mateja Marjanovic <Mateja.Marjanovic@rt-rk.com>
>>>> +void helper_msa_ilvr_b(CPUMIPSState *env, uint32_t wd,
>>>> +                       uint32_t ws, uint32_t wt)
>>>> +{
>>>> +    wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
>>>> +    wr_t *pws = &(env->active_fpu.fpr[ws].wr);
>>>> +    wr_t *pwt = &(env->active_fpu.fpr[wt].wr);
>>>> +
>>> Why do we use here env->active_fpu.fpr[wd].wr, while for other instructions in
>>> this patch, we access msa_wr_d<b|h|w|d[] directly?
>> With a pointer to wr_t we have an array of bytes, halfwords, words or
>> doublewords
>> and can read from them and change them like an ordinary array. In other
>> cases
>> we use a variable that is TCGv_i64 and would have to use tcg_gen
>> functions to
>> modify the value of the register. Before my changes in ilvr instruction
>> helpers
>> env->active_fpu.fpr[wd].wr was used, so I just copy-pasted that.
>>
> Your answer touches just surface, and doesn't fully answer my question.
> I would like you to show deeper understanding of the code you are working
> with. You can't just copy/paste without thinking.
>
> Why do majority of MSA helpers use env->active_fpu.fpr[<index>].wr, while
> your code mostly reference the MSA register directly? Is this the same
> thing? If yes, why all MSA code doesn't use registers directly, which
> would certainly be simpler than referencing active_fpu? What is the role
> of "active_fpu"? Can it be changed? Can you analyze the underlying
> reasons for referencing "active_fpu", and can you claim that it is safe
> to circumvent it and reference the MSA registers directly?
I will look into that, and try to analyze it and understand it.
Thanks,
Mateja
> Thanks,
> Aleksandar
diff mbox series

Patch

===================================================
||  instr  ||  helper  ||    tcg    ||   hybrid  ||
===================================================
|| ilvr.b: || 62.87 ms ||  74.76 ms ||  61.52 ms || <-- helper
|| ilvr.h: || 44.11 ms ||  33.00 ms ||  33.55 ms || <-- tcg
|| ilvr.w: || 34.97 ms ||  23.06 ms ||  22.67 ms || <-- tcg
|| ilvr.d: || 27.33 ms ||  19.87 ms ||  20.02 ms || <-- tcg
===================================================

Signed-off-by: Mateja Marjanovic <mateja.marjanovic@rt-rk.com>
---
 target/mips/helper.h     |   2 +-
 target/mips/msa_helper.c |  33 +++++++++++----
 target/mips/translate.c  | 107 ++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 132 insertions(+), 10 deletions(-)

diff --git a/target/mips/helper.h b/target/mips/helper.h
index cd73723..d4755ef 100644
--- a/target/mips/helper.h
+++ b/target/mips/helper.h
@@ -862,7 +862,6 @@  DEF_HELPER_5(msa_sld_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_splat_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_pckev_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_pckod_df, void, env, i32, i32, i32, i32)
-DEF_HELPER_5(msa_ilvr_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_vshf_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_srar_df, void, env, i32, i32, i32, i32)
 DEF_HELPER_5(msa_srlr_df, void, env, i32, i32, i32, i32)
@@ -946,6 +945,7 @@  DEF_HELPER_4(msa_insert_w, void, env, i32, i32, i32)
 DEF_HELPER_4(msa_insert_d, void, env, i32, i32, i32)
 
 DEF_HELPER_4(msa_ilvl_b, void, env, i32, i32, i32)
+DEF_HELPER_4(msa_ilvr_b, void, env, i32, i32, i32)
 
 DEF_HELPER_4(msa_fclass_df, void, env, i32, i32, i32)
 DEF_HELPER_4(msa_ftrunc_s_df, void, env, i32, i32, i32)
diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
index 84bbe6f..2470cef 100644
--- a/target/mips/msa_helper.c
+++ b/target/mips/msa_helper.c
@@ -1181,14 +1181,6 @@  MSA_FN_DF(pckev_df)
     } while (0)
 MSA_FN_DF(pckod_df)
 #undef MSA_DO
-
-#define MSA_DO(DF)                      \
-    do {                                \
-        pwx->DF[2*i]   = R##DF(pwt, i); \
-        pwx->DF[2*i+1] = R##DF(pws, i); \
-    } while (0)
-MSA_FN_DF(ilvr_df)
-#undef MSA_DO
 #undef MSA_LOOP_COND
 
 #define MSA_LOOP_COND(DF) \
@@ -1249,6 +1241,31 @@  void helper_msa_ilvl_b(CPUMIPSState *env, uint32_t wd,
     pwd->b[15] = pws->b[15];
 }
 
+void helper_msa_ilvr_b(CPUMIPSState *env, uint32_t wd,
+                       uint32_t ws, uint32_t wt)
+{
+    wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
+    wr_t *pws = &(env->active_fpu.fpr[ws].wr);
+    wr_t *pwt = &(env->active_fpu.fpr[wt].wr);
+    /* Descending order: each source byte is read before it is overwritten. */
+    pwd->b[15] = pws->b[7];
+    pwd->b[14] = pwt->b[7];
+    pwd->b[13] = pws->b[6];
+    pwd->b[12] = pwt->b[6];
+    pwd->b[11] = pws->b[5];
+    pwd->b[10] = pwt->b[5];
+    pwd->b[9]  = pws->b[4];
+    pwd->b[8]  = pwt->b[4];
+    pwd->b[7]  = pws->b[3];
+    pwd->b[6]  = pwt->b[3];
+    pwd->b[5]  = pws->b[2];
+    pwd->b[4]  = pwt->b[2];
+    pwd->b[3]  = pws->b[1];
+    pwd->b[2]  = pwt->b[1];
+    pwd->b[1]  = pws->b[0];
+    pwd->b[0]  = pwt->b[0];
+}
+
 void helper_msa_copy_s_b(CPUMIPSState *env, uint32_t rd,
                          uint32_t ws, uint32_t n)
 {
diff --git a/target/mips/translate.c b/target/mips/translate.c
index 6c6811e..90332fb 100644
--- a/target/mips/translate.c
+++ b/target/mips/translate.c
@@ -28885,6 +28885,96 @@  static void gen_msa_bit(CPUMIPSState *env, DisasContext *ctx)
 }
 
 /*
+ * [MSA] ILVR.H wd, ws, wt
+ *
+ *   Vector Interleave Right (halfword data elements)
+ *   wd.h[2i] = wt.h[i], wd.h[2i + 1] = ws.h[i]  (i = 0..3)
+ */
+static inline void gen_ilvr_h(CPUMIPSState *env, uint32_t wd,
+                              uint32_t ws, uint32_t wt)
+{
+    TCGv_i64 t1 = tcg_temp_new_i64();
+    TCGv_i64 t2 = tcg_temp_new_i64();
+
+    /*
+     * Write msa_wr_d[wd * 2 + 1] first and msa_wr_d[wd * 2] last: the inputs
+     * are msa_wr_d[ws * 2] and msa_wr_d[wt * 2], so wd may equal ws or wt.
+     */
+    /* bits 32..47 of wt -> 0..15, bits 32..47 of ws -> 16..31 */
+    tcg_gen_andi_i64(t2, msa_wr_d[wt * 2], 0x0000ffff00000000ULL);
+    tcg_gen_shri_i64(t2, t2, 32);
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], 0x0000ffff00000000ULL);
+    tcg_gen_shri_i64(t1, t1, 16);
+    tcg_gen_or_i64(t2, t2, t1);
+    /* bits 48..63 of wt -> 32..47, bits 48..63 of ws stay in place */
+    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], 0xffff000000000000ULL);
+    tcg_gen_shri_i64(t1, t1, 16);
+    tcg_gen_or_i64(t2, t2, t1);
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], 0xffff000000000000ULL);
+    tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t2, t1);
+
+    /* bits 0..15 of wt stay in place, bits 0..15 of ws -> 16..31 */
+    tcg_gen_andi_i64(t2, msa_wr_d[wt * 2], 0x000000000000ffffULL);
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], 0x000000000000ffffULL);
+    tcg_gen_shli_i64(t1, t1, 16);
+    tcg_gen_or_i64(t2, t2, t1);
+    /* bits 16..31 of wt -> 32..47, bits 16..31 of ws -> 48..63 */
+    tcg_gen_andi_i64(t1, msa_wr_d[wt * 2], 0x00000000ffff0000ULL);
+    tcg_gen_shli_i64(t1, t1, 16);
+    tcg_gen_or_i64(t2, t2, t1);
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], 0x00000000ffff0000ULL);
+    tcg_gen_shli_i64(t1, t1, 32);
+    tcg_gen_or_i64(msa_wr_d[wd * 2], t2, t1);
+
+    tcg_temp_free_i64(t1);
+    tcg_temp_free_i64(t2);
+}
+
+/*
+ * [MSA] ILVR.W wd, ws, wt
+ *
+ *   Vector Interleave Right (word data elements)
+ *   wd.w[2i] = wt.w[i], wd.w[2i + 1] = ws.w[i]  (i = 0..1)
+ */
+static inline void gen_ilvr_w(CPUMIPSState *env, uint32_t wd,
+                              uint32_t ws, uint32_t wt)
+{
+    TCGv_i64 t1 = tcg_temp_new_i64();
+    TCGv_i64 t2 = tcg_temp_new_i64();
+
+    /*
+     * Write msa_wr_d[wd * 2 + 1] first and msa_wr_d[wd * 2] last: the inputs
+     * are msa_wr_d[ws * 2] and msa_wr_d[wt * 2], so wd may equal ws or wt.
+     */
+    /* bits 32..63 of wt -> 0..31, bits 32..63 of ws stay in place */
+    tcg_gen_shri_i64(t2, msa_wr_d[wt * 2], 32);
+    tcg_gen_andi_i64(t1, msa_wr_d[ws * 2], 0xffffffff00000000ULL);
+    tcg_gen_or_i64(msa_wr_d[wd * 2 + 1], t2, t1);
+
+    /* bits 0..31 of wt stay in place, bits 0..31 of ws -> 32..63 */
+    tcg_gen_andi_i64(t2, msa_wr_d[wt * 2], 0x00000000ffffffffULL);
+    tcg_gen_shli_i64(t1, msa_wr_d[ws * 2], 32);
+    tcg_gen_or_i64(msa_wr_d[wd * 2], t2, t1);
+
+    tcg_temp_free_i64(t1);
+    tcg_temp_free_i64(t2);
+}
+
+/*
+ * [MSA] ILVR.D wd, ws, wt
+ *
+ *   Vector Interleave Right (doubleword data elements)
+ *   The ws move comes first so that wd == ws still reads the original ws.
+ */
+static inline void gen_ilvr_d(CPUMIPSState *env, uint32_t wd,
+                              uint32_t ws, uint32_t wt)
+{
+    tcg_gen_mov_i64(msa_wr_d[wd * 2 + 1], msa_wr_d[ws * 2]);
+    tcg_gen_mov_i64(msa_wr_d[wd * 2], msa_wr_d[wt * 2]);
+}
+
+
+/*
  * [MSA] ILVL.B wd, ws, wt
  *
  *   Vector Interleave Left (byte data elements)
@@ -29380,7 +29470,22 @@  static void gen_msa_3r(CPUMIPSState *env, DisasContext *ctx)
         gen_helper_msa_div_u_df(cpu_env, tdf, twd, tws, twt);
         break;
     case OPC_ILVR_df:
-        gen_helper_msa_ilvr_df(cpu_env, tdf, twd, tws, twt);
+        switch (df) {
+        case DF_BYTE:
+            gen_helper_msa_ilvr_b(cpu_env, twd, tws, twt);
+            break;
+        case DF_HALF:
+            gen_ilvr_h(env, wd, ws, wt);
+            break;
+        case DF_WORD:
+            gen_ilvr_w(env, wd, ws, wt);
+            break;
+        case DF_DOUBLE:
+            gen_ilvr_d(env, wd, ws, wt);
+            break;
+        default:
+            assert(0);
+        }
         break;
     case OPC_BINSL_df:
         gen_helper_msa_binsl_df(cpu_env, tdf, twd, tws, twt);