diff mbox series

[1/2] target/mips: Improve performance for MSA binary operations

Message ID 1551718283-4487-2-git-send-email-mateja.marjanovic@rt-rk.com (mailing list archive)
State New, archived
Headers show
Series target/mips: Improve performance for MSA binary operations | expand

Commit Message

Mateja Marjanovic March 4, 2019, 4:51 p.m. UTC
From: Mateja Marjanovic <Mateja.Marjanovic@rt-rk.com>

Eliminate loops for better performance.

Signed-off-by: Mateja Marjanovic <mateja.marjanovic@rt-rk.com>
---
 target/mips/msa_helper.c | 43 ++++++++++++++++++++++++++++++-------------
 1 file changed, 30 insertions(+), 13 deletions(-)

Comments

Aleksandar Markovic June 1, 2019, 2:16 p.m. UTC | #1
> From: Mateja Marjanovic <mateja.marjanovic@rt-rk.com>
> Sent: Monday, March 4, 2019 5:51 PM
> To: qemu-devel@nongnu.org
> Cc: aurelien@aurel32.net; Aleksandar Markovic; Aleksandar Rikalo
> Subject: [PATCH 1/2] target/mips: Improve performance for MSA binary operations
> 
> From: Mateja Marjanovic <Mateja.Marjanovic@rt-rk.com>
> 
> Eliminate loops for better performance.
> 
> Signed-off-by: Mateja Marjanovic <mateja.marjanovic@rt-rk.com>
> ---
>  target/mips/msa_helper.c | 43 ++++++++++++++++++++++++++++++-------------
>  1 file changed, 30 insertions(+), 13 deletions(-)
> 

The commit message should be a little bit more informative - for example,
it could list the affected instructions. Please consider other groups of
MSA instructions that are implemented via helpers that use similar "for"
loops. Otherwise:

Reviewed-by: Aleksandar Markovic <amarkovic@wavecomp.com>

> diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
> index 4c7ec05..1152fda 100644
> --- a/target/mips/msa_helper.c
> +++ b/target/mips/msa_helper.c
> @@ -804,28 +804,45 @@ void helper_msa_ ## func ## _df(CPUMIPSState *env, uint32_t > df,         \
>      wr_t *pwd = &(env->active_fpu.fpr[wd].wr);                          \
>      wr_t *pws = &(env->active_fpu.fpr[ws].wr);                          \
>      wr_t *pwt = &(env->active_fpu.fpr[wt].wr);                          \
> -    uint32_t i;                                                         \
>                                                                          \
>      switch (df) {                                                       \
>      case DF_BYTE:                                                       \
> -        for (i = 0; i < DF_ELEMENTS(DF_BYTE); i++) {                    \
> -            pwd->b[i] = msa_ ## func ## _df(df, pws->b[i], pwt->b[i]);  \
> -        }                                                               \
> +        pwd->b[0]  = msa_ ## func ## _df(df, pws->b[0], pwt->b[0]);     \
> +        pwd->b[1]  = msa_ ## func ## _df(df, pws->b[1], pwt->b[1]);     \
> +        pwd->b[2]  = msa_ ## func ## _df(df, pws->b[2], pwt->b[2]);     \
> +        pwd->b[3]  = msa_ ## func ## _df(df, pws->b[3], pwt->b[3]);     \
> +        pwd->b[4]  = msa_ ## func ## _df(df, pws->b[4], pwt->b[4]);     \
> +        pwd->b[5]  = msa_ ## func ## _df(df, pws->b[5], pwt->b[5]);     \
> +        pwd->b[6]  = msa_ ## func ## _df(df, pws->b[6], pwt->b[6]);     \
> +        pwd->b[7]  = msa_ ## func ## _df(df, pws->b[7], pwt->b[7]);     \
> +        pwd->b[8]  = msa_ ## func ## _df(df, pws->b[8], pwt->b[8]);     \
> +        pwd->b[9]  = msa_ ## func ## _df(df, pws->b[9], pwt->b[9]);     \
> +        pwd->b[10] = msa_ ## func ## _df(df, pws->b[10], pwt->b[10]);   \
> +        pwd->b[11] = msa_ ## func ## _df(df, pws->b[11], pwt->b[11]);   \
> +        pwd->b[12] = msa_ ## func ## _df(df, pws->b[12], pwt->b[12]);   \
> +        pwd->b[13] = msa_ ## func ## _df(df, pws->b[13], pwt->b[13]);   \
> +        pwd->b[14] = msa_ ## func ## _df(df, pws->b[14], pwt->b[14]);   \
> +        pwd->b[15] = msa_ ## func ## _df(df, pws->b[15], pwt->b[15]);   \
>          break;                                                          \
>      case DF_HALF:                                                       \
> -        for (i = 0; i < DF_ELEMENTS(DF_HALF); i++) {                    \
> -            pwd->h[i] = msa_ ## func ## _df(df, pws->h[i], pwt->h[i]);  \
> -        }                                                               \
> +        pwd->h[0] = msa_ ## func ## _df(df, pws->h[0], pwt->h[0]);      \
> +        pwd->h[1] = msa_ ## func ## _df(df, pws->h[1], pwt->h[1]);      \
> +        pwd->h[2] = msa_ ## func ## _df(df, pws->h[2], pwt->h[2]);      \
> +        pwd->h[3] = msa_ ## func ## _df(df, pws->h[3], pwt->h[3]);      \
> +        pwd->h[4] = msa_ ## func ## _df(df, pws->h[4], pwt->h[4]);      \
> +        pwd->h[5] = msa_ ## func ## _df(df, pws->h[5], pwt->h[5]);      \
> +        pwd->h[6] = msa_ ## func ## _df(df, pws->h[6], pwt->h[6]);      \
> +        pwd->h[7] = msa_ ## func ## _df(df, pws->h[7], pwt->h[7]);      \
>          break;                                                          \
>      case DF_WORD:                                                       \
> -        for (i = 0; i < DF_ELEMENTS(DF_WORD); i++) {                    \
> -            pwd->w[i] = msa_ ## func ## _df(df, pws->w[i], pwt->w[i]);  \
> -        }                                                               \
> +        pwd->w[0] = msa_ ## func ## _df(df, pws->w[0], pwt->w[0]);      \
> +        pwd->w[1] = msa_ ## func ## _df(df, pws->w[1], pwt->w[1]);      \
> +        pwd->w[2] = msa_ ## func ## _df(df, pws->w[2], pwt->w[2]);      \
> +        pwd->w[3] = msa_ ## func ## _df(df, pws->w[3], pwt->w[3]);      \
>          break;                                                          \
>      case DF_DOUBLE:                                                     \
> -        for (i = 0; i < DF_ELEMENTS(DF_DOUBLE); i++) {                  \
> -            pwd->d[i] = msa_ ## func ## _df(df, pws->d[i], pwt->d[i]);  \
> -        }                                                               \
> +        pwd->d[0] = msa_ ## func ## _df(df, pws->d[0], pwt->d[0]);      \
> +        pwd->d[1] = msa_ ## func ## _df(df, pws->d[1], pwt->d[1]);      \
>          break;                                                          \
>      default:                                                            \
>          assert(0);                                                      \
> --
> 2.7.4
> 
>
Aleksandar Markovic June 2, 2019, 7:06 a.m. UTC | #2
On Jun 1, 2019 4:16 PM, "Aleksandar Markovic" <amarkovic@wavecomp.com>
wrote:
>
> > From: Mateja Marjanovic <mateja.marjanovic@rt-rk.com>
> > Sent: Monday, March 4, 2019 5:51 PM
> > To: qemu-devel@nongnu.org
> > Cc: aurelien@aurel32.net; Aleksandar Markovic; Aleksandar Rikalo
> > Subject: [PATCH 1/2] target/mips: Improve performance for MSA binary
operations
> >
> > From: Mateja Marjanovic <Mateja.Marjanovic@rt-rk.com>
> >
> > Eliminate loops for better performance.
> >
> > Signed-off-by: Mateja Marjanovic <mateja.marjanovic@rt-rk.com>
> > ---
> >  target/mips/msa_helper.c | 43
++++++++++++++++++++++++++++++-------------
> >  1 file changed, 30 insertions(+), 13 deletions(-)
> >
>
> The commit message should be a little bit more informative - for example,
> it could list the affected instructions. Please consider other groups of
> MSA instructions that are implemented via helpers that use similar "for"
> loops. Otherwise:
>
> Reviewed-by: Aleksandar Markovic <amarkovic@wavecomp.com>
>

Mateja, you don't need to do anything regarding this patch, I am going to
fix the issues while appying.

Thanks, Aleksandar

> > diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
> > index 4c7ec05..1152fda 100644
> > --- a/target/mips/msa_helper.c
> > +++ b/target/mips/msa_helper.c
> > @@ -804,28 +804,45 @@ void helper_msa_ ## func ## _df(CPUMIPSState
*env, uint32_t > df,         \
> >      wr_t *pwd = &(env->active_fpu.fpr[wd].wr);
  \
> >      wr_t *pws = &(env->active_fpu.fpr[ws].wr);
  \
> >      wr_t *pwt = &(env->active_fpu.fpr[wt].wr);
  \
> > -    uint32_t i;
 \
> >
  \
> >      switch (df) {
 \
> >      case DF_BYTE:
 \
> > -        for (i = 0; i < DF_ELEMENTS(DF_BYTE); i++) {
  \
> > -            pwd->b[i] = msa_ ## func ## _df(df, pws->b[i],
pwt->b[i]);  \
> > -        }
 \
> > +        pwd->b[0]  = msa_ ## func ## _df(df, pws->b[0], pwt->b[0]);
 \
> > +        pwd->b[1]  = msa_ ## func ## _df(df, pws->b[1], pwt->b[1]);
 \
> > +        pwd->b[2]  = msa_ ## func ## _df(df, pws->b[2], pwt->b[2]);
 \
> > +        pwd->b[3]  = msa_ ## func ## _df(df, pws->b[3], pwt->b[3]);
 \
> > +        pwd->b[4]  = msa_ ## func ## _df(df, pws->b[4], pwt->b[4]);
 \
> > +        pwd->b[5]  = msa_ ## func ## _df(df, pws->b[5], pwt->b[5]);
 \
> > +        pwd->b[6]  = msa_ ## func ## _df(df, pws->b[6], pwt->b[6]);
 \
> > +        pwd->b[7]  = msa_ ## func ## _df(df, pws->b[7], pwt->b[7]);
 \
> > +        pwd->b[8]  = msa_ ## func ## _df(df, pws->b[8], pwt->b[8]);
 \
> > +        pwd->b[9]  = msa_ ## func ## _df(df, pws->b[9], pwt->b[9]);
 \
> > +        pwd->b[10] = msa_ ## func ## _df(df, pws->b[10], pwt->b[10]);
 \
> > +        pwd->b[11] = msa_ ## func ## _df(df, pws->b[11], pwt->b[11]);
 \
> > +        pwd->b[12] = msa_ ## func ## _df(df, pws->b[12], pwt->b[12]);
 \
> > +        pwd->b[13] = msa_ ## func ## _df(df, pws->b[13], pwt->b[13]);
 \
> > +        pwd->b[14] = msa_ ## func ## _df(df, pws->b[14], pwt->b[14]);
 \
> > +        pwd->b[15] = msa_ ## func ## _df(df, pws->b[15], pwt->b[15]);
 \
> >          break;
  \
> >      case DF_HALF:
 \
> > -        for (i = 0; i < DF_ELEMENTS(DF_HALF); i++) {
  \
> > -            pwd->h[i] = msa_ ## func ## _df(df, pws->h[i],
pwt->h[i]);  \
> > -        }
 \
> > +        pwd->h[0] = msa_ ## func ## _df(df, pws->h[0], pwt->h[0]);
  \
> > +        pwd->h[1] = msa_ ## func ## _df(df, pws->h[1], pwt->h[1]);
  \
> > +        pwd->h[2] = msa_ ## func ## _df(df, pws->h[2], pwt->h[2]);
  \
> > +        pwd->h[3] = msa_ ## func ## _df(df, pws->h[3], pwt->h[3]);
  \
> > +        pwd->h[4] = msa_ ## func ## _df(df, pws->h[4], pwt->h[4]);
  \
> > +        pwd->h[5] = msa_ ## func ## _df(df, pws->h[5], pwt->h[5]);
  \
> > +        pwd->h[6] = msa_ ## func ## _df(df, pws->h[6], pwt->h[6]);
  \
> > +        pwd->h[7] = msa_ ## func ## _df(df, pws->h[7], pwt->h[7]);
  \
> >          break;
  \
> >      case DF_WORD:
 \
> > -        for (i = 0; i < DF_ELEMENTS(DF_WORD); i++) {
  \
> > -            pwd->w[i] = msa_ ## func ## _df(df, pws->w[i],
pwt->w[i]);  \
> > -        }
 \
> > +        pwd->w[0] = msa_ ## func ## _df(df, pws->w[0], pwt->w[0]);
  \
> > +        pwd->w[1] = msa_ ## func ## _df(df, pws->w[1], pwt->w[1]);
  \
> > +        pwd->w[2] = msa_ ## func ## _df(df, pws->w[2], pwt->w[2]);
  \
> > +        pwd->w[3] = msa_ ## func ## _df(df, pws->w[3], pwt->w[3]);
  \
> >          break;
  \
> >      case DF_DOUBLE:
 \
> > -        for (i = 0; i < DF_ELEMENTS(DF_DOUBLE); i++) {
  \
> > -            pwd->d[i] = msa_ ## func ## _df(df, pws->d[i],
pwt->d[i]);  \
> > -        }
 \
> > +        pwd->d[0] = msa_ ## func ## _df(df, pws->d[0], pwt->d[0]);
  \
> > +        pwd->d[1] = msa_ ## func ## _df(df, pws->d[1], pwt->d[1]);
  \
> >          break;
  \
> >      default:
  \
> >          assert(0);
  \
> > --
> > 2.7.4
> >
> >
Alex Bennée June 2, 2019, 1:22 p.m. UTC | #3
Mateja Marjanovic <mateja.marjanovic@rt-rk.com> writes:

> From: Mateja Marjanovic <Mateja.Marjanovic@rt-rk.com>
>
> Eliminate loops for better performance.

Have you done any measurements of the bellow loop unrolling? Because
this is something that maybe we can achieve and let the compiler make
the choice.

>
> Signed-off-by: Mateja Marjanovic <mateja.marjanovic@rt-rk.com>
> ---
>  target/mips/msa_helper.c | 43 ++++++++++++++++++++++++++++++-------------
>  1 file changed, 30 insertions(+), 13 deletions(-)
>
> diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
> index 4c7ec05..1152fda 100644
> --- a/target/mips/msa_helper.c
> +++ b/target/mips/msa_helper.c
> @@ -804,28 +804,45 @@ void helper_msa_ ## func ## _df(CPUMIPSState *env, uint32_t df,         \
>      wr_t *pwd = &(env->active_fpu.fpr[wd].wr);                          \
>      wr_t *pws = &(env->active_fpu.fpr[ws].wr);                          \
>      wr_t *pwt = &(env->active_fpu.fpr[wt].wr);
> \

If we can ensure alignment for the various vector registers then the
compiler always has the option of using host vectors (certainly for int
and logic operations).

> -    uint32_t i;                                                         \
>                                                                          \

>      switch (df) {                                                       \
>      case DF_BYTE:                                                       \
> -        for (i = 0; i < DF_ELEMENTS(DF_BYTE); i++) {                    \
> -            pwd->b[i] = msa_ ## func ## _df(df, pws->b[i], pwt->b[i]);  \
> -        }                                                               \
> +        pwd->b[0]  = msa_ ## func ## _df(df, pws->b[0], pwt->b[0]);     \
> +        pwd->b[1]  = msa_ ## func ## _df(df, pws->b[1], pwt->b[1]);     \
> +        pwd->b[2]  = msa_ ## func ## _df(df, pws->b[2], pwt->b[2]);     \
> +        pwd->b[3]  = msa_ ## func ## _df(df, pws->b[3], pwt->b[3]);     \
> +        pwd->b[4]  = msa_ ## func ## _df(df, pws->b[4], pwt->b[4]);     \
> +        pwd->b[5]  = msa_ ## func ## _df(df, pws->b[5], pwt->b[5]);     \
> +        pwd->b[6]  = msa_ ## func ## _df(df, pws->b[6], pwt->b[6]);     \
> +        pwd->b[7]  = msa_ ## func ## _df(df, pws->b[7], pwt->b[7]);     \
> +        pwd->b[8]  = msa_ ## func ## _df(df, pws->b[8], pwt->b[8]);     \
> +        pwd->b[9]  = msa_ ## func ## _df(df, pws->b[9], pwt->b[9]);     \
> +        pwd->b[10] = msa_ ## func ## _df(df, pws->b[10], pwt->b[10]);   \
> +        pwd->b[11] = msa_ ## func ## _df(df, pws->b[11], pwt->b[11]);   \
> +        pwd->b[12] = msa_ ## func ## _df(df, pws->b[12], pwt->b[12]);   \
> +        pwd->b[13] = msa_ ## func ## _df(df, pws->b[13], pwt->b[13]);   \
> +        pwd->b[14] = msa_ ## func ## _df(df, pws->b[14], pwt->b[14]);   \
> +        pwd->b[15] = msa_ ## func ## _df(df, pws->b[15], pwt->b[15]);   \
>          break;                                                          \
>      case DF_HALF:                                                       \
> -        for (i = 0; i < DF_ELEMENTS(DF_HALF); i++) {                    \
> -            pwd->h[i] = msa_ ## func ## _df(df, pws->h[i], pwt->h[i]);  \
> -        }                                                               \
> +        pwd->h[0] = msa_ ## func ## _df(df, pws->h[0], pwt->h[0]);      \
> +        pwd->h[1] = msa_ ## func ## _df(df, pws->h[1], pwt->h[1]);      \
> +        pwd->h[2] = msa_ ## func ## _df(df, pws->h[2], pwt->h[2]);      \
> +        pwd->h[3] = msa_ ## func ## _df(df, pws->h[3], pwt->h[3]);      \
> +        pwd->h[4] = msa_ ## func ## _df(df, pws->h[4], pwt->h[4]);      \
> +        pwd->h[5] = msa_ ## func ## _df(df, pws->h[5], pwt->h[5]);      \
> +        pwd->h[6] = msa_ ## func ## _df(df, pws->h[6], pwt->h[6]);      \
> +        pwd->h[7] = msa_ ## func ## _df(df, pws->h[7], pwt->h[7]);      \
>          break;                                                          \
>      case DF_WORD:                                                       \
> -        for (i = 0; i < DF_ELEMENTS(DF_WORD); i++) {                    \
> -            pwd->w[i] = msa_ ## func ## _df(df, pws->w[i], pwt->w[i]);  \
> -        }                                                               \
> +        pwd->w[0] = msa_ ## func ## _df(df, pws->w[0], pwt->w[0]);      \
> +        pwd->w[1] = msa_ ## func ## _df(df, pws->w[1], pwt->w[1]);      \
> +        pwd->w[2] = msa_ ## func ## _df(df, pws->w[2], pwt->w[2]);      \
> +        pwd->w[3] = msa_ ## func ## _df(df, pws->w[3], pwt->w[3]);      \
>          break;                                                          \
>      case DF_DOUBLE:                                                     \
> -        for (i = 0; i < DF_ELEMENTS(DF_DOUBLE); i++) {                  \
> -            pwd->d[i] = msa_ ## func ## _df(df, pws->d[i], pwt->d[i]);  \
> -        }                                                               \
> +        pwd->d[0] = msa_ ## func ## _df(df, pws->d[0], pwt->d[0]);      \
> +        pwd->d[1] = msa_ ## func ## _df(df, pws->d[1], pwt->d[1]);      \
>          break;                                                          \
>      default:                                                            \
>          assert(0);                                                      \


--
Alex Bennée
Mateja Marjanovic June 3, 2019, 9:46 a.m. UTC | #4
On 2.6.19. 09:06, Aleksandar Markovic wrote:
>
>
> On Jun 1, 2019 4:16 PM, "Aleksandar Markovic" <amarkovic@wavecomp.com 
> <mailto:amarkovic@wavecomp.com>> wrote:
> >
> > > From: Mateja Marjanovic <mateja.marjanovic@rt-rk.com 
> <mailto:mateja.marjanovic@rt-rk.com>>
> > > Sent: Monday, March 4, 2019 5:51 PM
> > > To: qemu-devel@nongnu.org <mailto:qemu-devel@nongnu.org>
> > > Cc: aurelien@aurel32.net <mailto:aurelien@aurel32.net>; Aleksandar 
> Markovic; Aleksandar Rikalo
> > > Subject: [PATCH 1/2] target/mips: Improve performance for MSA 
> binary operations
> > >
> > > From: Mateja Marjanovic <Mateja.Marjanovic@rt-rk.com 
> <mailto:Mateja.Marjanovic@rt-rk.com>>
> > >
> > > Eliminate loops for better performance.
> > >
> > > Signed-off-by: Mateja Marjanovic <mateja.marjanovic@rt-rk.com 
> <mailto:mateja.marjanovic@rt-rk.com>>
> > > ---
> > >  target/mips/msa_helper.c | 43 
> ++++++++++++++++++++++++++++++-------------
> > >  1 file changed, 30 insertions(+), 13 deletions(-)
> > >
> >
> > The commit message should be a little bit more informative - for 
> example,
> > it could list the affected instructions. Please consider other groups of
> > MSA instructions that are implemented via helpers that use similar "for"
> > loops. Otherwise:
> >
> > Reviewed-by: Aleksandar Markovic <amarkovic@wavecomp.com 
> <mailto:amarkovic@wavecomp.com>>
> >
>
> Mateja, you don't need to do anything regarding this patch, I am going 
> to fix the issues while appying.
>
Alright, thanks. :)

Regards,
Mateja
>
> Thanks, Aleksandar
>
> > > diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
> > > index 4c7ec05..1152fda 100644
> > > --- a/target/mips/msa_helper.c
> > > +++ b/target/mips/msa_helper.c
> > > @@ -804,28 +804,45 @@ void helper_msa_ ## func ## _df(CPUMIPSState 
> *env, uint32_t > df,         \
> > >      wr_t *pwd = &(env->active_fpu.fpr[wd].wr);                    
>       \
> > >      wr_t *pws = &(env->active_fpu.fpr[ws].wr);                    
>       \
> > >      wr_t *pwt = &(env->active_fpu.fpr[wt].wr);                    
>       \
> > > -    uint32_t i;                    \
> > >                   \
> > >      switch (df) {                    \
> > >      case DF_BYTE:                    \
> > > -        for (i = 0; i < DF_ELEMENTS(DF_BYTE); i++) {              
>       \
> > > -            pwd->b[i] = msa_ ## func ## _df(df, pws->b[i], 
> pwt->b[i]);  \
> > > -        }                    \
> > > +        pwd->b[0]  = msa_ ## func ## _df(df, pws->b[0], 
> pwt->b[0]);     \
> > > +        pwd->b[1]  = msa_ ## func ## _df(df, pws->b[1], 
> pwt->b[1]);     \
> > > +        pwd->b[2]  = msa_ ## func ## _df(df, pws->b[2], 
> pwt->b[2]);     \
> > > +        pwd->b[3]  = msa_ ## func ## _df(df, pws->b[3], 
> pwt->b[3]);     \
> > > +        pwd->b[4]  = msa_ ## func ## _df(df, pws->b[4], 
> pwt->b[4]);     \
> > > +        pwd->b[5]  = msa_ ## func ## _df(df, pws->b[5], 
> pwt->b[5]);     \
> > > +        pwd->b[6]  = msa_ ## func ## _df(df, pws->b[6], 
> pwt->b[6]);     \
> > > +        pwd->b[7]  = msa_ ## func ## _df(df, pws->b[7], 
> pwt->b[7]);     \
> > > +        pwd->b[8]  = msa_ ## func ## _df(df, pws->b[8], 
> pwt->b[8]);     \
> > > +        pwd->b[9]  = msa_ ## func ## _df(df, pws->b[9], 
> pwt->b[9]);     \
> > > +        pwd->b[10] = msa_ ## func ## _df(df, pws->b[10], 
> pwt->b[10]);   \
> > > +        pwd->b[11] = msa_ ## func ## _df(df, pws->b[11], 
> pwt->b[11]);   \
> > > +        pwd->b[12] = msa_ ## func ## _df(df, pws->b[12], 
> pwt->b[12]);   \
> > > +        pwd->b[13] = msa_ ## func ## _df(df, pws->b[13], 
> pwt->b[13]);   \
> > > +        pwd->b[14] = msa_ ## func ## _df(df, pws->b[14], 
> pwt->b[14]);   \
> > > +        pwd->b[15] = msa_ ## func ## _df(df, pws->b[15], 
> pwt->b[15]);   \
> > >          break;                   \
> > >      case DF_HALF:                    \
> > > -        for (i = 0; i < DF_ELEMENTS(DF_HALF); i++) {              
>       \
> > > -            pwd->h[i] = msa_ ## func ## _df(df, pws->h[i], 
> pwt->h[i]);  \
> > > -        }                    \
> > > +        pwd->h[0] = msa_ ## func ## _df(df, pws->h[0], 
> pwt->h[0]);      \
> > > +        pwd->h[1] = msa_ ## func ## _df(df, pws->h[1], 
> pwt->h[1]);      \
> > > +        pwd->h[2] = msa_ ## func ## _df(df, pws->h[2], 
> pwt->h[2]);      \
> > > +        pwd->h[3] = msa_ ## func ## _df(df, pws->h[3], 
> pwt->h[3]);      \
> > > +        pwd->h[4] = msa_ ## func ## _df(df, pws->h[4], 
> pwt->h[4]);      \
> > > +        pwd->h[5] = msa_ ## func ## _df(df, pws->h[5], 
> pwt->h[5]);      \
> > > +        pwd->h[6] = msa_ ## func ## _df(df, pws->h[6], 
> pwt->h[6]);      \
> > > +        pwd->h[7] = msa_ ## func ## _df(df, pws->h[7], 
> pwt->h[7]);      \
> > >          break;                   \
> > >      case DF_WORD:                    \
> > > -        for (i = 0; i < DF_ELEMENTS(DF_WORD); i++) {              
>       \
> > > -            pwd->w[i] = msa_ ## func ## _df(df, pws->w[i], 
> pwt->w[i]);  \
> > > -        }                    \
> > > +        pwd->w[0] = msa_ ## func ## _df(df, pws->w[0], 
> pwt->w[0]);      \
> > > +        pwd->w[1] = msa_ ## func ## _df(df, pws->w[1], 
> pwt->w[1]);      \
> > > +        pwd->w[2] = msa_ ## func ## _df(df, pws->w[2], 
> pwt->w[2]);      \
> > > +        pwd->w[3] = msa_ ## func ## _df(df, pws->w[3], 
> pwt->w[3]);      \
> > >          break;                   \
> > >      case DF_DOUBLE:                    \
> > > -        for (i = 0; i < DF_ELEMENTS(DF_DOUBLE); i++) {            
>       \
> > > -            pwd->d[i] = msa_ ## func ## _df(df, pws->d[i], 
> pwt->d[i]);  \
> > > -        }                    \
> > > +        pwd->d[0] = msa_ ## func ## _df(df, pws->d[0], 
> pwt->d[0]);      \
> > > +        pwd->d[1] = msa_ ## func ## _df(df, pws->d[1], 
> pwt->d[1]);      \
> > >          break;                   \
> > >      default:                   \
> > >          assert(0);                   \
> > > --
> > > 2.7.4
> > >
> > >
>
Aleksandar Markovic June 3, 2019, 1:10 p.m. UTC | #5
> From: Alex Bennée <alex.bennee@linaro.org>
> Sent: Sunday, June 2, 2019 3:22 PM
> To: qemu-devel@nongnu.org
> Cc: Aleksandar Rikalo; Aleksandar Markovic; aurelien@aurel32.net
> Subject: Re: [Qemu-devel] [PATCH 1/2] target/mips: Improve performance for MSA binary operations


> Mateja Marjanovic <mateja.marjanovic@rt-rk.com> writes:

> > From: Mateja Marjanovic <Mateja.Marjanovic@rt-rk.com>
> >
> > Eliminate loops for better performance.

> Have you done any measurements of the bellow loop unrolling? Because
> this is something that maybe we can achieve and let the compiler make
> the choice.

I know that Mateja did extensive performance measurements, and I am
asking him to give us some samples.

As for code generation, here are disassemblies of function
helper_msa_add_a_df() before and after this patch:

(it is visible the compiler did not perform unrolling loops by itself)

BEFORE:

Dump of assembler code for function helper_msa_add_a_df:
   0x00000000001500b0 <+0>:	cmp    $0x1,%esi
   0x00000000001500b3 <+3>:	je     0x150258 <helper_msa_add_a_df+424>
   0x00000000001500b9 <+9>:	jb     0x1501e8 <helper_msa_add_a_df+312>
   0x00000000001500bf <+15>:	cmp    $0x2,%esi
   0x00000000001500c2 <+18>:	je     0x150180 <helper_msa_add_a_df+208>
   0x00000000001500c8 <+24>:	cmp    $0x3,%esi
   0x00000000001500cb <+27>:	jne    0x1502c2 <helper_msa_add_a_df+530>
   0x00000000001500d1 <+33>:	mov    %ecx,%ecx
   0x00000000001500d3 <+35>:	mov    %edx,%edx
   0x00000000001500d5 <+37>:	lea    0x22(%rcx),%rax
   0x00000000001500d9 <+41>:	lea    0x22(%rdx),%r10
   0x00000000001500dd <+45>:	shl    $0x4,%rcx
   0x00000000001500e1 <+49>:	add    %rdi,%rcx
   0x00000000001500e4 <+52>:	shl    $0x4,%rdx
   0x00000000001500e8 <+56>:	shl    $0x4,%rax
   0x00000000001500ec <+60>:	shl    $0x4,%r10
   0x00000000001500f0 <+64>:	add    %rdi,%rax
   0x00000000001500f3 <+67>:	mov    0x8(%rax),%r9
   0x00000000001500f7 <+71>:	mov    0x8(%rax),%rsi
   0x00000000001500fb <+75>:	mov    %r8d,%eax
   0x00000000001500fe <+78>:	sar    $0x3f,%r9
   0x0000000000150102 <+82>:	xor    %r9,%rsi
   0x0000000000150105 <+85>:	sub    %r9,%rsi
   0x0000000000150108 <+88>:	mov    %rsi,%r9
   0x000000000015010b <+91>:	lea    0x22(%rax),%rsi
   0x000000000015010f <+95>:	shl    $0x4,%rax
   0x0000000000150113 <+99>:	lea    (%rdi,%rax,1),%rax
   0x0000000000150117 <+103>:	shl    $0x4,%rsi
   0x000000000015011b <+107>:	add    %rdi,%rsi
   0x000000000015011e <+110>:	mov    0x8(%rsi),%r8
   0x0000000000150122 <+114>:	mov    0x8(%rsi),%r11
   0x0000000000150126 <+118>:	sar    $0x3f,%r8
   0x000000000015012a <+122>:	xor    %r8,%r11
   0x000000000015012d <+125>:	mov    %r11,%rsi
   0x0000000000150130 <+128>:	sub    %r8,%rsi
   0x0000000000150133 <+131>:	add    %r9,%rsi
   0x0000000000150136 <+134>:	mov    %rsi,0x8(%rdi,%r10,1)
   0x000000000015013b <+139>:	mov    0x230(%rcx),%rsi
   0x0000000000150142 <+146>:	mov    0x230(%rcx),%r8
   0x0000000000150149 <+153>:	sar    $0x3f,%rsi
   0x000000000015014d <+157>:	xor    %rsi,%r8
   0x0000000000150150 <+160>:	mov    %r8,%rcx
   0x0000000000150153 <+163>:	mov    0x230(%rax),%r8
   0x000000000015015a <+170>:	sub    %rsi,%rcx
   0x000000000015015d <+173>:	mov    0x230(%rax),%rsi
   0x0000000000150164 <+180>:	sar    $0x3f,%rsi
   0x0000000000150168 <+184>:	xor    %rsi,%r8
   0x000000000015016b <+187>:	mov    %r8,%rax
   0x000000000015016e <+190>:	sub    %rsi,%rax
   0x0000000000150171 <+193>:	add    %rcx,%rax
   0x0000000000150174 <+196>:	mov    %rax,0x230(%rdi,%rdx,1)
   0x000000000015017c <+204>:	retq   
   0x000000000015017d <+205>:	nopl   (%rax)
   0x0000000000150180 <+208>:	mov    %r8d,%r8d
   0x0000000000150183 <+211>:	mov    %ecx,%ecx
   0x0000000000150185 <+213>:	mov    %edx,%edx
   0x0000000000150187 <+215>:	mov    %r8,%rax
   0x000000000015018a <+218>:	neg    %r8
   0x000000000015018d <+221>:	shl    $0x4,%rcx
   0x0000000000150191 <+225>:	shl    $0x4,%rax
   0x0000000000150195 <+229>:	shl    $0x4,%r8
   0x0000000000150199 <+233>:	shl    $0x4,%rdx
   0x000000000015019d <+237>:	lea    0x228(%rdi,%rax,1),%r9
   0x00000000001501a5 <+245>:	lea    0x238(%rdi,%rax,1),%rdi
   0x00000000001501ad <+253>:	lea    (%r9,%r8,1),%r10
   0x00000000001501b1 <+257>:	add    $0x4,%r9
   0x00000000001501b5 <+261>:	movslq (%r10,%rcx,1),%rax
   0x00000000001501b9 <+265>:	mov    %rax,%rsi
   0x00000000001501bc <+268>:	sar    $0x3f,%rsi
   0x00000000001501c0 <+272>:	xor    %rsi,%rax
   0x00000000001501c3 <+275>:	sub    %rsi,%rax
   0x00000000001501c6 <+278>:	movslq -0x4(%r9),%rsi
   0x00000000001501ca <+282>:	mov    %rsi,%r11
   0x00000000001501cd <+285>:	sar    $0x3f,%r11
   0x00000000001501d1 <+289>:	xor    %r11,%rsi
   0x00000000001501d4 <+292>:	sub    %r11,%rsi
   0x00000000001501d7 <+295>:	add    %rsi,%rax
   0x00000000001501da <+298>:	cmp    %rdi,%r9
   0x00000000001501dd <+301>:	mov    %eax,(%r10,%rdx,1)
   0x00000000001501e1 <+305>:	jne    0x1501ad <helper_msa_add_a_df+253>
   0x00000000001501e3 <+307>:	repz retq 
   0x00000000001501e5 <+309>:	nopl   (%rax)
   0x00000000001501e8 <+312>:	mov    %r8d,%r8d
   0x00000000001501eb <+315>:	mov    %ecx,%ecx
   0x00000000001501ed <+317>:	mov    %edx,%edx
   0x00000000001501ef <+319>:	mov    %r8,%rax
   0x00000000001501f2 <+322>:	neg    %r8
   0x00000000001501f5 <+325>:	shl    $0x4,%rcx
   0x00000000001501f9 <+329>:	shl    $0x4,%rax
   0x00000000001501fd <+333>:	shl    $0x4,%r8
   0x0000000000150201 <+337>:	shl    $0x4,%rdx
   0x0000000000150205 <+341>:	lea    0x228(%rdi,%rax,1),%r9
   0x000000000015020d <+349>:	lea    0x238(%rdi,%rax,1),%r11
   0x0000000000150215 <+357>:	nopl   (%rax)
   0x0000000000150218 <+360>:	lea    (%r8,%r9,1),%rdi
   0x000000000015021c <+364>:	add    $0x1,%r9
   0x0000000000150220 <+368>:	movsbq (%rdi,%rcx,1),%rax
   0x0000000000150225 <+373>:	mov    %rax,%rsi
   0x0000000000150228 <+376>:	sar    $0x3f,%rsi
   0x000000000015022c <+380>:	xor    %rsi,%rax
   0x000000000015022f <+383>:	sub    %rsi,%rax
   0x0000000000150232 <+386>:	movsbq -0x1(%r9),%rsi
   0x0000000000150237 <+391>:	mov    %rsi,%r10
   0x000000000015023a <+394>:	sar    $0x3f,%r10
   0x000000000015023e <+398>:	xor    %r10,%rsi
   0x0000000000150241 <+401>:	sub    %r10,%rsi
   0x0000000000150244 <+404>:	add    %rsi,%rax
   0x0000000000150247 <+407>:	cmp    %r9,%r11
   0x000000000015024a <+410>:	mov    %al,(%rdi,%rdx,1)
   0x000000000015024d <+413>:	jne    0x150218 <helper_msa_add_a_df+360>
   0x000000000015024f <+415>:	repz retq 
   0x0000000000150251 <+417>:	nopl   0x0(%rax)
   0x0000000000150258 <+424>:	mov    %r8d,%r8d
   0x000000000015025b <+427>:	mov    %ecx,%ecx
   0x000000000015025d <+429>:	mov    %edx,%edx
   0x000000000015025f <+431>:	mov    %r8,%rax
   0x0000000000150262 <+434>:	neg    %r8
   0x0000000000150265 <+437>:	shl    $0x4,%rcx
   0x0000000000150269 <+441>:	shl    $0x4,%rax
   0x000000000015026d <+445>:	shl    $0x4,%r8
   0x0000000000150271 <+449>:	shl    $0x4,%rdx
   0x0000000000150275 <+453>:	lea    0x228(%rdi,%rax,1),%r9
   0x000000000015027d <+461>:	lea    0x238(%rdi,%rax,1),%r10
   0x0000000000150285 <+469>:	nopl   (%rax)
   0x0000000000150288 <+472>:	lea    (%r8,%r9,1),%rdi
   0x000000000015028c <+476>:	add    $0x2,%r9
   0x0000000000150290 <+480>:	movswq (%rdi,%rcx,1),%rax
   0x0000000000150295 <+485>:	mov    %rax,%rsi
   0x0000000000150298 <+488>:	sar    $0x3f,%rsi
   0x000000000015029c <+492>:	xor    %rsi,%rax
   0x000000000015029f <+495>:	sub    %rsi,%rax
   0x00000000001502a2 <+498>:	movswq -0x2(%r9),%rsi
   0x00000000001502a7 <+503>:	mov    %rsi,%r11
   0x00000000001502aa <+506>:	sar    $0x3f,%r11
   0x00000000001502ae <+510>:	xor    %r11,%rsi
   0x00000000001502b1 <+513>:	sub    %r11,%rsi
   0x00000000001502b4 <+516>:	add    %rsi,%rax
   0x00000000001502b7 <+519>:	cmp    %r10,%r9
   0x00000000001502ba <+522>:	mov    %ax,(%rdi,%rdx,1)
   0x00000000001502be <+526>:	jne    0x150288 <helper_msa_add_a_df+472>
   0x00000000001502c0 <+528>:	repz retq 
   0x00000000001502c2 <+530>:	lea    0x13c3b7(%rip),%rcx        # 0x28c680 <__PRETTY_FUNCTION__.26062>
   0x00000000001502c9 <+537>:	lea    0x13b830(%rip),%rsi        # 0x28bb00
   0x00000000001502d0 <+544>:	lea    0x1c7204(%rip),%rdi        # 0x3174db
   0x00000000001502d7 <+551>:	sub    $0x8,%rsp
   0x00000000001502db <+555>:	mov    $0x357,%edx
   0x00000000001502e0 <+560>:	callq  0x8eeb8
End of assembler dump.


AFTER:

   0x00000000001548d0 <+0>:	cmp    $0x1,%esi
   0x00000000001548d3 <+3>:	je     0x154e00 <helper_msa_add_a_df+1328>
   0x00000000001548d9 <+9>:	jb     0x154a98 <helper_msa_add_a_df+456>
   0x00000000001548df <+15>:	cmp    $0x2,%esi
   0x00000000001548e2 <+18>:	je     0x1549a0 <helper_msa_add_a_df+208>
   0x00000000001548e8 <+24>:	cmp    $0x3,%esi
   0x00000000001548eb <+27>:	jne    0x154fd1 <helper_msa_add_a_df+1793>
   0x00000000001548f1 <+33>:	mov    %ecx,%eax
   0x00000000001548f3 <+35>:	mov    %r8d,%r8d
   0x00000000001548f6 <+38>:	mov    %edx,%edx
   0x00000000001548f8 <+40>:	lea    0x22(%rax),%rcx
   0x00000000001548fc <+44>:	lea    0x22(%rdx),%r9
   0x0000000000154900 <+48>:	shl    $0x4,%rax
   0x0000000000154904 <+52>:	add    %rdi,%rax
   0x0000000000154907 <+55>:	shl    $0x4,%rdx
   0x000000000015490b <+59>:	shl    $0x4,%rcx
   0x000000000015490f <+63>:	shl    $0x4,%r9
   0x0000000000154913 <+67>:	add    %rdi,%rcx
   0x0000000000154916 <+70>:	mov    0x8(%rcx),%rsi
   0x000000000015491a <+74>:	mov    0x8(%rcx),%r11
   0x000000000015491e <+78>:	sar    $0x3f,%rsi
   0x0000000000154922 <+82>:	xor    %rsi,%r11
   0x0000000000154925 <+85>:	mov    %r11,%rcx
   0x0000000000154928 <+88>:	sub    %rsi,%rcx
   0x000000000015492b <+91>:	mov    %rcx,%rsi
   0x000000000015492e <+94>:	lea    0x22(%r8),%rcx
   0x0000000000154932 <+98>:	shl    $0x4,%r8
   0x0000000000154936 <+102>:	add    %rdi,%r8
   0x0000000000154939 <+105>:	shl    $0x4,%rcx
   0x000000000015493d <+109>:	add    %rdi,%rcx
   0x0000000000154940 <+112>:	mov    0x8(%rcx),%r10
   0x0000000000154944 <+116>:	mov    0x8(%rcx),%r11
   0x0000000000154948 <+120>:	sar    $0x3f,%r10
   0x000000000015494c <+124>:	xor    %r10,%r11
   0x000000000015494f <+127>:	mov    %r11,%rcx
   0x0000000000154952 <+130>:	sub    %r10,%rcx
   0x0000000000154955 <+133>:	add    %rsi,%rcx
   0x0000000000154958 <+136>:	mov    %rcx,0x8(%rdi,%r9,1)
   0x000000000015495d <+141>:	mov    0x230(%rax),%rcx
   0x0000000000154964 <+148>:	mov    0x230(%rax),%rsi
   0x000000000015496b <+155>:	sar    $0x3f,%rcx
   0x000000000015496f <+159>:	xor    %rcx,%rsi
   0x0000000000154972 <+162>:	mov    %rsi,%rax
   0x0000000000154975 <+165>:	mov    0x230(%r8),%rsi
   0x000000000015497c <+172>:	sub    %rcx,%rax
   0x000000000015497f <+175>:	mov    %rax,%rcx
   0x0000000000154982 <+178>:	mov    0x230(%r8),%rax
   0x0000000000154989 <+185>:	sar    $0x3f,%rsi
   0x000000000015498d <+189>:	xor    %rsi,%rax
   0x0000000000154990 <+192>:	sub    %rsi,%rax
   0x0000000000154993 <+195>:	add    %rcx,%rax
   0x0000000000154996 <+198>:	mov    %rax,0x230(%rdi,%rdx,1)
   0x000000000015499e <+206>:	retq   
   0x000000000015499f <+207>:	nop
   0x00000000001549a0 <+208>:	mov    %ecx,%ecx
   0x00000000001549a2 <+210>:	mov    %r8d,%r8d
   0x00000000001549a5 <+213>:	mov    %edx,%edx
   0x00000000001549a7 <+215>:	lea    0x22(%rcx),%rax
   0x00000000001549ab <+219>:	lea    0x22(%rdx),%r9
   0x00000000001549af <+223>:	shl    $0x4,%rcx
   0x00000000001549b3 <+227>:	add    %rdi,%rcx
   0x00000000001549b6 <+230>:	shl    $0x4,%rdx
   0x00000000001549ba <+234>:	shl    $0x4,%rax
   0x00000000001549be <+238>:	shl    $0x4,%r9
   0x00000000001549c2 <+242>:	add    %rdi,%rdx
   0x00000000001549c5 <+245>:	movslq 0x8(%rdi,%rax,1),%rax
   0x00000000001549ca <+250>:	mov    %rax,%rsi
   0x00000000001549cd <+253>:	sar    $0x3f,%rsi
   0x00000000001549d1 <+257>:	xor    %rsi,%rax
   0x00000000001549d4 <+260>:	sub    %rsi,%rax
   0x00000000001549d7 <+263>:	lea    0x22(%r8),%rsi
   0x00000000001549db <+267>:	shl    $0x4,%r8
   0x00000000001549df <+271>:	shl    $0x4,%rsi
   0x00000000001549e3 <+275>:	movslq 0x8(%rdi,%rsi,1),%rsi
   0x00000000001549e8 <+280>:	mov    %rsi,%r10
   0x00000000001549eb <+283>:	sar    $0x3f,%r10
   0x00000000001549ef <+287>:	xor    %r10,%rsi
   0x00000000001549f2 <+290>:	sub    %r10,%rsi
   0x00000000001549f5 <+293>:	add    %rsi,%rax
   0x00000000001549f8 <+296>:	mov    %eax,0x8(%rdi,%r9,1)
   0x00000000001549fd <+301>:	movslq 0x22c(%rcx),%rax
   0x0000000000154a04 <+308>:	add    %r8,%rdi
   0x0000000000154a07 <+311>:	mov    %rax,%rsi
   0x0000000000154a0a <+314>:	sar    $0x3f,%rsi
   0x0000000000154a0e <+318>:	xor    %rsi,%rax
   0x0000000000154a11 <+321>:	sub    %rsi,%rax
   0x0000000000154a14 <+324>:	movslq 0x22c(%rdi),%rsi
   0x0000000000154a1b <+331>:	mov    %rsi,%r8
   0x0000000000154a1e <+334>:	sar    $0x3f,%r8
   0x0000000000154a22 <+338>:	xor    %r8,%rsi
   0x0000000000154a25 <+341>:	sub    %r8,%rsi
   0x0000000000154a28 <+344>:	add    %rsi,%rax
   0x0000000000154a2b <+347>:	mov    %eax,0x22c(%rdx)
   0x0000000000154a31 <+353>:	movslq 0x230(%rcx),%rax
   0x0000000000154a38 <+360>:	mov    %rax,%rsi
   0x0000000000154a3b <+363>:	sar    $0x3f,%rsi
   0x0000000000154a3f <+367>:	xor    %rsi,%rax
   0x0000000000154a42 <+370>:	sub    %rsi,%rax
   0x0000000000154a45 <+373>:	movslq 0x230(%rdi),%rsi
   0x0000000000154a4c <+380>:	mov    %rsi,%r8
   0x0000000000154a4f <+383>:	sar    $0x3f,%r8
   0x0000000000154a53 <+387>:	xor    %r8,%rsi
   0x0000000000154a56 <+390>:	sub    %r8,%rsi
   0x0000000000154a59 <+393>:	add    %rsi,%rax
   0x0000000000154a5c <+396>:	mov    %eax,0x230(%rdx)
   0x0000000000154a62 <+402>:	movslq 0x234(%rcx),%rax
   0x0000000000154a69 <+409>:	mov    %rax,%rcx
   0x0000000000154a6c <+412>:	sar    $0x3f,%rcx
   0x0000000000154a70 <+416>:	xor    %rcx,%rax
   0x0000000000154a73 <+419>:	sub    %rcx,%rax
   0x0000000000154a76 <+422>:	movslq 0x234(%rdi),%rcx
   0x0000000000154a7d <+429>:	mov    %rcx,%rsi
   0x0000000000154a80 <+432>:	sar    $0x3f,%rsi
   0x0000000000154a84 <+436>:	xor    %rsi,%rcx
   0x0000000000154a87 <+439>:	sub    %rsi,%rcx
   0x0000000000154a8a <+442>:	add    %rcx,%rax
   0x0000000000154a8d <+445>:	mov    %eax,0x234(%rdx)
   0x0000000000154a93 <+451>:	retq   
   0x0000000000154a94 <+452>:	nopl   0x0(%rax)
   0x0000000000154a98 <+456>:	mov    %ecx,%eax
   0x0000000000154a9a <+458>:	mov    %r8d,%r8d
   0x0000000000154a9d <+461>:	mov    %edx,%edx
   0x0000000000154a9f <+463>:	lea    0x22(%rax),%rcx
   0x0000000000154aa3 <+467>:	lea    0x22(%rdx),%r9
   0x0000000000154aa7 <+471>:	shl    $0x4,%rax
   0x0000000000154aab <+475>:	lea    (%rdi,%rax,1),%rax
   0x0000000000154aaf <+479>:	shl    $0x4,%rdx
   0x0000000000154ab3 <+483>:	shl    $0x4,%rcx
   0x0000000000154ab7 <+487>:	shl    $0x4,%r9
   0x0000000000154abb <+491>:	add    %rdi,%rdx
   0x0000000000154abe <+494>:	movsbq 0x8(%rdi,%rcx,1),%rsi
   0x0000000000154ac4 <+500>:	mov    %rsi,%rcx
   0x0000000000154ac7 <+503>:	sar    $0x3f,%rcx
   0x0000000000154acb <+507>:	xor    %rcx,%rsi
   0x0000000000154ace <+510>:	sub    %rcx,%rsi
   0x0000000000154ad1 <+513>:	lea    0x22(%r8),%rcx
   0x0000000000154ad5 <+517>:	shl    $0x4,%r8
   0x0000000000154ad9 <+521>:	shl    $0x4,%rcx
   0x0000000000154add <+525>:	movsbq 0x8(%rdi,%rcx,1),%rcx
   0x0000000000154ae3 <+531>:	mov    %rcx,%r10
   0x0000000000154ae6 <+534>:	sar    $0x3f,%r10
   0x0000000000154aea <+538>:	xor    %r10,%rcx
   0x0000000000154aed <+541>:	sub    %r10,%rcx
   0x0000000000154af0 <+544>:	add    %rcx,%rsi
   0x0000000000154af3 <+547>:	mov    %sil,0x8(%rdi,%r9,1)
   0x0000000000154af8 <+552>:	movsbq 0x229(%rax),%rcx
   0x0000000000154b00 <+560>:	add    %r8,%rdi
   0x0000000000154b03 <+563>:	mov    %rcx,%rsi
   0x0000000000154b06 <+566>:	sar    $0x3f,%rsi
   0x0000000000154b0a <+570>:	xor    %rsi,%rcx
   0x0000000000154b0d <+573>:	sub    %rsi,%rcx
   0x0000000000154b10 <+576>:	movsbq 0x229(%rdi),%rsi
   0x0000000000154b18 <+584>:	mov    %rsi,%r8
   0x0000000000154b1b <+587>:	sar    $0x3f,%r8
   0x0000000000154b1f <+591>:	xor    %r8,%rsi
   0x0000000000154b22 <+594>:	sub    %r8,%rsi
   0x0000000000154b25 <+597>:	add    %rsi,%rcx
   0x0000000000154b28 <+600>:	mov    %cl,0x229(%rdx)
   0x0000000000154b2e <+606>:	movsbq 0x22a(%rax),%rcx
   0x0000000000154b36 <+614>:	mov    %rcx,%rsi
   0x0000000000154b39 <+617>:	sar    $0x3f,%rsi
   0x0000000000154b3d <+621>:	xor    %rsi,%rcx
   0x0000000000154b40 <+624>:	sub    %rsi,%rcx
   0x0000000000154b43 <+627>:	movsbq 0x22a(%rdi),%rsi
   0x0000000000154b4b <+635>:	mov    %rsi,%r8
   0x0000000000154b4e <+638>:	sar    $0x3f,%r8
   0x0000000000154b52 <+642>:	xor    %r8,%rsi
   0x0000000000154b55 <+645>:	sub    %r8,%rsi
   0x0000000000154b58 <+648>:	add    %rsi,%rcx
   0x0000000000154b5b <+651>:	mov    %cl,0x22a(%rdx)
   0x0000000000154b61 <+657>:	movsbq 0x22b(%rax),%rcx
   0x0000000000154b69 <+665>:	mov    %rcx,%rsi
   0x0000000000154b6c <+668>:	sar    $0x3f,%rsi
   0x0000000000154b70 <+672>:	xor    %rsi,%rcx
   0x0000000000154b73 <+675>:	sub    %rsi,%rcx
   0x0000000000154b76 <+678>:	movsbq 0x22b(%rdi),%rsi
   0x0000000000154b7e <+686>:	mov    %rsi,%r8
   0x0000000000154b81 <+689>:	sar    $0x3f,%r8
   0x0000000000154b85 <+693>:	xor    %r8,%rsi
   0x0000000000154b88 <+696>:	sub    %r8,%rsi
   0x0000000000154b8b <+699>:	add    %rsi,%rcx
   0x0000000000154b8e <+702>:	mov    %cl,0x22b(%rdx)
   0x0000000000154b94 <+708>:	movsbq 0x22c(%rax),%rcx
   0x0000000000154b9c <+716>:	mov    %rcx,%rsi
   0x0000000000154b9f <+719>:	sar    $0x3f,%rsi
   0x0000000000154ba3 <+723>:	xor    %rsi,%rcx
   0x0000000000154ba6 <+726>:	sub    %rsi,%rcx
   0x0000000000154ba9 <+729>:	movsbq 0x22c(%rdi),%rsi
   0x0000000000154bb1 <+737>:	mov    %rsi,%r8
   0x0000000000154bb4 <+740>:	sar    $0x3f,%r8
   0x0000000000154bb8 <+744>:	xor    %r8,%rsi
   0x0000000000154bbb <+747>:	sub    %r8,%rsi
   0x0000000000154bbe <+750>:	add    %rsi,%rcx
   0x0000000000154bc1 <+753>:	mov    %cl,0x22c(%rdx)
   0x0000000000154bc7 <+759>:	movsbq 0x22d(%rax),%rcx
   0x0000000000154bcf <+767>:	mov    %rcx,%rsi
   0x0000000000154bd2 <+770>:	sar    $0x3f,%rsi
   0x0000000000154bd6 <+774>:	xor    %rsi,%rcx
   0x0000000000154bd9 <+777>:	sub    %rsi,%rcx
   0x0000000000154bdc <+780>:	movsbq 0x22d(%rdi),%rsi
   0x0000000000154be4 <+788>:	mov    %rsi,%r8
   0x0000000000154be7 <+791>:	sar    $0x3f,%r8
   0x0000000000154beb <+795>:	xor    %r8,%rsi
   0x0000000000154bee <+798>:	sub    %r8,%rsi
   0x0000000000154bf1 <+801>:	add    %rsi,%rcx
   0x0000000000154bf4 <+804>:	mov    %cl,0x22d(%rdx)
   0x0000000000154bfa <+810>:	movsbq 0x22e(%rax),%rcx
   0x0000000000154c02 <+818>:	mov    %rcx,%rsi
   0x0000000000154c05 <+821>:	sar    $0x3f,%rsi
   0x0000000000154c09 <+825>:	xor    %rsi,%rcx
   0x0000000000154c0c <+828>:	sub    %rsi,%rcx
   0x0000000000154c0f <+831>:	movsbq 0x22e(%rdi),%rsi
   0x0000000000154c17 <+839>:	mov    %rsi,%r8
   0x0000000000154c1a <+842>:	sar    $0x3f,%r8
   0x0000000000154c1e <+846>:	xor    %r8,%rsi
   0x0000000000154c21 <+849>:	sub    %r8,%rsi
   0x0000000000154c24 <+852>:	add    %rsi,%rcx
   0x0000000000154c27 <+855>:	mov    %cl,0x22e(%rdx)
   0x0000000000154c2d <+861>:	movsbq 0x22f(%rax),%rcx
   0x0000000000154c35 <+869>:	mov    %rcx,%rsi
   0x0000000000154c38 <+872>:	sar    $0x3f,%rsi
   0x0000000000154c3c <+876>:	xor    %rsi,%rcx
   0x0000000000154c3f <+879>:	sub    %rsi,%rcx
   0x0000000000154c42 <+882>:	movsbq 0x22f(%rdi),%rsi
   0x0000000000154c4a <+890>:	mov    %rsi,%r8
   0x0000000000154c4d <+893>:	sar    $0x3f,%r8
   0x0000000000154c51 <+897>:	xor    %r8,%rsi
   0x0000000000154c54 <+900>:	sub    %r8,%rsi
   0x0000000000154c57 <+903>:	add    %rsi,%rcx
   0x0000000000154c5a <+906>:	mov    %cl,0x22f(%rdx)
   0x0000000000154c60 <+912>:	movsbq 0x230(%rax),%rcx
   0x0000000000154c68 <+920>:	mov    %rcx,%rsi
   0x0000000000154c6b <+923>:	sar    $0x3f,%rsi
   0x0000000000154c6f <+927>:	xor    %rsi,%rcx
   0x0000000000154c72 <+930>:	sub    %rsi,%rcx
   0x0000000000154c75 <+933>:	movsbq 0x230(%rdi),%rsi
   0x0000000000154c7d <+941>:	mov    %rsi,%r8
   0x0000000000154c80 <+944>:	sar    $0x3f,%r8
   0x0000000000154c84 <+948>:	xor    %r8,%rsi
   0x0000000000154c87 <+951>:	sub    %r8,%rsi
   0x0000000000154c8a <+954>:	add    %rsi,%rcx
   0x0000000000154c8d <+957>:	mov    %cl,0x230(%rdx)
   0x0000000000154c93 <+963>:	movsbq 0x231(%rax),%rcx
   0x0000000000154c9b <+971>:	mov    %rcx,%rsi
   0x0000000000154c9e <+974>:	sar    $0x3f,%rsi
   0x0000000000154ca2 <+978>:	xor    %rsi,%rcx
   0x0000000000154ca5 <+981>:	sub    %rsi,%rcx
   0x0000000000154ca8 <+984>:	movsbq 0x231(%rdi),%rsi
   0x0000000000154cb0 <+992>:	mov    %rsi,%r8
   0x0000000000154cb3 <+995>:	sar    $0x3f,%r8
   0x0000000000154cb7 <+999>:	xor    %r8,%rsi
   0x0000000000154cba <+1002>:	sub    %r8,%rsi
   0x0000000000154cbd <+1005>:	add    %rsi,%rcx
   0x0000000000154cc0 <+1008>:	mov    %cl,0x231(%rdx)
   0x0000000000154cc6 <+1014>:	movsbq 0x232(%rax),%rcx
   0x0000000000154cce <+1022>:	mov    %rcx,%rsi
   0x0000000000154cd1 <+1025>:	sar    $0x3f,%rsi
   0x0000000000154cd5 <+1029>:	xor    %rsi,%rcx
   0x0000000000154cd8 <+1032>:	sub    %rsi,%rcx
   0x0000000000154cdb <+1035>:	movsbq 0x232(%rdi),%rsi
   0x0000000000154ce3 <+1043>:	mov    %rsi,%r8
   0x0000000000154ce6 <+1046>:	sar    $0x3f,%r8
   0x0000000000154cea <+1050>:	xor    %r8,%rsi
   0x0000000000154ced <+1053>:	sub    %r8,%rsi
   0x0000000000154cf0 <+1056>:	add    %rsi,%rcx
   0x0000000000154cf3 <+1059>:	mov    %cl,0x232(%rdx)
   0x0000000000154cf9 <+1065>:	movsbq 0x233(%rdi),%rcx
   0x0000000000154d01 <+1073>:	mov    %rcx,%rsi
   0x0000000000154d04 <+1076>:	sar    $0x3f,%rsi
   0x0000000000154d08 <+1080>:	xor    %rsi,%rcx
   0x0000000000154d0b <+1083>:	sub    %rsi,%rcx
   0x0000000000154d0e <+1086>:	movsbq 0x233(%rax),%rsi
   0x0000000000154d16 <+1094>:	mov    %rsi,%r8
   0x0000000000154d19 <+1097>:	sar    $0x3f,%r8
   0x0000000000154d1d <+1101>:	xor    %r8,%rsi
   0x0000000000154d20 <+1104>:	sub    %r8,%rsi
   0x0000000000154d23 <+1107>:	add    %rsi,%rcx
   0x0000000000154d26 <+1110>:	mov    %cl,0x233(%rdx)
   0x0000000000154d2c <+1116>:	movsbq 0x234(%rdi),%rcx
   0x0000000000154d34 <+1124>:	mov    %rcx,%rsi
   0x0000000000154d37 <+1127>:	sar    $0x3f,%rsi
   0x0000000000154d3b <+1131>:	xor    %rsi,%rcx
   0x0000000000154d3e <+1134>:	sub    %rsi,%rcx
   0x0000000000154d41 <+1137>:	movsbq 0x234(%rax),%rsi
   0x0000000000154d49 <+1145>:	mov    %rsi,%r8
   0x0000000000154d4c <+1148>:	sar    $0x3f,%r8
   0x0000000000154d50 <+1152>:	xor    %r8,%rsi
   0x0000000000154d53 <+1155>:	sub    %r8,%rsi
   0x0000000000154d56 <+1158>:	add    %rsi,%rcx
   0x0000000000154d59 <+1161>:	mov    %cl,0x234(%rdx)
   0x0000000000154d5f <+1167>:	movsbq 0x235(%rax),%rcx
   0x0000000000154d67 <+1175>:	mov    %rcx,%rsi
   0x0000000000154d6a <+1178>:	sar    $0x3f,%rsi
   0x0000000000154d6e <+1182>:	xor    %rsi,%rcx
   0x0000000000154d71 <+1185>:	sub    %rsi,%rcx
   0x0000000000154d74 <+1188>:	movsbq 0x235(%rdi),%rsi
   0x0000000000154d7c <+1196>:	mov    %rsi,%r8
   0x0000000000154d7f <+1199>:	sar    $0x3f,%r8
   0x0000000000154d83 <+1203>:	xor    %r8,%rsi
   0x0000000000154d86 <+1206>:	sub    %r8,%rsi
   0x0000000000154d89 <+1209>:	add    %rsi,%rcx
   0x0000000000154d8c <+1212>:	mov    %cl,0x235(%rdx)
   0x0000000000154d92 <+1218>:	movsbq 0x236(%rdi),%rcx
   0x0000000000154d9a <+1226>:	mov    %rcx,%rsi
   0x0000000000154d9d <+1229>:	sar    $0x3f,%rsi
   0x0000000000154da1 <+1233>:	xor    %rsi,%rcx
   0x0000000000154da4 <+1236>:	sub    %rsi,%rcx
   0x0000000000154da7 <+1239>:	movsbq 0x236(%rax),%rsi
   0x0000000000154daf <+1247>:	mov    %rsi,%r8
   0x0000000000154db2 <+1250>:	sar    $0x3f,%r8
   0x0000000000154db6 <+1254>:	xor    %r8,%rsi
   0x0000000000154db9 <+1257>:	sub    %r8,%rsi
   0x0000000000154dbc <+1260>:	add    %rsi,%rcx
   0x0000000000154dbf <+1263>:	mov    %cl,0x236(%rdx)
   0x0000000000154dc5 <+1269>:	movsbq 0x237(%rax),%rax
   0x0000000000154dcd <+1277>:	mov    %rax,%rcx
   0x0000000000154dd0 <+1280>:	sar    $0x3f,%rcx
   0x0000000000154dd4 <+1284>:	xor    %rcx,%rax
   0x0000000000154dd7 <+1287>:	sub    %rcx,%rax
   0x0000000000154dda <+1290>:	movsbq 0x237(%rdi),%rcx
   0x0000000000154de2 <+1298>:	mov    %rcx,%rsi
   0x0000000000154de5 <+1301>:	sar    $0x3f,%rsi
   0x0000000000154de9 <+1305>:	xor    %rsi,%rcx
   0x0000000000154dec <+1308>:	sub    %rsi,%rcx
   0x0000000000154def <+1311>:	add    %rcx,%rax
   0x0000000000154df2 <+1314>:	mov    %al,0x237(%rdx)
   0x0000000000154df8 <+1320>:	retq   
   0x0000000000154df9 <+1321>:	nopl   0x0(%rax)
   0x0000000000154e00 <+1328>:	mov    %ecx,%eax
   0x0000000000154e02 <+1330>:	mov    %r8d,%r8d
   0x0000000000154e05 <+1333>:	mov    %edx,%edx
   0x0000000000154e07 <+1335>:	lea    0x22(%rax),%rcx
   0x0000000000154e0b <+1339>:	lea    0x22(%rdx),%r9
   0x0000000000154e0f <+1343>:	shl    $0x4,%rax
   0x0000000000154e13 <+1347>:	lea    (%rdi,%rax,1),%rax
   0x0000000000154e17 <+1351>:	shl    $0x4,%rdx
   0x0000000000154e1b <+1355>:	shl    $0x4,%rcx
   0x0000000000154e1f <+1359>:	shl    $0x4,%r9
   0x0000000000154e23 <+1363>:	add    %rdi,%rdx
   0x0000000000154e26 <+1366>:	movswq 0x8(%rdi,%rcx,1),%rsi
   0x0000000000154e2c <+1372>:	mov    %rsi,%rcx
   0x0000000000154e2f <+1375>:	sar    $0x3f,%rcx
   0x0000000000154e33 <+1379>:	xor    %rcx,%rsi
   0x0000000000154e36 <+1382>:	sub    %rcx,%rsi
   0x0000000000154e39 <+1385>:	lea    0x22(%r8),%rcx
   0x0000000000154e3d <+1389>:	shl    $0x4,%r8
   0x0000000000154e41 <+1393>:	shl    $0x4,%rcx
   0x0000000000154e45 <+1397>:	movswq 0x8(%rdi,%rcx,1),%rcx
   0x0000000000154e4b <+1403>:	mov    %rcx,%r10
   0x0000000000154e4e <+1406>:	sar    $0x3f,%r10
   0x0000000000154e52 <+1410>:	xor    %r10,%rcx
   0x0000000000154e55 <+1413>:	sub    %r10,%rcx
   0x0000000000154e58 <+1416>:	add    %rcx,%rsi
   0x0000000000154e5b <+1419>:	mov    %si,0x8(%rdi,%r9,1)
   0x0000000000154e61 <+1425>:	movswq 0x22a(%rax),%rcx
   0x0000000000154e69 <+1433>:	add    %r8,%rdi
   0x0000000000154e6c <+1436>:	mov    %rcx,%rsi
   0x0000000000154e6f <+1439>:	sar    $0x3f,%rsi
   0x0000000000154e73 <+1443>:	xor    %rsi,%rcx
   0x0000000000154e76 <+1446>:	sub    %rsi,%rcx
   0x0000000000154e79 <+1449>:	movswq 0x22a(%rdi),%rsi
   0x0000000000154e81 <+1457>:	mov    %rsi,%r8
   0x0000000000154e84 <+1460>:	sar    $0x3f,%r8
   0x0000000000154e88 <+1464>:	xor    %r8,%rsi
   0x0000000000154e8b <+1467>:	sub    %r8,%rsi
   0x0000000000154e8e <+1470>:	add    %rsi,%rcx
   0x0000000000154e91 <+1473>:	mov    %cx,0x22a(%rdx)
   0x0000000000154e98 <+1480>:	movswq 0x22c(%rax),%rcx
   0x0000000000154ea0 <+1488>:	mov    %rcx,%rsi
   0x0000000000154ea3 <+1491>:	sar    $0x3f,%rsi
   0x0000000000154ea7 <+1495>:	xor    %rsi,%rcx
   0x0000000000154eaa <+1498>:	sub    %rsi,%rcx
   0x0000000000154ead <+1501>:	movswq 0x22c(%rdi),%rsi
   0x0000000000154eb5 <+1509>:	mov    %rsi,%r8
   0x0000000000154eb8 <+1512>:	sar    $0x3f,%r8
   0x0000000000154ebc <+1516>:	xor    %r8,%rsi
   0x0000000000154ebf <+1519>:	sub    %r8,%rsi
   0x0000000000154ec2 <+1522>:	add    %rsi,%rcx
   0x0000000000154ec5 <+1525>:	mov    %cx,0x22c(%rdx)
   0x0000000000154ecc <+1532>:	movswq 0x22e(%rax),%rcx
   0x0000000000154ed4 <+1540>:	mov    %rcx,%rsi
   0x0000000000154ed7 <+1543>:	sar    $0x3f,%rsi
   0x0000000000154edb <+1547>:	xor    %rsi,%rcx
   0x0000000000154ede <+1550>:	sub    %rsi,%rcx
   0x0000000000154ee1 <+1553>:	movswq 0x22e(%rdi),%rsi
   0x0000000000154ee9 <+1561>:	mov    %rsi,%r8
   0x0000000000154eec <+1564>:	sar    $0x3f,%r8
   0x0000000000154ef0 <+1568>:	xor    %r8,%rsi
   0x0000000000154ef3 <+1571>:	sub    %r8,%rsi
   0x0000000000154ef6 <+1574>:	add    %rsi,%rcx
   0x0000000000154ef9 <+1577>:	mov    %cx,0x22e(%rdx)
   0x0000000000154f00 <+1584>:	movswq 0x230(%rax),%rcx
   0x0000000000154f08 <+1592>:	mov    %rcx,%rsi
   0x0000000000154f0b <+1595>:	sar    $0x3f,%rsi
   0x0000000000154f0f <+1599>:	xor    %rsi,%rcx
   0x0000000000154f12 <+1602>:	sub    %rsi,%rcx
   0x0000000000154f15 <+1605>:	movswq 0x230(%rdi),%rsi
   0x0000000000154f1d <+1613>:	mov    %rsi,%r8
   0x0000000000154f20 <+1616>:	sar    $0x3f,%r8
   0x0000000000154f24 <+1620>:	xor    %r8,%rsi
   0x0000000000154f27 <+1623>:	sub    %r8,%rsi
   0x0000000000154f2a <+1626>:	add    %rsi,%rcx
   0x0000000000154f2d <+1629>:	mov    %cx,0x230(%rdx)
   0x0000000000154f34 <+1636>:	movswq 0x232(%rax),%rcx
   0x0000000000154f3c <+1644>:	mov    %rcx,%rsi
   0x0000000000154f3f <+1647>:	sar    $0x3f,%rsi
   0x0000000000154f43 <+1651>:	xor    %rsi,%rcx
   0x0000000000154f46 <+1654>:	sub    %rsi,%rcx
   0x0000000000154f49 <+1657>:	movswq 0x232(%rdi),%rsi
   0x0000000000154f51 <+1665>:	mov    %rsi,%r8
   0x0000000000154f54 <+1668>:	sar    $0x3f,%r8
   0x0000000000154f58 <+1672>:	xor    %r8,%rsi
   0x0000000000154f5b <+1675>:	sub    %r8,%rsi
   0x0000000000154f5e <+1678>:	add    %rsi,%rcx
   0x0000000000154f61 <+1681>:	mov    %cx,0x232(%rdx)
   0x0000000000154f68 <+1688>:	movswq 0x234(%rax),%rcx
   0x0000000000154f70 <+1696>:	mov    %rcx,%rsi
   0x0000000000154f73 <+1699>:	sar    $0x3f,%rsi
   0x0000000000154f77 <+1703>:	xor    %rsi,%rcx
   0x0000000000154f7a <+1706>:	sub    %rsi,%rcx
   0x0000000000154f7d <+1709>:	movswq 0x234(%rdi),%rsi
   0x0000000000154f85 <+1717>:	mov    %rsi,%r8
   0x0000000000154f88 <+1720>:	sar    $0x3f,%r8
   0x0000000000154f8c <+1724>:	xor    %r8,%rsi
   0x0000000000154f8f <+1727>:	sub    %r8,%rsi
   0x0000000000154f92 <+1730>:	add    %rsi,%rcx
   0x0000000000154f95 <+1733>:	mov    %cx,0x234(%rdx)
   0x0000000000154f9c <+1740>:	movswq 0x236(%rax),%rax
   0x0000000000154fa4 <+1748>:	mov    %rax,%rcx
   0x0000000000154fa7 <+1751>:	sar    $0x3f,%rcx
   0x0000000000154fab <+1755>:	xor    %rcx,%rax
   0x0000000000154fae <+1758>:	sub    %rcx,%rax
   0x0000000000154fb1 <+1761>:	movswq 0x236(%rdi),%rcx
   0x0000000000154fb9 <+1769>:	mov    %rcx,%rsi
   0x0000000000154fbc <+1772>:	sar    $0x3f,%rsi
   0x0000000000154fc0 <+1776>:	xor    %rsi,%rcx
   0x0000000000154fc3 <+1779>:	sub    %rsi,%rcx
   0x0000000000154fc6 <+1782>:	add    %rcx,%rax
   0x0000000000154fc9 <+1785>:	mov    %ax,0x236(%rdx)
   0x0000000000154fd0 <+1792>:	retq   
   0x0000000000154fd1 <+1793>:	lea    0x14faa8(%rip),%rcx        # 0x2a4a80 <__PRETTY_FUNCTION__.25843>
   0x0000000000154fd8 <+1800>:	lea    0x14ef81(%rip),%rsi        # 0x2a3f60
   0x0000000000154fdf <+1807>:	lea    0x1da975(%rip),%rdi        # 0x32f95b
   0x0000000000154fe6 <+1814>:	sub    $0x8,%rsp
   0x0000000000154fea <+1818>:	mov    $0x368,%edx
   0x0000000000154fef <+1823>:	callq  0x8f170


> >
> > Signed-off-by: Mateja Marjanovic <mateja.marjanovic@rt-rk.com>
> > ---
> >  target/mips/msa_helper.c | 43 ++++++++++++++++++++++++++++++-------------
> >  1 file changed, 30 insertions(+), 13 deletions(-)
> >
> > diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
> > index 4c7ec05..1152fda 100644
> > --- a/target/mips/msa_helper.c
> > +++ b/target/mips/msa_helper.c
> > @@ -804,28 +804,45 @@ void helper_msa_ ## func ## _df(CPUMIPSState *env, uint32_t df,         \
> >      wr_t *pwd = &(env->active_fpu.fpr[wd].wr);                          \
> >      wr_t *pws = &(env->active_fpu.fpr[ws].wr);                          \
> >      wr_t *pwt = &(env->active_fpu.fpr[wt].wr);
> > \

> If we can ensure alignment for the various vector registers then the
> compiler always has the option of using host vectors (certainly for int
> and logic operations).

> > -    uint32_t i;                                                         \
> >                                                                          \

> >      switch (df) {                                                       \
> >      case DF_BYTE:                                                       \
> > -        for (i = 0; i < DF_ELEMENTS(DF_BYTE); i++) {                    \
> > -            pwd->b[i] = msa_ ## func ## _df(df, pws->b[i], pwt->b[i]);  \
> > -        }                                                               \
> > +        pwd->b[0]  = msa_ ## func ## _df(df, pws->b[0], pwt->b[0]);     \
> > +        pwd->b[1]  = msa_ ## func ## _df(df, pws->b[1], pwt->b[1]);     \
> > +        pwd->b[2]  = msa_ ## func ## _df(df, pws->b[2], pwt->b[2]);     \
> > +        pwd->b[3]  = msa_ ## func ## _df(df, pws->b[3], pwt->b[3]);     \
> > +        pwd->b[4]  = msa_ ## func ## _df(df, pws->b[4], pwt->b[4]);     \
> > +        pwd->b[5]  = msa_ ## func ## _df(df, pws->b[5], pwt->b[5]);     \
> > +        pwd->b[6]  = msa_ ## func ## _df(df, pws->b[6], pwt->b[6]);     \
> > +        pwd->b[7]  = msa_ ## func ## _df(df, pws->b[7], pwt->b[7]);     \
> > +        pwd->b[8]  = msa_ ## func ## _df(df, pws->b[8], pwt->b[8]);     \
> > +        pwd->b[9]  = msa_ ## func ## _df(df, pws->b[9], pwt->b[9]);     \
> > +        pwd->b[10] = msa_ ## func ## _df(df, pws->b[10], pwt->b[10]);   \
> > +        pwd->b[11] = msa_ ## func ## _df(df, pws->b[11], pwt->b[11]);   \
> > +        pwd->b[12] = msa_ ## func ## _df(df, pws->b[12], pwt->b[12]);   \
> > +        pwd->b[13] = msa_ ## func ## _df(df, pws->b[13], pwt->b[13]);   \
> > +        pwd->b[14] = msa_ ## func ## _df(df, pws->b[14], pwt->b[14]);   \
> > +        pwd->b[15] = msa_ ## func ## _df(df, pws->b[15], pwt->b[15]);   \
> >          break;                                                          \
> >      case DF_HALF:                                                       \
> > -        for (i = 0; i < DF_ELEMENTS(DF_HALF); i++) {                    \
> > -            pwd->h[i] = msa_ ## func ## _df(df, pws->h[i], pwt->h[i]);  \
> > -        }                                                               \
> > +        pwd->h[0] = msa_ ## func ## _df(df, pws->h[0], pwt->h[0]);      \
> > +        pwd->h[1] = msa_ ## func ## _df(df, pws->h[1], pwt->h[1]);      \
> > +        pwd->h[2] = msa_ ## func ## _df(df, pws->h[2], pwt->h[2]);      \
> > +        pwd->h[3] = msa_ ## func ## _df(df, pws->h[3], pwt->h[3]);      \
> > +        pwd->h[4] = msa_ ## func ## _df(df, pws->h[4], pwt->h[4]);      \
> > +        pwd->h[5] = msa_ ## func ## _df(df, pws->h[5], pwt->h[5]);      \
> > +        pwd->h[6] = msa_ ## func ## _df(df, pws->h[6], pwt->h[6]);      \
> > +        pwd->h[7] = msa_ ## func ## _df(df, pws->h[7], pwt->h[7]);      \
> >          break;                                                          \
> >      case DF_WORD:                                                       \
> > -        for (i = 0; i < DF_ELEMENTS(DF_WORD); i++) {                    \
> > -            pwd->w[i] = msa_ ## func ## _df(df, pws->w[i], pwt->w[i]);  \
> > -        }                                                               \
> > +        pwd->w[0] = msa_ ## func ## _df(df, pws->w[0], pwt->w[0]);      \
> > +        pwd->w[1] = msa_ ## func ## _df(df, pws->w[1], pwt->w[1]);      \
> > +        pwd->w[2] = msa_ ## func ## _df(df, pws->w[2], pwt->w[2]);      \
> > +        pwd->w[3] = msa_ ## func ## _df(df, pws->w[3], pwt->w[3]);      \
> >          break;                                                          \
> >      case DF_DOUBLE:                                                     \
> > -        for (i = 0; i < DF_ELEMENTS(DF_DOUBLE); i++) {                  \
> > -            pwd->d[i] = msa_ ## func ## _df(df, pws->d[i], pwt->d[i]);  \
> > -        }                                                               \
> > +        pwd->d[0] = msa_ ## func ## _df(df, pws->d[0], pwt->d[0]);      \
> > +        pwd->d[1] = msa_ ## func ## _df(df, pws->d[1], pwt->d[1]);      \
> >          break;                                                          \
> >      default:                                                            \
> >          assert(0);                                                      \


--
Alex Bennée
Mateja Marjanovic June 3, 2019, 1:29 p.m. UTC | #6
On 3.6.19. 15:10, Aleksandar Markovic wrote:
>> From: Alex Bennée <alex.bennee@linaro.org>
>> Sent: Sunday, June 2, 2019 3:22 PM
>> To: qemu-devel@nongnu.org
>> Cc: Aleksandar Rikalo; Aleksandar Markovic; aurelien@aurel32.net
>> Subject: Re: [Qemu-devel] [PATCH 1/2] target/mips: Improve performance for MSA binary operations
>
>> Mateja Marjanovic <mateja.marjanovic@rt-rk.com> writes:
>>> From: Mateja Marjanovic <Mateja.Marjanovic@rt-rk.com>
>>>
>>> Eliminate loops for better performance.
>> Have you done any measurements of the bellow loop unrolling? Because
>> this is something that maybe we can achieve and let the compiler make
>> the choice.
> I know that Mateja did extensive performance measurements, and I am
> asking him to give us some samples.
Yes, here are some of the binop instructions with a loop and without
it:

|| intruction   ||       with-loop     ||    without-loop   ||
===============================
||  addv.b      ||    147.356 ms     ||    84.249 ms     ||
||  addv.h      ||     83.612 ms      ||    44.808 ms     ||
||  addv.w      ||     49.952 ms     ||    34.128 ms     ||
||  addv.d      ||     29.834 ms      ||    34.103 ms     ||
||  asub_s.b   ||    187.939 ms     ||    143.658 ms   ||
||  asub_s.h   ||    105.266 ms     ||     82.424 ms    ||
||  asub_s.w  ||     68.941 ms      ||     57.923 ms    ||
||  asub_s.d   ||     41.536 ms      ||     42.092 ms    ||
||  max_s.b    ||    156.036 ms    ||    100.628 ms    ||
||  max_s.h    ||     87.100 ms     ||     64.905 ms     ||
||  max_s.w   ||     52.339 ms     ||     40.632 ms     ||
||  max_s.d    ||     34.873 ms     ||     35.562 ms     ||
===============================

>
> As for code generation, here are disassemblies of function
> helper_msa_add_a_df() before and after this patch:
>
> (it is visible the compiler did not perform unrolling loops by itself)
>
> BEFORE:
>
> Dump of assembler code for function helper_msa_add_a_df:
>     0x00000000001500b0 <+0>:	cmp    $0x1,%esi
>     0x00000000001500b3 <+3>:	je     0x150258 <helper_msa_add_a_df+424>
>     0x00000000001500b9 <+9>:	jb     0x1501e8 <helper_msa_add_a_df+312>
>     0x00000000001500bf <+15>:	cmp    $0x2,%esi
>     0x00000000001500c2 <+18>:	je     0x150180 <helper_msa_add_a_df+208>
>     0x00000000001500c8 <+24>:	cmp    $0x3,%esi
>     0x00000000001500cb <+27>:	jne    0x1502c2 <helper_msa_add_a_df+530>
>     0x00000000001500d1 <+33>:	mov    %ecx,%ecx
>     0x00000000001500d3 <+35>:	mov    %edx,%edx
>     0x00000000001500d5 <+37>:	lea    0x22(%rcx),%rax
>     0x00000000001500d9 <+41>:	lea    0x22(%rdx),%r10
>     0x00000000001500dd <+45>:	shl    $0x4,%rcx
>     0x00000000001500e1 <+49>:	add    %rdi,%rcx
>     0x00000000001500e4 <+52>:	shl    $0x4,%rdx
>     0x00000000001500e8 <+56>:	shl    $0x4,%rax
>     0x00000000001500ec <+60>:	shl    $0x4,%r10
>     0x00000000001500f0 <+64>:	add    %rdi,%rax
>     0x00000000001500f3 <+67>:	mov    0x8(%rax),%r9
>     0x00000000001500f7 <+71>:	mov    0x8(%rax),%rsi
>     0x00000000001500fb <+75>:	mov    %r8d,%eax
>     0x00000000001500fe <+78>:	sar    $0x3f,%r9
>     0x0000000000150102 <+82>:	xor    %r9,%rsi
>     0x0000000000150105 <+85>:	sub    %r9,%rsi
>     0x0000000000150108 <+88>:	mov    %rsi,%r9
>     0x000000000015010b <+91>:	lea    0x22(%rax),%rsi
>     0x000000000015010f <+95>:	shl    $0x4,%rax
>     0x0000000000150113 <+99>:	lea    (%rdi,%rax,1),%rax
>     0x0000000000150117 <+103>:	shl    $0x4,%rsi
>     0x000000000015011b <+107>:	add    %rdi,%rsi
>     0x000000000015011e <+110>:	mov    0x8(%rsi),%r8
>     0x0000000000150122 <+114>:	mov    0x8(%rsi),%r11
>     0x0000000000150126 <+118>:	sar    $0x3f,%r8
>     0x000000000015012a <+122>:	xor    %r8,%r11
>     0x000000000015012d <+125>:	mov    %r11,%rsi
>     0x0000000000150130 <+128>:	sub    %r8,%rsi
>     0x0000000000150133 <+131>:	add    %r9,%rsi
>     0x0000000000150136 <+134>:	mov    %rsi,0x8(%rdi,%r10,1)
>     0x000000000015013b <+139>:	mov    0x230(%rcx),%rsi
>     0x0000000000150142 <+146>:	mov    0x230(%rcx),%r8
>     0x0000000000150149 <+153>:	sar    $0x3f,%rsi
>     0x000000000015014d <+157>:	xor    %rsi,%r8
>     0x0000000000150150 <+160>:	mov    %r8,%rcx
>     0x0000000000150153 <+163>:	mov    0x230(%rax),%r8
>     0x000000000015015a <+170>:	sub    %rsi,%rcx
>     0x000000000015015d <+173>:	mov    0x230(%rax),%rsi
>     0x0000000000150164 <+180>:	sar    $0x3f,%rsi
>     0x0000000000150168 <+184>:	xor    %rsi,%r8
>     0x000000000015016b <+187>:	mov    %r8,%rax
>     0x000000000015016e <+190>:	sub    %rsi,%rax
>     0x0000000000150171 <+193>:	add    %rcx,%rax
>     0x0000000000150174 <+196>:	mov    %rax,0x230(%rdi,%rdx,1)
>     0x000000000015017c <+204>:	retq
>     0x000000000015017d <+205>:	nopl   (%rax)
>     0x0000000000150180 <+208>:	mov    %r8d,%r8d
>     0x0000000000150183 <+211>:	mov    %ecx,%ecx
>     0x0000000000150185 <+213>:	mov    %edx,%edx
>     0x0000000000150187 <+215>:	mov    %r8,%rax
>     0x000000000015018a <+218>:	neg    %r8
>     0x000000000015018d <+221>:	shl    $0x4,%rcx
>     0x0000000000150191 <+225>:	shl    $0x4,%rax
>     0x0000000000150195 <+229>:	shl    $0x4,%r8
>     0x0000000000150199 <+233>:	shl    $0x4,%rdx
>     0x000000000015019d <+237>:	lea    0x228(%rdi,%rax,1),%r9
>     0x00000000001501a5 <+245>:	lea    0x238(%rdi,%rax,1),%rdi
>     0x00000000001501ad <+253>:	lea    (%r9,%r8,1),%r10
>     0x00000000001501b1 <+257>:	add    $0x4,%r9
>     0x00000000001501b5 <+261>:	movslq (%r10,%rcx,1),%rax
>     0x00000000001501b9 <+265>:	mov    %rax,%rsi
>     0x00000000001501bc <+268>:	sar    $0x3f,%rsi
>     0x00000000001501c0 <+272>:	xor    %rsi,%rax
>     0x00000000001501c3 <+275>:	sub    %rsi,%rax
>     0x00000000001501c6 <+278>:	movslq -0x4(%r9),%rsi
>     0x00000000001501ca <+282>:	mov    %rsi,%r11
>     0x00000000001501cd <+285>:	sar    $0x3f,%r11
>     0x00000000001501d1 <+289>:	xor    %r11,%rsi
>     0x00000000001501d4 <+292>:	sub    %r11,%rsi
>     0x00000000001501d7 <+295>:	add    %rsi,%rax
>     0x00000000001501da <+298>:	cmp    %rdi,%r9
>     0x00000000001501dd <+301>:	mov    %eax,(%r10,%rdx,1)
>     0x00000000001501e1 <+305>:	jne    0x1501ad <helper_msa_add_a_df+253>
>     0x00000000001501e3 <+307>:	repz retq
>     0x00000000001501e5 <+309>:	nopl   (%rax)
>     0x00000000001501e8 <+312>:	mov    %r8d,%r8d
>     0x00000000001501eb <+315>:	mov    %ecx,%ecx
>     0x00000000001501ed <+317>:	mov    %edx,%edx
>     0x00000000001501ef <+319>:	mov    %r8,%rax
>     0x00000000001501f2 <+322>:	neg    %r8
>     0x00000000001501f5 <+325>:	shl    $0x4,%rcx
>     0x00000000001501f9 <+329>:	shl    $0x4,%rax
>     0x00000000001501fd <+333>:	shl    $0x4,%r8
>     0x0000000000150201 <+337>:	shl    $0x4,%rdx
>     0x0000000000150205 <+341>:	lea    0x228(%rdi,%rax,1),%r9
>     0x000000000015020d <+349>:	lea    0x238(%rdi,%rax,1),%r11
>     0x0000000000150215 <+357>:	nopl   (%rax)
>     0x0000000000150218 <+360>:	lea    (%r8,%r9,1),%rdi
>     0x000000000015021c <+364>:	add    $0x1,%r9
>     0x0000000000150220 <+368>:	movsbq (%rdi,%rcx,1),%rax
>     0x0000000000150225 <+373>:	mov    %rax,%rsi
>     0x0000000000150228 <+376>:	sar    $0x3f,%rsi
>     0x000000000015022c <+380>:	xor    %rsi,%rax
>     0x000000000015022f <+383>:	sub    %rsi,%rax
>     0x0000000000150232 <+386>:	movsbq -0x1(%r9),%rsi
>     0x0000000000150237 <+391>:	mov    %rsi,%r10
>     0x000000000015023a <+394>:	sar    $0x3f,%r10
>     0x000000000015023e <+398>:	xor    %r10,%rsi
>     0x0000000000150241 <+401>:	sub    %r10,%rsi
>     0x0000000000150244 <+404>:	add    %rsi,%rax
>     0x0000000000150247 <+407>:	cmp    %r9,%r11
>     0x000000000015024a <+410>:	mov    %al,(%rdi,%rdx,1)
>     0x000000000015024d <+413>:	jne    0x150218 <helper_msa_add_a_df+360>
>     0x000000000015024f <+415>:	repz retq
>     0x0000000000150251 <+417>:	nopl   0x0(%rax)
>     0x0000000000150258 <+424>:	mov    %r8d,%r8d
>     0x000000000015025b <+427>:	mov    %ecx,%ecx
>     0x000000000015025d <+429>:	mov    %edx,%edx
>     0x000000000015025f <+431>:	mov    %r8,%rax
>     0x0000000000150262 <+434>:	neg    %r8
>     0x0000000000150265 <+437>:	shl    $0x4,%rcx
>     0x0000000000150269 <+441>:	shl    $0x4,%rax
>     0x000000000015026d <+445>:	shl    $0x4,%r8
>     0x0000000000150271 <+449>:	shl    $0x4,%rdx
>     0x0000000000150275 <+453>:	lea    0x228(%rdi,%rax,1),%r9
>     0x000000000015027d <+461>:	lea    0x238(%rdi,%rax,1),%r10
>     0x0000000000150285 <+469>:	nopl   (%rax)
>     0x0000000000150288 <+472>:	lea    (%r8,%r9,1),%rdi
>     0x000000000015028c <+476>:	add    $0x2,%r9
>     0x0000000000150290 <+480>:	movswq (%rdi,%rcx,1),%rax
>     0x0000000000150295 <+485>:	mov    %rax,%rsi
>     0x0000000000150298 <+488>:	sar    $0x3f,%rsi
>     0x000000000015029c <+492>:	xor    %rsi,%rax
>     0x000000000015029f <+495>:	sub    %rsi,%rax
>     0x00000000001502a2 <+498>:	movswq -0x2(%r9),%rsi
>     0x00000000001502a7 <+503>:	mov    %rsi,%r11
>     0x00000000001502aa <+506>:	sar    $0x3f,%r11
>     0x00000000001502ae <+510>:	xor    %r11,%rsi
>     0x00000000001502b1 <+513>:	sub    %r11,%rsi
>     0x00000000001502b4 <+516>:	add    %rsi,%rax
>     0x00000000001502b7 <+519>:	cmp    %r10,%r9
>     0x00000000001502ba <+522>:	mov    %ax,(%rdi,%rdx,1)
>     0x00000000001502be <+526>:	jne    0x150288 <helper_msa_add_a_df+472>
>     0x00000000001502c0 <+528>:	repz retq
>     0x00000000001502c2 <+530>:	lea    0x13c3b7(%rip),%rcx        # 0x28c680 <__PRETTY_FUNCTION__.26062>
>     0x00000000001502c9 <+537>:	lea    0x13b830(%rip),%rsi        # 0x28bb00
>     0x00000000001502d0 <+544>:	lea    0x1c7204(%rip),%rdi        # 0x3174db
>     0x00000000001502d7 <+551>:	sub    $0x8,%rsp
>     0x00000000001502db <+555>:	mov    $0x357,%edx
>     0x00000000001502e0 <+560>:	callq  0x8eeb8
> End of assembler dump.
>
>
> AFTER:
>
>     0x00000000001548d0 <+0>:	cmp    $0x1,%esi
>     0x00000000001548d3 <+3>:	je     0x154e00 <helper_msa_add_a_df+1328>
>     0x00000000001548d9 <+9>:	jb     0x154a98 <helper_msa_add_a_df+456>
>     0x00000000001548df <+15>:	cmp    $0x2,%esi
>     0x00000000001548e2 <+18>:	je     0x1549a0 <helper_msa_add_a_df+208>
>     0x00000000001548e8 <+24>:	cmp    $0x3,%esi
>     0x00000000001548eb <+27>:	jne    0x154fd1 <helper_msa_add_a_df+1793>
>     0x00000000001548f1 <+33>:	mov    %ecx,%eax
>     0x00000000001548f3 <+35>:	mov    %r8d,%r8d
>     0x00000000001548f6 <+38>:	mov    %edx,%edx
>     0x00000000001548f8 <+40>:	lea    0x22(%rax),%rcx
>     0x00000000001548fc <+44>:	lea    0x22(%rdx),%r9
>     0x0000000000154900 <+48>:	shl    $0x4,%rax
>     0x0000000000154904 <+52>:	add    %rdi,%rax
>     0x0000000000154907 <+55>:	shl    $0x4,%rdx
>     0x000000000015490b <+59>:	shl    $0x4,%rcx
>     0x000000000015490f <+63>:	shl    $0x4,%r9
>     0x0000000000154913 <+67>:	add    %rdi,%rcx
>     0x0000000000154916 <+70>:	mov    0x8(%rcx),%rsi
>     0x000000000015491a <+74>:	mov    0x8(%rcx),%r11
>     0x000000000015491e <+78>:	sar    $0x3f,%rsi
>     0x0000000000154922 <+82>:	xor    %rsi,%r11
>     0x0000000000154925 <+85>:	mov    %r11,%rcx
>     0x0000000000154928 <+88>:	sub    %rsi,%rcx
>     0x000000000015492b <+91>:	mov    %rcx,%rsi
>     0x000000000015492e <+94>:	lea    0x22(%r8),%rcx
>     0x0000000000154932 <+98>:	shl    $0x4,%r8
>     0x0000000000154936 <+102>:	add    %rdi,%r8
>     0x0000000000154939 <+105>:	shl    $0x4,%rcx
>     0x000000000015493d <+109>:	add    %rdi,%rcx
>     0x0000000000154940 <+112>:	mov    0x8(%rcx),%r10
>     0x0000000000154944 <+116>:	mov    0x8(%rcx),%r11
>     0x0000000000154948 <+120>:	sar    $0x3f,%r10
>     0x000000000015494c <+124>:	xor    %r10,%r11
>     0x000000000015494f <+127>:	mov    %r11,%rcx
>     0x0000000000154952 <+130>:	sub    %r10,%rcx
>     0x0000000000154955 <+133>:	add    %rsi,%rcx
>     0x0000000000154958 <+136>:	mov    %rcx,0x8(%rdi,%r9,1)
>     0x000000000015495d <+141>:	mov    0x230(%rax),%rcx
>     0x0000000000154964 <+148>:	mov    0x230(%rax),%rsi
>     0x000000000015496b <+155>:	sar    $0x3f,%rcx
>     0x000000000015496f <+159>:	xor    %rcx,%rsi
>     0x0000000000154972 <+162>:	mov    %rsi,%rax
>     0x0000000000154975 <+165>:	mov    0x230(%r8),%rsi
>     0x000000000015497c <+172>:	sub    %rcx,%rax
>     0x000000000015497f <+175>:	mov    %rax,%rcx
>     0x0000000000154982 <+178>:	mov    0x230(%r8),%rax
>     0x0000000000154989 <+185>:	sar    $0x3f,%rsi
>     0x000000000015498d <+189>:	xor    %rsi,%rax
>     0x0000000000154990 <+192>:	sub    %rsi,%rax
>     0x0000000000154993 <+195>:	add    %rcx,%rax
>     0x0000000000154996 <+198>:	mov    %rax,0x230(%rdi,%rdx,1)
>     0x000000000015499e <+206>:	retq
>     0x000000000015499f <+207>:	nop
>     0x00000000001549a0 <+208>:	mov    %ecx,%ecx
>     0x00000000001549a2 <+210>:	mov    %r8d,%r8d
>     0x00000000001549a5 <+213>:	mov    %edx,%edx
>     0x00000000001549a7 <+215>:	lea    0x22(%rcx),%rax
>     0x00000000001549ab <+219>:	lea    0x22(%rdx),%r9
>     0x00000000001549af <+223>:	shl    $0x4,%rcx
>     0x00000000001549b3 <+227>:	add    %rdi,%rcx
>     0x00000000001549b6 <+230>:	shl    $0x4,%rdx
>     0x00000000001549ba <+234>:	shl    $0x4,%rax
>     0x00000000001549be <+238>:	shl    $0x4,%r9
>     0x00000000001549c2 <+242>:	add    %rdi,%rdx
>     0x00000000001549c5 <+245>:	movslq 0x8(%rdi,%rax,1),%rax
>     0x00000000001549ca <+250>:	mov    %rax,%rsi
>     0x00000000001549cd <+253>:	sar    $0x3f,%rsi
>     0x00000000001549d1 <+257>:	xor    %rsi,%rax
>     0x00000000001549d4 <+260>:	sub    %rsi,%rax
>     0x00000000001549d7 <+263>:	lea    0x22(%r8),%rsi
>     0x00000000001549db <+267>:	shl    $0x4,%r8
>     0x00000000001549df <+271>:	shl    $0x4,%rsi
>     0x00000000001549e3 <+275>:	movslq 0x8(%rdi,%rsi,1),%rsi
>     0x00000000001549e8 <+280>:	mov    %rsi,%r10
>     0x00000000001549eb <+283>:	sar    $0x3f,%r10
>     0x00000000001549ef <+287>:	xor    %r10,%rsi
>     0x00000000001549f2 <+290>:	sub    %r10,%rsi
>     0x00000000001549f5 <+293>:	add    %rsi,%rax
>     0x00000000001549f8 <+296>:	mov    %eax,0x8(%rdi,%r9,1)
>     0x00000000001549fd <+301>:	movslq 0x22c(%rcx),%rax
>     0x0000000000154a04 <+308>:	add    %r8,%rdi
>     0x0000000000154a07 <+311>:	mov    %rax,%rsi
>     0x0000000000154a0a <+314>:	sar    $0x3f,%rsi
>     0x0000000000154a0e <+318>:	xor    %rsi,%rax
>     0x0000000000154a11 <+321>:	sub    %rsi,%rax
>     0x0000000000154a14 <+324>:	movslq 0x22c(%rdi),%rsi
>     0x0000000000154a1b <+331>:	mov    %rsi,%r8
>     0x0000000000154a1e <+334>:	sar    $0x3f,%r8
>     0x0000000000154a22 <+338>:	xor    %r8,%rsi
>     0x0000000000154a25 <+341>:	sub    %r8,%rsi
>     0x0000000000154a28 <+344>:	add    %rsi,%rax
>     0x0000000000154a2b <+347>:	mov    %eax,0x22c(%rdx)
>     0x0000000000154a31 <+353>:	movslq 0x230(%rcx),%rax
>     0x0000000000154a38 <+360>:	mov    %rax,%rsi
>     0x0000000000154a3b <+363>:	sar    $0x3f,%rsi
>     0x0000000000154a3f <+367>:	xor    %rsi,%rax
>     0x0000000000154a42 <+370>:	sub    %rsi,%rax
>     0x0000000000154a45 <+373>:	movslq 0x230(%rdi),%rsi
>     0x0000000000154a4c <+380>:	mov    %rsi,%r8
>     0x0000000000154a4f <+383>:	sar    $0x3f,%r8
>     0x0000000000154a53 <+387>:	xor    %r8,%rsi
>     0x0000000000154a56 <+390>:	sub    %r8,%rsi
>     0x0000000000154a59 <+393>:	add    %rsi,%rax
>     0x0000000000154a5c <+396>:	mov    %eax,0x230(%rdx)
>     0x0000000000154a62 <+402>:	movslq 0x234(%rcx),%rax
>     0x0000000000154a69 <+409>:	mov    %rax,%rcx
>     0x0000000000154a6c <+412>:	sar    $0x3f,%rcx
>     0x0000000000154a70 <+416>:	xor    %rcx,%rax
>     0x0000000000154a73 <+419>:	sub    %rcx,%rax
>     0x0000000000154a76 <+422>:	movslq 0x234(%rdi),%rcx
>     0x0000000000154a7d <+429>:	mov    %rcx,%rsi
>     0x0000000000154a80 <+432>:	sar    $0x3f,%rsi
>     0x0000000000154a84 <+436>:	xor    %rsi,%rcx
>     0x0000000000154a87 <+439>:	sub    %rsi,%rcx
>     0x0000000000154a8a <+442>:	add    %rcx,%rax
>     0x0000000000154a8d <+445>:	mov    %eax,0x234(%rdx)
>     0x0000000000154a93 <+451>:	retq
>     0x0000000000154a94 <+452>:	nopl   0x0(%rax)
>     0x0000000000154a98 <+456>:	mov    %ecx,%eax
>     0x0000000000154a9a <+458>:	mov    %r8d,%r8d
>     0x0000000000154a9d <+461>:	mov    %edx,%edx
>     0x0000000000154a9f <+463>:	lea    0x22(%rax),%rcx
>     0x0000000000154aa3 <+467>:	lea    0x22(%rdx),%r9
>     0x0000000000154aa7 <+471>:	shl    $0x4,%rax
>     0x0000000000154aab <+475>:	lea    (%rdi,%rax,1),%rax
>     0x0000000000154aaf <+479>:	shl    $0x4,%rdx
>     0x0000000000154ab3 <+483>:	shl    $0x4,%rcx
>     0x0000000000154ab7 <+487>:	shl    $0x4,%r9
>     0x0000000000154abb <+491>:	add    %rdi,%rdx
>     0x0000000000154abe <+494>:	movsbq 0x8(%rdi,%rcx,1),%rsi
>     0x0000000000154ac4 <+500>:	mov    %rsi,%rcx
>     0x0000000000154ac7 <+503>:	sar    $0x3f,%rcx
>     0x0000000000154acb <+507>:	xor    %rcx,%rsi
>     0x0000000000154ace <+510>:	sub    %rcx,%rsi
>     0x0000000000154ad1 <+513>:	lea    0x22(%r8),%rcx
>     0x0000000000154ad5 <+517>:	shl    $0x4,%r8
>     0x0000000000154ad9 <+521>:	shl    $0x4,%rcx
>     0x0000000000154add <+525>:	movsbq 0x8(%rdi,%rcx,1),%rcx
>     0x0000000000154ae3 <+531>:	mov    %rcx,%r10
>     0x0000000000154ae6 <+534>:	sar    $0x3f,%r10
>     0x0000000000154aea <+538>:	xor    %r10,%rcx
>     0x0000000000154aed <+541>:	sub    %r10,%rcx
>     0x0000000000154af0 <+544>:	add    %rcx,%rsi
>     0x0000000000154af3 <+547>:	mov    %sil,0x8(%rdi,%r9,1)
>     0x0000000000154af8 <+552>:	movsbq 0x229(%rax),%rcx
>     0x0000000000154b00 <+560>:	add    %r8,%rdi
>     0x0000000000154b03 <+563>:	mov    %rcx,%rsi
>     0x0000000000154b06 <+566>:	sar    $0x3f,%rsi
>     0x0000000000154b0a <+570>:	xor    %rsi,%rcx
>     0x0000000000154b0d <+573>:	sub    %rsi,%rcx
>     0x0000000000154b10 <+576>:	movsbq 0x229(%rdi),%rsi
>     0x0000000000154b18 <+584>:	mov    %rsi,%r8
>     0x0000000000154b1b <+587>:	sar    $0x3f,%r8
>     0x0000000000154b1f <+591>:	xor    %r8,%rsi
>     0x0000000000154b22 <+594>:	sub    %r8,%rsi
>     0x0000000000154b25 <+597>:	add    %rsi,%rcx
>     0x0000000000154b28 <+600>:	mov    %cl,0x229(%rdx)
>     0x0000000000154b2e <+606>:	movsbq 0x22a(%rax),%rcx
>     0x0000000000154b36 <+614>:	mov    %rcx,%rsi
>     0x0000000000154b39 <+617>:	sar    $0x3f,%rsi
>     0x0000000000154b3d <+621>:	xor    %rsi,%rcx
>     0x0000000000154b40 <+624>:	sub    %rsi,%rcx
>     0x0000000000154b43 <+627>:	movsbq 0x22a(%rdi),%rsi
>     0x0000000000154b4b <+635>:	mov    %rsi,%r8
>     0x0000000000154b4e <+638>:	sar    $0x3f,%r8
>     0x0000000000154b52 <+642>:	xor    %r8,%rsi
>     0x0000000000154b55 <+645>:	sub    %r8,%rsi
>     0x0000000000154b58 <+648>:	add    %rsi,%rcx
>     0x0000000000154b5b <+651>:	mov    %cl,0x22a(%rdx)
>     0x0000000000154b61 <+657>:	movsbq 0x22b(%rax),%rcx
>     0x0000000000154b69 <+665>:	mov    %rcx,%rsi
>     0x0000000000154b6c <+668>:	sar    $0x3f,%rsi
>     0x0000000000154b70 <+672>:	xor    %rsi,%rcx
>     0x0000000000154b73 <+675>:	sub    %rsi,%rcx
>     0x0000000000154b76 <+678>:	movsbq 0x22b(%rdi),%rsi
>     0x0000000000154b7e <+686>:	mov    %rsi,%r8
>     0x0000000000154b81 <+689>:	sar    $0x3f,%r8
>     0x0000000000154b85 <+693>:	xor    %r8,%rsi
>     0x0000000000154b88 <+696>:	sub    %r8,%rsi
>     0x0000000000154b8b <+699>:	add    %rsi,%rcx
>     0x0000000000154b8e <+702>:	mov    %cl,0x22b(%rdx)
>     0x0000000000154b94 <+708>:	movsbq 0x22c(%rax),%rcx
>     0x0000000000154b9c <+716>:	mov    %rcx,%rsi
>     0x0000000000154b9f <+719>:	sar    $0x3f,%rsi
>     0x0000000000154ba3 <+723>:	xor    %rsi,%rcx
>     0x0000000000154ba6 <+726>:	sub    %rsi,%rcx
>     0x0000000000154ba9 <+729>:	movsbq 0x22c(%rdi),%rsi
>     0x0000000000154bb1 <+737>:	mov    %rsi,%r8
>     0x0000000000154bb4 <+740>:	sar    $0x3f,%r8
>     0x0000000000154bb8 <+744>:	xor    %r8,%rsi
>     0x0000000000154bbb <+747>:	sub    %r8,%rsi
>     0x0000000000154bbe <+750>:	add    %rsi,%rcx
>     0x0000000000154bc1 <+753>:	mov    %cl,0x22c(%rdx)
>     0x0000000000154bc7 <+759>:	movsbq 0x22d(%rax),%rcx
>     0x0000000000154bcf <+767>:	mov    %rcx,%rsi
>     0x0000000000154bd2 <+770>:	sar    $0x3f,%rsi
>     0x0000000000154bd6 <+774>:	xor    %rsi,%rcx
>     0x0000000000154bd9 <+777>:	sub    %rsi,%rcx
>     0x0000000000154bdc <+780>:	movsbq 0x22d(%rdi),%rsi
>     0x0000000000154be4 <+788>:	mov    %rsi,%r8
>     0x0000000000154be7 <+791>:	sar    $0x3f,%r8
>     0x0000000000154beb <+795>:	xor    %r8,%rsi
>     0x0000000000154bee <+798>:	sub    %r8,%rsi
>     0x0000000000154bf1 <+801>:	add    %rsi,%rcx
>     0x0000000000154bf4 <+804>:	mov    %cl,0x22d(%rdx)
>     0x0000000000154bfa <+810>:	movsbq 0x22e(%rax),%rcx
>     0x0000000000154c02 <+818>:	mov    %rcx,%rsi
>     0x0000000000154c05 <+821>:	sar    $0x3f,%rsi
>     0x0000000000154c09 <+825>:	xor    %rsi,%rcx
>     0x0000000000154c0c <+828>:	sub    %rsi,%rcx
>     0x0000000000154c0f <+831>:	movsbq 0x22e(%rdi),%rsi
>     0x0000000000154c17 <+839>:	mov    %rsi,%r8
>     0x0000000000154c1a <+842>:	sar    $0x3f,%r8
>     0x0000000000154c1e <+846>:	xor    %r8,%rsi
>     0x0000000000154c21 <+849>:	sub    %r8,%rsi
>     0x0000000000154c24 <+852>:	add    %rsi,%rcx
>     0x0000000000154c27 <+855>:	mov    %cl,0x22e(%rdx)
>     0x0000000000154c2d <+861>:	movsbq 0x22f(%rax),%rcx
>     0x0000000000154c35 <+869>:	mov    %rcx,%rsi
>     0x0000000000154c38 <+872>:	sar    $0x3f,%rsi
>     0x0000000000154c3c <+876>:	xor    %rsi,%rcx
>     0x0000000000154c3f <+879>:	sub    %rsi,%rcx
>     0x0000000000154c42 <+882>:	movsbq 0x22f(%rdi),%rsi
>     0x0000000000154c4a <+890>:	mov    %rsi,%r8
>     0x0000000000154c4d <+893>:	sar    $0x3f,%r8
>     0x0000000000154c51 <+897>:	xor    %r8,%rsi
>     0x0000000000154c54 <+900>:	sub    %r8,%rsi
>     0x0000000000154c57 <+903>:	add    %rsi,%rcx
>     0x0000000000154c5a <+906>:	mov    %cl,0x22f(%rdx)
>     0x0000000000154c60 <+912>:	movsbq 0x230(%rax),%rcx
>     0x0000000000154c68 <+920>:	mov    %rcx,%rsi
>     0x0000000000154c6b <+923>:	sar    $0x3f,%rsi
>     0x0000000000154c6f <+927>:	xor    %rsi,%rcx
>     0x0000000000154c72 <+930>:	sub    %rsi,%rcx
>     0x0000000000154c75 <+933>:	movsbq 0x230(%rdi),%rsi
>     0x0000000000154c7d <+941>:	mov    %rsi,%r8
>     0x0000000000154c80 <+944>:	sar    $0x3f,%r8
>     0x0000000000154c84 <+948>:	xor    %r8,%rsi
>     0x0000000000154c87 <+951>:	sub    %r8,%rsi
>     0x0000000000154c8a <+954>:	add    %rsi,%rcx
>     0x0000000000154c8d <+957>:	mov    %cl,0x230(%rdx)
>     0x0000000000154c93 <+963>:	movsbq 0x231(%rax),%rcx
>     0x0000000000154c9b <+971>:	mov    %rcx,%rsi
>     0x0000000000154c9e <+974>:	sar    $0x3f,%rsi
>     0x0000000000154ca2 <+978>:	xor    %rsi,%rcx
>     0x0000000000154ca5 <+981>:	sub    %rsi,%rcx
>     0x0000000000154ca8 <+984>:	movsbq 0x231(%rdi),%rsi
>     0x0000000000154cb0 <+992>:	mov    %rsi,%r8
>     0x0000000000154cb3 <+995>:	sar    $0x3f,%r8
>     0x0000000000154cb7 <+999>:	xor    %r8,%rsi
>     0x0000000000154cba <+1002>:	sub    %r8,%rsi
>     0x0000000000154cbd <+1005>:	add    %rsi,%rcx
>     0x0000000000154cc0 <+1008>:	mov    %cl,0x231(%rdx)
>     0x0000000000154cc6 <+1014>:	movsbq 0x232(%rax),%rcx
>     0x0000000000154cce <+1022>:	mov    %rcx,%rsi
>     0x0000000000154cd1 <+1025>:	sar    $0x3f,%rsi
>     0x0000000000154cd5 <+1029>:	xor    %rsi,%rcx
>     0x0000000000154cd8 <+1032>:	sub    %rsi,%rcx
>     0x0000000000154cdb <+1035>:	movsbq 0x232(%rdi),%rsi
>     0x0000000000154ce3 <+1043>:	mov    %rsi,%r8
>     0x0000000000154ce6 <+1046>:	sar    $0x3f,%r8
>     0x0000000000154cea <+1050>:	xor    %r8,%rsi
>     0x0000000000154ced <+1053>:	sub    %r8,%rsi
>     0x0000000000154cf0 <+1056>:	add    %rsi,%rcx
>     0x0000000000154cf3 <+1059>:	mov    %cl,0x232(%rdx)
>     0x0000000000154cf9 <+1065>:	movsbq 0x233(%rdi),%rcx
>     0x0000000000154d01 <+1073>:	mov    %rcx,%rsi
>     0x0000000000154d04 <+1076>:	sar    $0x3f,%rsi
>     0x0000000000154d08 <+1080>:	xor    %rsi,%rcx
>     0x0000000000154d0b <+1083>:	sub    %rsi,%rcx
>     0x0000000000154d0e <+1086>:	movsbq 0x233(%rax),%rsi
>     0x0000000000154d16 <+1094>:	mov    %rsi,%r8
>     0x0000000000154d19 <+1097>:	sar    $0x3f,%r8
>     0x0000000000154d1d <+1101>:	xor    %r8,%rsi
>     0x0000000000154d20 <+1104>:	sub    %r8,%rsi
>     0x0000000000154d23 <+1107>:	add    %rsi,%rcx
>     0x0000000000154d26 <+1110>:	mov    %cl,0x233(%rdx)
>     0x0000000000154d2c <+1116>:	movsbq 0x234(%rdi),%rcx
>     0x0000000000154d34 <+1124>:	mov    %rcx,%rsi
>     0x0000000000154d37 <+1127>:	sar    $0x3f,%rsi
>     0x0000000000154d3b <+1131>:	xor    %rsi,%rcx
>     0x0000000000154d3e <+1134>:	sub    %rsi,%rcx
>     0x0000000000154d41 <+1137>:	movsbq 0x234(%rax),%rsi
>     0x0000000000154d49 <+1145>:	mov    %rsi,%r8
>     0x0000000000154d4c <+1148>:	sar    $0x3f,%r8
>     0x0000000000154d50 <+1152>:	xor    %r8,%rsi
>     0x0000000000154d53 <+1155>:	sub    %r8,%rsi
>     0x0000000000154d56 <+1158>:	add    %rsi,%rcx
>     0x0000000000154d59 <+1161>:	mov    %cl,0x234(%rdx)
>     0x0000000000154d5f <+1167>:	movsbq 0x235(%rax),%rcx
>     0x0000000000154d67 <+1175>:	mov    %rcx,%rsi
>     0x0000000000154d6a <+1178>:	sar    $0x3f,%rsi
>     0x0000000000154d6e <+1182>:	xor    %rsi,%rcx
>     0x0000000000154d71 <+1185>:	sub    %rsi,%rcx
>     0x0000000000154d74 <+1188>:	movsbq 0x235(%rdi),%rsi
>     0x0000000000154d7c <+1196>:	mov    %rsi,%r8
>     0x0000000000154d7f <+1199>:	sar    $0x3f,%r8
>     0x0000000000154d83 <+1203>:	xor    %r8,%rsi
>     0x0000000000154d86 <+1206>:	sub    %r8,%rsi
>     0x0000000000154d89 <+1209>:	add    %rsi,%rcx
>     0x0000000000154d8c <+1212>:	mov    %cl,0x235(%rdx)
>     0x0000000000154d92 <+1218>:	movsbq 0x236(%rdi),%rcx
>     0x0000000000154d9a <+1226>:	mov    %rcx,%rsi
>     0x0000000000154d9d <+1229>:	sar    $0x3f,%rsi
>     0x0000000000154da1 <+1233>:	xor    %rsi,%rcx
>     0x0000000000154da4 <+1236>:	sub    %rsi,%rcx
>     0x0000000000154da7 <+1239>:	movsbq 0x236(%rax),%rsi
>     0x0000000000154daf <+1247>:	mov    %rsi,%r8
>     0x0000000000154db2 <+1250>:	sar    $0x3f,%r8
>     0x0000000000154db6 <+1254>:	xor    %r8,%rsi
>     0x0000000000154db9 <+1257>:	sub    %r8,%rsi
>     0x0000000000154dbc <+1260>:	add    %rsi,%rcx
>     0x0000000000154dbf <+1263>:	mov    %cl,0x236(%rdx)
>     0x0000000000154dc5 <+1269>:	movsbq 0x237(%rax),%rax
>     0x0000000000154dcd <+1277>:	mov    %rax,%rcx
>     0x0000000000154dd0 <+1280>:	sar    $0x3f,%rcx
>     0x0000000000154dd4 <+1284>:	xor    %rcx,%rax
>     0x0000000000154dd7 <+1287>:	sub    %rcx,%rax
>     0x0000000000154dda <+1290>:	movsbq 0x237(%rdi),%rcx
>     0x0000000000154de2 <+1298>:	mov    %rcx,%rsi
>     0x0000000000154de5 <+1301>:	sar    $0x3f,%rsi
>     0x0000000000154de9 <+1305>:	xor    %rsi,%rcx
>     0x0000000000154dec <+1308>:	sub    %rsi,%rcx
>     0x0000000000154def <+1311>:	add    %rcx,%rax
>     0x0000000000154df2 <+1314>:	mov    %al,0x237(%rdx)
>     0x0000000000154df8 <+1320>:	retq
>     0x0000000000154df9 <+1321>:	nopl   0x0(%rax)
>     0x0000000000154e00 <+1328>:	mov    %ecx,%eax
>     0x0000000000154e02 <+1330>:	mov    %r8d,%r8d
>     0x0000000000154e05 <+1333>:	mov    %edx,%edx
>     0x0000000000154e07 <+1335>:	lea    0x22(%rax),%rcx
>     0x0000000000154e0b <+1339>:	lea    0x22(%rdx),%r9
>     0x0000000000154e0f <+1343>:	shl    $0x4,%rax
>     0x0000000000154e13 <+1347>:	lea    (%rdi,%rax,1),%rax
>     0x0000000000154e17 <+1351>:	shl    $0x4,%rdx
>     0x0000000000154e1b <+1355>:	shl    $0x4,%rcx
>     0x0000000000154e1f <+1359>:	shl    $0x4,%r9
>     0x0000000000154e23 <+1363>:	add    %rdi,%rdx
>     0x0000000000154e26 <+1366>:	movswq 0x8(%rdi,%rcx,1),%rsi
>     0x0000000000154e2c <+1372>:	mov    %rsi,%rcx
>     0x0000000000154e2f <+1375>:	sar    $0x3f,%rcx
>     0x0000000000154e33 <+1379>:	xor    %rcx,%rsi
>     0x0000000000154e36 <+1382>:	sub    %rcx,%rsi
>     0x0000000000154e39 <+1385>:	lea    0x22(%r8),%rcx
>     0x0000000000154e3d <+1389>:	shl    $0x4,%r8
>     0x0000000000154e41 <+1393>:	shl    $0x4,%rcx
>     0x0000000000154e45 <+1397>:	movswq 0x8(%rdi,%rcx,1),%rcx
>     0x0000000000154e4b <+1403>:	mov    %rcx,%r10
>     0x0000000000154e4e <+1406>:	sar    $0x3f,%r10
>     0x0000000000154e52 <+1410>:	xor    %r10,%rcx
>     0x0000000000154e55 <+1413>:	sub    %r10,%rcx
>     0x0000000000154e58 <+1416>:	add    %rcx,%rsi
>     0x0000000000154e5b <+1419>:	mov    %si,0x8(%rdi,%r9,1)
>     0x0000000000154e61 <+1425>:	movswq 0x22a(%rax),%rcx
>     0x0000000000154e69 <+1433>:	add    %r8,%rdi
>     0x0000000000154e6c <+1436>:	mov    %rcx,%rsi
>     0x0000000000154e6f <+1439>:	sar    $0x3f,%rsi
>     0x0000000000154e73 <+1443>:	xor    %rsi,%rcx
>     0x0000000000154e76 <+1446>:	sub    %rsi,%rcx
>     0x0000000000154e79 <+1449>:	movswq 0x22a(%rdi),%rsi
>     0x0000000000154e81 <+1457>:	mov    %rsi,%r8
>     0x0000000000154e84 <+1460>:	sar    $0x3f,%r8
>     0x0000000000154e88 <+1464>:	xor    %r8,%rsi
>     0x0000000000154e8b <+1467>:	sub    %r8,%rsi
>     0x0000000000154e8e <+1470>:	add    %rsi,%rcx
>     0x0000000000154e91 <+1473>:	mov    %cx,0x22a(%rdx)
>     0x0000000000154e98 <+1480>:	movswq 0x22c(%rax),%rcx
>     0x0000000000154ea0 <+1488>:	mov    %rcx,%rsi
>     0x0000000000154ea3 <+1491>:	sar    $0x3f,%rsi
>     0x0000000000154ea7 <+1495>:	xor    %rsi,%rcx
>     0x0000000000154eaa <+1498>:	sub    %rsi,%rcx
>     0x0000000000154ead <+1501>:	movswq 0x22c(%rdi),%rsi
>     0x0000000000154eb5 <+1509>:	mov    %rsi,%r8
>     0x0000000000154eb8 <+1512>:	sar    $0x3f,%r8
>     0x0000000000154ebc <+1516>:	xor    %r8,%rsi
>     0x0000000000154ebf <+1519>:	sub    %r8,%rsi
>     0x0000000000154ec2 <+1522>:	add    %rsi,%rcx
>     0x0000000000154ec5 <+1525>:	mov    %cx,0x22c(%rdx)
>     0x0000000000154ecc <+1532>:	movswq 0x22e(%rax),%rcx
>     0x0000000000154ed4 <+1540>:	mov    %rcx,%rsi
>     0x0000000000154ed7 <+1543>:	sar    $0x3f,%rsi
>     0x0000000000154edb <+1547>:	xor    %rsi,%rcx
>     0x0000000000154ede <+1550>:	sub    %rsi,%rcx
>     0x0000000000154ee1 <+1553>:	movswq 0x22e(%rdi),%rsi
>     0x0000000000154ee9 <+1561>:	mov    %rsi,%r8
>     0x0000000000154eec <+1564>:	sar    $0x3f,%r8
>     0x0000000000154ef0 <+1568>:	xor    %r8,%rsi
>     0x0000000000154ef3 <+1571>:	sub    %r8,%rsi
>     0x0000000000154ef6 <+1574>:	add    %rsi,%rcx
>     0x0000000000154ef9 <+1577>:	mov    %cx,0x22e(%rdx)
>     0x0000000000154f00 <+1584>:	movswq 0x230(%rax),%rcx
>     0x0000000000154f08 <+1592>:	mov    %rcx,%rsi
>     0x0000000000154f0b <+1595>:	sar    $0x3f,%rsi
>     0x0000000000154f0f <+1599>:	xor    %rsi,%rcx
>     0x0000000000154f12 <+1602>:	sub    %rsi,%rcx
>     0x0000000000154f15 <+1605>:	movswq 0x230(%rdi),%rsi
>     0x0000000000154f1d <+1613>:	mov    %rsi,%r8
>     0x0000000000154f20 <+1616>:	sar    $0x3f,%r8
>     0x0000000000154f24 <+1620>:	xor    %r8,%rsi
>     0x0000000000154f27 <+1623>:	sub    %r8,%rsi
>     0x0000000000154f2a <+1626>:	add    %rsi,%rcx
>     0x0000000000154f2d <+1629>:	mov    %cx,0x230(%rdx)
>     0x0000000000154f34 <+1636>:	movswq 0x232(%rax),%rcx
>     0x0000000000154f3c <+1644>:	mov    %rcx,%rsi
>     0x0000000000154f3f <+1647>:	sar    $0x3f,%rsi
>     0x0000000000154f43 <+1651>:	xor    %rsi,%rcx
>     0x0000000000154f46 <+1654>:	sub    %rsi,%rcx
>     0x0000000000154f49 <+1657>:	movswq 0x232(%rdi),%rsi
>     0x0000000000154f51 <+1665>:	mov    %rsi,%r8
>     0x0000000000154f54 <+1668>:	sar    $0x3f,%r8
>     0x0000000000154f58 <+1672>:	xor    %r8,%rsi
>     0x0000000000154f5b <+1675>:	sub    %r8,%rsi
>     0x0000000000154f5e <+1678>:	add    %rsi,%rcx
>     0x0000000000154f61 <+1681>:	mov    %cx,0x232(%rdx)
>     0x0000000000154f68 <+1688>:	movswq 0x234(%rax),%rcx
>     0x0000000000154f70 <+1696>:	mov    %rcx,%rsi
>     0x0000000000154f73 <+1699>:	sar    $0x3f,%rsi
>     0x0000000000154f77 <+1703>:	xor    %rsi,%rcx
>     0x0000000000154f7a <+1706>:	sub    %rsi,%rcx
>     0x0000000000154f7d <+1709>:	movswq 0x234(%rdi),%rsi
>     0x0000000000154f85 <+1717>:	mov    %rsi,%r8
>     0x0000000000154f88 <+1720>:	sar    $0x3f,%r8
>     0x0000000000154f8c <+1724>:	xor    %r8,%rsi
>     0x0000000000154f8f <+1727>:	sub    %r8,%rsi
>     0x0000000000154f92 <+1730>:	add    %rsi,%rcx
>     0x0000000000154f95 <+1733>:	mov    %cx,0x234(%rdx)
>     0x0000000000154f9c <+1740>:	movswq 0x236(%rax),%rax
>     0x0000000000154fa4 <+1748>:	mov    %rax,%rcx
>     0x0000000000154fa7 <+1751>:	sar    $0x3f,%rcx
>     0x0000000000154fab <+1755>:	xor    %rcx,%rax
>     0x0000000000154fae <+1758>:	sub    %rcx,%rax
>     0x0000000000154fb1 <+1761>:	movswq 0x236(%rdi),%rcx
>     0x0000000000154fb9 <+1769>:	mov    %rcx,%rsi
>     0x0000000000154fbc <+1772>:	sar    $0x3f,%rsi
>     0x0000000000154fc0 <+1776>:	xor    %rsi,%rcx
>     0x0000000000154fc3 <+1779>:	sub    %rsi,%rcx
>     0x0000000000154fc6 <+1782>:	add    %rcx,%rax
>     0x0000000000154fc9 <+1785>:	mov    %ax,0x236(%rdx)
>     0x0000000000154fd0 <+1792>:	retq
>     0x0000000000154fd1 <+1793>:	lea    0x14faa8(%rip),%rcx        # 0x2a4a80 <__PRETTY_FUNCTION__.25843>
>     0x0000000000154fd8 <+1800>:	lea    0x14ef81(%rip),%rsi        # 0x2a3f60
>     0x0000000000154fdf <+1807>:	lea    0x1da975(%rip),%rdi        # 0x32f95b
>     0x0000000000154fe6 <+1814>:	sub    $0x8,%rsp
>     0x0000000000154fea <+1818>:	mov    $0x368,%edx
>     0x0000000000154fef <+1823>:	callq  0x8f170
>
>
>>> Signed-off-by: Mateja Marjanovic <mateja.marjanovic@rt-rk.com>
>>> ---
>>>   target/mips/msa_helper.c | 43 ++++++++++++++++++++++++++++++-------------
>>>   1 file changed, 30 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
>>> index 4c7ec05..1152fda 100644
>>> --- a/target/mips/msa_helper.c
>>> +++ b/target/mips/msa_helper.c
>>> @@ -804,28 +804,45 @@ void helper_msa_ ## func ## _df(CPUMIPSState *env, uint32_t df,         \
>>>       wr_t *pwd = &(env->active_fpu.fpr[wd].wr);                          \
>>>       wr_t *pws = &(env->active_fpu.fpr[ws].wr);                          \
>>>       wr_t *pwt = &(env->active_fpu.fpr[wt].wr);
>>> \
>> If we can ensure alignment for the various vector registers then the
>> compiler always has the option of using host vectors (certainly for int
>> and logic operations).
Very interesting, could you please tell me more about that, so I can
understand better.

Thanks,
Mateja
>>> -    uint32_t i;                                                         \
>>>                                                                           \
>>>       switch (df) {                                                       \
>>>       case DF_BYTE:                                                       \
>>> -        for (i = 0; i < DF_ELEMENTS(DF_BYTE); i++) {                    \
>>> -            pwd->b[i] = msa_ ## func ## _df(df, pws->b[i], pwt->b[i]);  \
>>> -        }                                                               \
>>> +        pwd->b[0]  = msa_ ## func ## _df(df, pws->b[0], pwt->b[0]);     \
>>> +        pwd->b[1]  = msa_ ## func ## _df(df, pws->b[1], pwt->b[1]);     \
>>> +        pwd->b[2]  = msa_ ## func ## _df(df, pws->b[2], pwt->b[2]);     \
>>> +        pwd->b[3]  = msa_ ## func ## _df(df, pws->b[3], pwt->b[3]);     \
>>> +        pwd->b[4]  = msa_ ## func ## _df(df, pws->b[4], pwt->b[4]);     \
>>> +        pwd->b[5]  = msa_ ## func ## _df(df, pws->b[5], pwt->b[5]);     \
>>> +        pwd->b[6]  = msa_ ## func ## _df(df, pws->b[6], pwt->b[6]);     \
>>> +        pwd->b[7]  = msa_ ## func ## _df(df, pws->b[7], pwt->b[7]);     \
>>> +        pwd->b[8]  = msa_ ## func ## _df(df, pws->b[8], pwt->b[8]);     \
>>> +        pwd->b[9]  = msa_ ## func ## _df(df, pws->b[9], pwt->b[9]);     \
>>> +        pwd->b[10] = msa_ ## func ## _df(df, pws->b[10], pwt->b[10]);   \
>>> +        pwd->b[11] = msa_ ## func ## _df(df, pws->b[11], pwt->b[11]);   \
>>> +        pwd->b[12] = msa_ ## func ## _df(df, pws->b[12], pwt->b[12]);   \
>>> +        pwd->b[13] = msa_ ## func ## _df(df, pws->b[13], pwt->b[13]);   \
>>> +        pwd->b[14] = msa_ ## func ## _df(df, pws->b[14], pwt->b[14]);   \
>>> +        pwd->b[15] = msa_ ## func ## _df(df, pws->b[15], pwt->b[15]);   \
>>>           break;                                                          \
>>>       case DF_HALF:                                                       \
>>> -        for (i = 0; i < DF_ELEMENTS(DF_HALF); i++) {                    \
>>> -            pwd->h[i] = msa_ ## func ## _df(df, pws->h[i], pwt->h[i]);  \
>>> -        }                                                               \
>>> +        pwd->h[0] = msa_ ## func ## _df(df, pws->h[0], pwt->h[0]);      \
>>> +        pwd->h[1] = msa_ ## func ## _df(df, pws->h[1], pwt->h[1]);      \
>>> +        pwd->h[2] = msa_ ## func ## _df(df, pws->h[2], pwt->h[2]);      \
>>> +        pwd->h[3] = msa_ ## func ## _df(df, pws->h[3], pwt->h[3]);      \
>>> +        pwd->h[4] = msa_ ## func ## _df(df, pws->h[4], pwt->h[4]);      \
>>> +        pwd->h[5] = msa_ ## func ## _df(df, pws->h[5], pwt->h[5]);      \
>>> +        pwd->h[6] = msa_ ## func ## _df(df, pws->h[6], pwt->h[6]);      \
>>> +        pwd->h[7] = msa_ ## func ## _df(df, pws->h[7], pwt->h[7]);      \
>>>           break;                                                          \
>>>       case DF_WORD:                                                       \
>>> -        for (i = 0; i < DF_ELEMENTS(DF_WORD); i++) {                    \
>>> -            pwd->w[i] = msa_ ## func ## _df(df, pws->w[i], pwt->w[i]);  \
>>> -        }                                                               \
>>> +        pwd->w[0] = msa_ ## func ## _df(df, pws->w[0], pwt->w[0]);      \
>>> +        pwd->w[1] = msa_ ## func ## _df(df, pws->w[1], pwt->w[1]);      \
>>> +        pwd->w[2] = msa_ ## func ## _df(df, pws->w[2], pwt->w[2]);      \
>>> +        pwd->w[3] = msa_ ## func ## _df(df, pws->w[3], pwt->w[3]);      \
>>>           break;                                                          \
>>>       case DF_DOUBLE:                                                     \
>>> -        for (i = 0; i < DF_ELEMENTS(DF_DOUBLE); i++) {                  \
>>> -            pwd->d[i] = msa_ ## func ## _df(df, pws->d[i], pwt->d[i]);  \
>>> -        }                                                               \
>>> +        pwd->d[0] = msa_ ## func ## _df(df, pws->d[0], pwt->d[0]);      \
>>> +        pwd->d[1] = msa_ ## func ## _df(df, pws->d[1], pwt->d[1]);      \
>>>           break;                                                          \
>>>       default:                                                            \
>>>           assert(0);                                                      \
>
> --
> Alex Bennée
>
diff mbox series

Patch

diff --git a/target/mips/msa_helper.c b/target/mips/msa_helper.c
index 4c7ec05..1152fda 100644
--- a/target/mips/msa_helper.c
+++ b/target/mips/msa_helper.c
@@ -804,28 +804,45 @@  void helper_msa_ ## func ## _df(CPUMIPSState *env, uint32_t df,         \
     wr_t *pwd = &(env->active_fpu.fpr[wd].wr);                          \
     wr_t *pws = &(env->active_fpu.fpr[ws].wr);                          \
     wr_t *pwt = &(env->active_fpu.fpr[wt].wr);                          \
-    uint32_t i;                                                         \
                                                                         \
     switch (df) {                                                       \
     case DF_BYTE:                                                       \
-        for (i = 0; i < DF_ELEMENTS(DF_BYTE); i++) {                    \
-            pwd->b[i] = msa_ ## func ## _df(df, pws->b[i], pwt->b[i]);  \
-        }                                                               \
+        pwd->b[0]  = msa_ ## func ## _df(df, pws->b[0], pwt->b[0]);     \
+        pwd->b[1]  = msa_ ## func ## _df(df, pws->b[1], pwt->b[1]);     \
+        pwd->b[2]  = msa_ ## func ## _df(df, pws->b[2], pwt->b[2]);     \
+        pwd->b[3]  = msa_ ## func ## _df(df, pws->b[3], pwt->b[3]);     \
+        pwd->b[4]  = msa_ ## func ## _df(df, pws->b[4], pwt->b[4]);     \
+        pwd->b[5]  = msa_ ## func ## _df(df, pws->b[5], pwt->b[5]);     \
+        pwd->b[6]  = msa_ ## func ## _df(df, pws->b[6], pwt->b[6]);     \
+        pwd->b[7]  = msa_ ## func ## _df(df, pws->b[7], pwt->b[7]);     \
+        pwd->b[8]  = msa_ ## func ## _df(df, pws->b[8], pwt->b[8]);     \
+        pwd->b[9]  = msa_ ## func ## _df(df, pws->b[9], pwt->b[9]);     \
+        pwd->b[10] = msa_ ## func ## _df(df, pws->b[10], pwt->b[10]);   \
+        pwd->b[11] = msa_ ## func ## _df(df, pws->b[11], pwt->b[11]);   \
+        pwd->b[12] = msa_ ## func ## _df(df, pws->b[12], pwt->b[12]);   \
+        pwd->b[13] = msa_ ## func ## _df(df, pws->b[13], pwt->b[13]);   \
+        pwd->b[14] = msa_ ## func ## _df(df, pws->b[14], pwt->b[14]);   \
+        pwd->b[15] = msa_ ## func ## _df(df, pws->b[15], pwt->b[15]);   \
         break;                                                          \
     case DF_HALF:                                                       \
-        for (i = 0; i < DF_ELEMENTS(DF_HALF); i++) {                    \
-            pwd->h[i] = msa_ ## func ## _df(df, pws->h[i], pwt->h[i]);  \
-        }                                                               \
+        pwd->h[0] = msa_ ## func ## _df(df, pws->h[0], pwt->h[0]);      \
+        pwd->h[1] = msa_ ## func ## _df(df, pws->h[1], pwt->h[1]);      \
+        pwd->h[2] = msa_ ## func ## _df(df, pws->h[2], pwt->h[2]);      \
+        pwd->h[3] = msa_ ## func ## _df(df, pws->h[3], pwt->h[3]);      \
+        pwd->h[4] = msa_ ## func ## _df(df, pws->h[4], pwt->h[4]);      \
+        pwd->h[5] = msa_ ## func ## _df(df, pws->h[5], pwt->h[5]);      \
+        pwd->h[6] = msa_ ## func ## _df(df, pws->h[6], pwt->h[6]);      \
+        pwd->h[7] = msa_ ## func ## _df(df, pws->h[7], pwt->h[7]);      \
         break;                                                          \
     case DF_WORD:                                                       \
-        for (i = 0; i < DF_ELEMENTS(DF_WORD); i++) {                    \
-            pwd->w[i] = msa_ ## func ## _df(df, pws->w[i], pwt->w[i]);  \
-        }                                                               \
+        pwd->w[0] = msa_ ## func ## _df(df, pws->w[0], pwt->w[0]);      \
+        pwd->w[1] = msa_ ## func ## _df(df, pws->w[1], pwt->w[1]);      \
+        pwd->w[2] = msa_ ## func ## _df(df, pws->w[2], pwt->w[2]);      \
+        pwd->w[3] = msa_ ## func ## _df(df, pws->w[3], pwt->w[3]);      \
         break;                                                          \
     case DF_DOUBLE:                                                     \
-        for (i = 0; i < DF_ELEMENTS(DF_DOUBLE); i++) {                  \
-            pwd->d[i] = msa_ ## func ## _df(df, pws->d[i], pwt->d[i]);  \
-        }                                                               \
+        pwd->d[0] = msa_ ## func ## _df(df, pws->d[0], pwt->d[0]);      \
+        pwd->d[1] = msa_ ## func ## _df(df, pws->d[1], pwt->d[1]);      \
         break;                                                          \
     default:                                                            \
         assert(0);                                                      \