Message ID: 1475041518-9757-3-git-send-email-raji@linux.vnet.ibm.com (mailing list archive)
State: New, archived
On 09/27/2016 10:45 PM, Rajalakshmi Srinivasaraghavan wrote:
> +#if defined(HOST_WORDS_BIGENDIAN)
> +#define VEXTULX_DO(name, elem)                                   \
> +target_ulong glue(helper_, name)(target_ulong a, ppc_avr_t *b)   \
> +{                                                                \
> +    target_ulong r = 0;                                          \
> +    int i;                                                       \
> +    int index = a & 0xf;                                         \
> +    for (i = 0; i < elem; i++) {                                 \
> +        r = r << 8;                                              \
> +        if (index + i <= 15) {                                   \
> +            r = r | b->u8[index + i];                            \
> +        }                                                        \
> +    }                                                            \
> +    return r;                                                    \
> +}
> +#else
> +#define VEXTULX_DO(name, elem)                                   \
> +target_ulong glue(helper_, name)(target_ulong a, ppc_avr_t *b)   \
> +{                                                                \
> +    target_ulong r = 0;                                          \
> +    int i;                                                       \
> +    int index = 15 - (a & 0xf);                                  \
> +    for (i = 0; i < elem; i++) {                                 \
> +        r = r << 8;                                              \
> +        if (index - i >= 0) {                                    \
> +            r = r | b->u8[index - i];                            \
> +        }                                                        \
> +    }                                                            \
> +    return r;                                                    \
> +}
> +#endif
> +
> +VEXTULX_DO(vextublx, 1)
> +VEXTULX_DO(vextuhlx, 2)
> +VEXTULX_DO(vextuwlx, 4)
> +#undef VEXTULX_DO

Ew.

This should be one 128-bit shift and one AND.

Since the shift amount is a multiple of 8, the 128-bit shift for vextub[lr]x
does not need to cross a double-word boundary, and so can be decomposed into
one 64-bit shift of (count & 64 ? hi : lo).

For vextu[hw][lr]x, you'd need to do the whole left-shift, right-shift, OR thing.

But still, fantastically better than a loop.


r~
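For concreteness, here is a minimal standalone sketch of the shift-based approach Richard describes. It is not QEMU code: the avr128 struct is a hypothetical stand-in for ppc_avr_t, with hi holding the leftmost eight bytes of the vector, and the *_sketch functions are invented names, not the real helpers.

```c
#include <stdint.h>

/* Hypothetical stand-in for ppc_avr_t: hi = bytes 0..7 (leftmost),
 * lo = bytes 8..15, independent of host endianness. */
typedef struct { uint64_t hi, lo; } avr128;

/* Byte extract: an 8-bit element can never straddle the doubleword
 * boundary, so one select plus one 64-bit shift is enough. */
static uint64_t vextublx_sketch(uint64_t a, avr128 b)
{
    int index = a & 0xf;                      /* byte index from the left */
    uint64_t dw = (index & 8) ? b.lo : b.hi;  /* "count & 64" picks the half */
    return (dw >> (56 - 8 * (index & 7))) & 0xff;
}

/* Halfword/word extract: the element may straddle the boundary, so do
 * the full 128-bit left shift as left-shift/right-shift/OR.  Bytes past
 * the end of the vector read as zero, like the loop version. */
static uint64_t vextulx_sketch(uint64_t a, avr128 b, int elem)
{
    int shift = 8 * (a & 0xf);  /* bits to discard on the left */
    uint64_t hi;

    if (shift == 0) {
        hi = b.hi;
    } else if (shift < 64) {
        hi = (b.hi << shift) | (b.lo >> (64 - shift));
    } else {
        hi = b.lo << (shift - 64);
    }
    /* The wanted element now occupies the top 8*elem bits of hi. */
    return hi >> (64 - 8 * elem);
}
```

The byte case collapses to a single 64-bit shift precisely because the 128-bit shift count is a multiple of 8 and less than 128, so it never crosses the doubleword boundary.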
On 09/28/2016 10:24 PM, Richard Henderson wrote:
> On 09/27/2016 10:45 PM, Rajalakshmi Srinivasaraghavan wrote:
>> +#if defined(HOST_WORDS_BIGENDIAN)
>> +#define VEXTULX_DO(name, elem)                                  \
>> [...]
>> +VEXTULX_DO(vextublx, 1)
>> +VEXTULX_DO(vextuhlx, 2)
>> +VEXTULX_DO(vextuwlx, 4)
>> +#undef VEXTULX_DO
> Ew.
>
> This should be one 128-bit shift and one AND.
>
> Since the shift amount is a multiple of 8, the 128-bit shift for vextub[lr]x
> does not need to cross a double-word boundary, and so can be decomposed into
> one 64-bit shift of (count & 64 ? hi : lo).
>
> For vextu[hw][lr]x, you'd need to do the whole left-shift, right-shift, OR thing.
>
> But still, fantastically better than a loop.

Ack. Will send an updated patch.

> r~
```diff
diff --git a/target-ppc/helper.h b/target-ppc/helper.h
index a1c2962..3041199 100644
--- a/target-ppc/helper.h
+++ b/target-ppc/helper.h
@@ -344,6 +344,9 @@ DEF_HELPER_3(vpmsumb, void, avr, avr, avr)
 DEF_HELPER_3(vpmsumh, void, avr, avr, avr)
 DEF_HELPER_3(vpmsumw, void, avr, avr, avr)
 DEF_HELPER_3(vpmsumd, void, avr, avr, avr)
+DEF_HELPER_2(vextublx, tl, tl, avr)
+DEF_HELPER_2(vextuhlx, tl, tl, avr)
+DEF_HELPER_2(vextuwlx, tl, tl, avr)
 DEF_HELPER_2(vsbox, void, avr, avr)
 
 DEF_HELPER_3(vcipher, void, avr, avr, avr)
diff --git a/target-ppc/int_helper.c b/target-ppc/int_helper.c
index 51a9ac5..c24cc07 100644
--- a/target-ppc/int_helper.c
+++ b/target-ppc/int_helper.c
@@ -1705,6 +1705,43 @@ void helper_vlogefp(CPUPPCState *env, ppc_avr_t *r, ppc_avr_t *b)
     }
 }
 
+#if defined(HOST_WORDS_BIGENDIAN)
+#define VEXTULX_DO(name, elem)                                   \
+target_ulong glue(helper_, name)(target_ulong a, ppc_avr_t *b)   \
+{                                                                \
+    target_ulong r = 0;                                          \
+    int i;                                                       \
+    int index = a & 0xf;                                         \
+    for (i = 0; i < elem; i++) {                                 \
+        r = r << 8;                                              \
+        if (index + i <= 15) {                                   \
+            r = r | b->u8[index + i];                            \
+        }                                                        \
+    }                                                            \
+    return r;                                                    \
+}
+#else
+#define VEXTULX_DO(name, elem)                                   \
+target_ulong glue(helper_, name)(target_ulong a, ppc_avr_t *b)   \
+{                                                                \
+    target_ulong r = 0;                                          \
+    int i;                                                       \
+    int index = 15 - (a & 0xf);                                  \
+    for (i = 0; i < elem; i++) {                                 \
+        r = r << 8;                                              \
+        if (index - i >= 0) {                                    \
+            r = r | b->u8[index - i];                            \
+        }                                                        \
+    }                                                            \
+    return r;                                                    \
+}
+#endif
+
+VEXTULX_DO(vextublx, 1)
+VEXTULX_DO(vextuhlx, 2)
+VEXTULX_DO(vextuwlx, 4)
+#undef VEXTULX_DO
+
 /* The specification says that the results are undefined if all of the
  * shift counts are not identical.  We check to make sure that they are
  * to conform to what real hardware appears to do.  */
diff --git a/target-ppc/translate/vmx-impl.inc.c b/target-ppc/translate/vmx-impl.inc.c
index abfde27..815ba96 100644
--- a/target-ppc/translate/vmx-impl.inc.c
+++ b/target-ppc/translate/vmx-impl.inc.c
@@ -342,6 +342,19 @@ static void glue(gen_, name0##_##name1)(DisasContext *ctx)             \
     }                                                                  \
 }
 
+#define GEN_VXFORM_HETRO(name, opc2, opc3)                              \
+static void glue(gen_, name)(DisasContext *ctx)                         \
+{                                                                       \
+    TCGv_ptr rb;                                                        \
+    if (unlikely(!ctx->altivec_enabled)) {                              \
+        gen_exception(ctx, POWERPC_EXCP_VPU);                           \
+        return;                                                         \
+    }                                                                   \
+    rb = gen_avr_ptr(rB(ctx->opcode));                                  \
+    gen_helper_##name(cpu_gpr[rD(ctx->opcode)], cpu_gpr[rA(ctx->opcode)], rb); \
+    tcg_temp_free_ptr(rb);                                              \
+}
+
 GEN_VXFORM(vaddubm, 0, 0);
 GEN_VXFORM_DUAL_EXT(vaddubm, PPC_NONE, PPC2_ALTIVEC_207, 0,       \
                     vmul10cuq, PPC_NONE, PPC2_ISA300, 0x0000F800)
@@ -516,6 +529,12 @@ GEN_VXFORM_ENV(vsubfp, 5, 1);
 GEN_VXFORM_ENV(vmaxfp, 5, 16);
 GEN_VXFORM_ENV(vminfp, 5, 17);
 
+GEN_VXFORM_HETRO(vextublx, 6, 24)
+GEN_VXFORM_HETRO(vextuhlx, 6, 25)
+GEN_VXFORM_HETRO(vextuwlx, 6, 26)
+GEN_VXFORM_DUAL(vmrgow, PPC_NONE, PPC2_ALTIVEC_207,
+                vextuwlx, PPC_NONE, PPC2_ISA300)
+
 #define GEN_VXRFORM1(opname, name, str, opc2, opc3)                     \
 static void glue(gen_, name)(DisasContext *ctx)                         \
 {                                                                       \
diff --git a/target-ppc/translate/vmx-ops.inc.c b/target-ppc/translate/vmx-ops.inc.c
index 5d47b0f..3e0047d 100644
--- a/target-ppc/translate/vmx-ops.inc.c
+++ b/target-ppc/translate/vmx-ops.inc.c
@@ -91,8 +91,10 @@ GEN_VXFORM(vmrghw, 6, 2),
 GEN_VXFORM(vmrglb, 6, 4),
 GEN_VXFORM(vmrglh, 6, 5),
 GEN_VXFORM(vmrglw, 6, 6),
+GEN_VXFORM_300(vextublx, 6, 24),
+GEN_VXFORM_300(vextuhlx, 6, 25),
+GEN_VXFORM_DUAL(vmrgow, vextuwlx, 6, 26, PPC_ALTIVEC, PPC_NONE),
 GEN_VXFORM_207(vmrgew, 6, 30),
-GEN_VXFORM_207(vmrgow, 6, 26),
 GEN_VXFORM(vmuloub, 4, 0),
 GEN_VXFORM(vmulouh, 4, 1),
 GEN_VXFORM_DUAL(vmulouw, vmuluwm, 4, 2, PPC_ALTIVEC, PPC_NONE),
```
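To pin down what the loop-based helpers compute, here is a small self-contained check; it is not part of the patch, and the vextulx_ref name and test vector are invented for illustration. It mirrors the HOST_WORDS_BIGENDIAN branch: the index counts bytes from the left, and bytes falling past the end of the vector read as zero.

```c
#include <assert.h>
#include <stdint.h>

/* Reference model of the big-endian VEXTULX_DO loop. */
static uint64_t vextulx_ref(uint64_t a, const uint8_t *u8, int elem)
{
    uint64_t r = 0;
    int index = a & 0xf;
    int i;

    for (i = 0; i < elem; i++) {
        r = r << 8;
        if (index + i <= 15) {     /* in-range bytes only; others stay 0 */
            r = r | u8[index + i];
        }
    }
    return r;
}

int main(void)
{
    const uint8_t v[16] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
                            0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff };

    assert(vextulx_ref(5, v, 1) == 0x55);         /* vextublx: byte 5 */
    assert(vextulx_ref(5, v, 2) == 0x5566);       /* vextuhlx: bytes 5..6 */
    assert(vextulx_ref(5, v, 4) == 0x55667788);   /* vextuwlx: bytes 5..8 */
    assert(vextulx_ref(14, v, 4) == 0xeeff0000);  /* runs off the end */
    return 0;
}
```

One translation detail worth noting: vextuwlx lands on the same opcode slot (6, 26) as vmrgow, so the patch registers the pair through GEN_VXFORM_DUAL and lets the ISA flags (PPC2_ALTIVEC_207 vs. PPC2_ISA300) select the right decode, while vextublx and vextuhlx take their own slots via GEN_VXFORM_300.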