Message ID | 1475041518-9757-2-git-send-email-raji@linux.vnet.ibm.com (mailing list archive)
---|---
State | New, archived
On 09/27/2016 10:45 PM, Rajalakshmi Srinivasaraghavan wrote:
> +    val = tcg_const_i64(10); \

Rename this "ten" for clarity?

> +    z = tcg_const_i64(0); \
> +    \
> +    if (add_cin) { \
> +        tcg_gen_andi_i64(cin, cpu_avrl[rB(ctx->opcode)], 0xF); \
> +        tcg_gen_movcond_i64(TCG_COND_LTU, cin, cin, val, cin, z); \

What is the purpose of this movcond? The docs specifically say that values
greater than 9 are undefined.

> +    } else { \
> +        tcg_gen_movi_i64(cin, 0); \
> +    } \
> +    \
> +    tcg_gen_mulu2_i64(t0, t1, cpu_avrl[rA(ctx->opcode)], val); \
> +    tcg_gen_add2_i64(cpu_avrl[rD(ctx->opcode)], t2, t0, z, cin, z); \
> +    tcg_gen_add2_i64(t2, t0, t1, z, t2, z); \

These two additions are unused if !add_cin, and the second appears to be
mergeable with the first -- don't use so many z's. I think this simplifies to

  if (add_cin) {
    tcg_gen_mulu2_i64(t0, t1, cpu_avrl[rA(ctx->opcode)], ten);
    tcg_gen_andi_i64(t2, cpu_avrl[rB(ctx->opcode)], 0xF);
    tcg_gen_add2_i64(cpu_avrl[rD(ctx->opcode)], t2, t0, t1, t2, z);
  } else {
    tcg_gen_mulu2_i64(cpu_avrl[rD(ctx->opcode)], t2,
                      cpu_avrl[rA(ctx->opcode)], ten);
  }

> +    tcg_gen_mulu2_i64(t0, t1, cpu_avrh[rA(ctx->opcode)], val); \
> +    tcg_gen_add2_i64(cpu_avrh[rD(ctx->opcode)], t2, t0, z, t2, z); \
> +    \
> +    if (ret_carry) { \
> +        tcg_gen_add2_i64(cpu_avrl[rD(ctx->opcode)], t0, t1, z, t2, z); \
> +        tcg_gen_movi_i64(cpu_avrh[rD(ctx->opcode)], 0); \

Likewise simplifies to

  if (ret_carry) {
    tcg_gen_mulu2_i64(t0, t1, cpu_avrh[rA(ctx->opcode)], ten);
    tcg_gen_add2_i64(t0, cpu_avrl[rD(ctx->opcode)], t0, t1, t2, z);
    tcg_gen_movi_i64(cpu_avrh[rD(ctx->opcode)], 0);
  } else {
    tcg_gen_mul_i64(t0, cpu_avrh[rA(ctx->opcode)], ten);
    tcg_gen_add_i64(cpu_avrh[rD(ctx->opcode)], t0, t2);
  }

r~
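As a side note to the simplification above: here is a minimal C model of what the generated sequence computes, a 128-bit multiply-by-10 done in 64-bit halves with an optional decimal carry-in and a carry-out digit. It is not from the patch or the thread; it assumes a GCC/Clang-style unsigned __int128, and the names mul10_128, rhi, rlo and cout are illustrative only.

    #include <stdint.h>

    /* Reference model, not QEMU code: multiply the 128-bit value hi:lo by 10,
     * add a carry-in digit (0..9), and produce the 128-bit result plus the
     * carry-out digit.  This mirrors the mulu2/add2 structure discussed above. */
    static void mul10_128(uint64_t hi, uint64_t lo, uint64_t cin,
                          uint64_t *rhi, uint64_t *rlo, uint64_t *cout)
    {
        unsigned __int128 p_lo = (unsigned __int128)lo * 10 + cin;
        unsigned __int128 p_hi = (unsigned __int128)hi * 10 + (uint64_t)(p_lo >> 64);

        *rlo  = (uint64_t)p_lo;          /* low doubleword of the product  */
        *rhi  = (uint64_t)p_hi;          /* high doubleword of the product */
        *cout = (uint64_t)(p_hi >> 64);  /* carry-out digit                */
    }

vmul10uq and vmul10cuq correspond to cin = 0; the extended forms take cin from the low nibble of vB, where the ISA leaves digit values above 9 undefined, which is what the movcond question above is about.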
On Wed, Sep 28, 2016 at 11:15:13AM +0530, Rajalakshmi Srinivasaraghavan wrote:
> From: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
>
> vmul10uq : Vector Multiply-by-10 Unsigned Quadword VX-form
> vmul10euq : Vector Multiply-by-10 Extended Unsigned Quadword VX-form
> vmul10cuq : Vector Multiply-by-10 & write Carry Unsigned Quadword VX-form
> vmul10ecuq: Vector Multiply-by-10 Extended & write Carry Unsigned Quadword VX-form
>
> Signed-off-by: Vasant Hegde <hegdevasant@linux.vnet.ibm.com>
> [ Add GEN_VXFORM_DUAL_EXT with invalid bit mask ]
> Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
> Signed-off-by: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
> ---
>  target-ppc/translate/vmx-impl.inc.c | 74 +++++++++++++++++++++++++++++++++++
>  target-ppc/translate/vmx-ops.inc.c  |  8 ++--
>  2 files changed, 78 insertions(+), 4 deletions(-)
>
> diff --git a/target-ppc/translate/vmx-impl.inc.c b/target-ppc/translate/vmx-impl.inc.c
> index 3ce374d..abfde27 100644
> --- a/target-ppc/translate/vmx-impl.inc.c
> +++ b/target-ppc/translate/vmx-impl.inc.c
> @@ -182,6 +182,54 @@ static void gen_mtvscr(DisasContext *ctx)
>      tcg_temp_free_ptr(p);
>  }
>
> +#define GEN_VX_VMUL10(name, add_cin, ret_carry) \
> +static void glue(gen_, name)(DisasContext *ctx) \
> +{ \
> +    TCGv_i64 t0 = tcg_temp_new_i64(); \
> +    TCGv_i64 t1 = tcg_temp_new_i64(); \
> +    TCGv_i64 t2 = tcg_temp_new_i64(); \
> +    TCGv_i64 cin = tcg_temp_new_i64(); \
> +    TCGv_i64 val, z; \
> +    \
> +    if (unlikely(!ctx->altivec_enabled)) { \
> +        gen_exception(ctx, POWERPC_EXCP_VPU); \
> +        return; \
> +    } \
> +    \
> +    val = tcg_const_i64(10); \
> +    z = tcg_const_i64(0); \
> +    \
> +    if (add_cin) { \
> +        tcg_gen_andi_i64(cin, cpu_avrl[rB(ctx->opcode)], 0xF); \
> +        tcg_gen_movcond_i64(TCG_COND_LTU, cin, cin, val, cin, z); \
> +    } else { \
> +        tcg_gen_movi_i64(cin, 0); \
> +    } \
> +    \
> +    tcg_gen_mulu2_i64(t0, t1, cpu_avrl[rA(ctx->opcode)], val); \

Do you really want to be using an actual mul op, rather than (in << 3)
+ (in << 1)? Obviously working out all the carries correctly will be a
bit fiddly.

> +    tcg_gen_add2_i64(cpu_avrl[rD(ctx->opcode)], t2, t0, z, cin, z); \
> +    tcg_gen_add2_i64(t2, t0, t1, z, t2, z); \
> +    tcg_gen_mulu2_i64(t0, t1, cpu_avrh[rA(ctx->opcode)], val); \
> +    tcg_gen_add2_i64(cpu_avrh[rD(ctx->opcode)], t2, t0, z, t2, z); \
> +    \
> +    if (ret_carry) { \
> +        tcg_gen_add2_i64(cpu_avrl[rD(ctx->opcode)], t0, t1, z, t2, z); \
> +        tcg_gen_movi_i64(cpu_avrh[rD(ctx->opcode)], 0); \
> +    } \
> +    \
> +    tcg_temp_free_i64(t0); \
> +    tcg_temp_free_i64(t1); \
> +    tcg_temp_free_i64(t2); \
> +    tcg_temp_free_i64(val); \
> +    tcg_temp_free_i64(cin); \
> +    tcg_temp_free_i64(z); \
> +} \
> +
> +GEN_VX_VMUL10(vmul10uq, 0, 0);
> +GEN_VX_VMUL10(vmul10euq, 1, 0);
> +GEN_VX_VMUL10(vmul10cuq, 0, 1);
> +GEN_VX_VMUL10(vmul10ecuq, 1, 1);
> +
>  /* Logical operations */
>  #define GEN_VX_LOGICAL(name, tcg_op, opc2, opc3) \
>  static void glue(gen_, name)(DisasContext *ctx) \
> @@ -276,8 +324,30 @@ static void glue(gen_, name0##_##name1)(DisasContext *ctx) \
>      } \
>  }
>
> +/* Adds support to provide invalid mask */
> +#define GEN_VXFORM_DUAL_EXT(name0, flg0, flg2_0, inval0, \
> +                            name1, flg1, flg2_1, inval1) \
> +static void glue(gen_, name0##_##name1)(DisasContext *ctx) \
> +{ \
> +    if ((Rc(ctx->opcode) == 0) && \
> +        ((ctx->insns_flags & flg0) || (ctx->insns_flags2 & flg2_0)) && \
> +        !(ctx->opcode & inval0)) { \
> +        gen_##name0(ctx); \
> +    } else if ((Rc(ctx->opcode) == 1) && \
> +        ((ctx->insns_flags & flg1) || (ctx->insns_flags2 & flg2_1)) && \
> +        !(ctx->opcode & inval1)) { \
> +        gen_##name1(ctx); \
> +    } else { \
> +        gen_inval_exception(ctx, POWERPC_EXCP_INVAL_INVAL); \
> +    } \
> +}
> +
>  GEN_VXFORM(vaddubm, 0, 0);
> +GEN_VXFORM_DUAL_EXT(vaddubm, PPC_NONE, PPC2_ALTIVEC_207, 0, \
> +                    vmul10cuq, PPC_NONE, PPC2_ISA300, 0x0000F800)
>  GEN_VXFORM(vadduhm, 0, 1);
> +GEN_VXFORM_DUAL(vadduhm, PPC_NONE, PPC2_ALTIVEC_207, \
> +                vmul10ecuq, PPC_NONE, PPC2_ISA300)
>  GEN_VXFORM(vadduwm, 0, 2);
>  GEN_VXFORM(vaddudm, 0, 3);
>  GEN_VXFORM(vsububm, 0, 16);
> @@ -390,7 +460,11 @@ GEN_VXFORM(vsro, 6, 17);
>  GEN_VXFORM(vaddcuw, 0, 6);
>  GEN_VXFORM(vsubcuw, 0, 22);
>  GEN_VXFORM_ENV(vaddubs, 0, 8);
> +GEN_VXFORM_DUAL_EXT(vaddubs, PPC_NONE, PPC2_ALTIVEC_207, 0, \
> +                    vmul10uq, PPC_NONE, PPC2_ISA300, 0x0000F800)
>  GEN_VXFORM_ENV(vadduhs, 0, 9);
> +GEN_VXFORM_DUAL(vadduhs, PPC_NONE, PPC2_ALTIVEC_207, \
> +                vmul10euq, PPC_NONE, PPC2_ISA300)
>  GEN_VXFORM_ENV(vadduws, 0, 10);
>  GEN_VXFORM_ENV(vaddsbs, 0, 12);
>  GEN_VXFORM_ENV(vaddshs, 0, 13);
> diff --git a/target-ppc/translate/vmx-ops.inc.c b/target-ppc/translate/vmx-ops.inc.c
> index a7022a0..5d47b0f 100644
> --- a/target-ppc/translate/vmx-ops.inc.c
> +++ b/target-ppc/translate/vmx-ops.inc.c
> @@ -55,8 +55,8 @@ GEN_HANDLER_E(name0##_##name1, 0x4, opc2, opc3, 0x00000000, type0, type1)
>  GEN_HANDLER_E(name0##_##name1, 0x4, opc2, opc3, 0x00000000, tp0, tp1), \
>  GEN_HANDLER_E(name0##_##name1, 0x4, opc2, (opc3 | 0x10), 0x00000000, tp0, tp1),
>
> -GEN_VXFORM(vaddubm, 0, 0),
> -GEN_VXFORM(vadduhm, 0, 1),
> +GEN_VXFORM_DUAL(vaddubm, vmul10cuq, 0, 0, PPC_ALTIVEC, PPC_NONE),
> +GEN_VXFORM_DUAL(vadduhm, vmul10ecuq, 0, 1, PPC_ALTIVEC, PPC_NONE),
>  GEN_VXFORM(vadduwm, 0, 2),
>  GEN_VXFORM_207(vaddudm, 0, 3),
>  GEN_VXFORM_DUAL(vsububm, bcdadd, 0, 16, PPC_ALTIVEC, PPC_NONE),
> @@ -123,8 +123,8 @@ GEN_VXFORM(vslo, 6, 16),
>  GEN_VXFORM(vsro, 6, 17),
>  GEN_VXFORM(vaddcuw, 0, 6),
>  GEN_VXFORM(vsubcuw, 0, 22),
> -GEN_VXFORM(vaddubs, 0, 8),
> -GEN_VXFORM(vadduhs, 0, 9),
> +GEN_VXFORM_DUAL(vaddubs, vmul10uq, 0, 8, PPC_ALTIVEC, PPC_NONE),
> +GEN_VXFORM_DUAL(vadduhs, vmul10euq, 0, 9, PPC_ALTIVEC, PPC_NONE),
>  GEN_VXFORM(vadduws, 0, 10),
>  GEN_VXFORM(vaddsbs, 0, 12),
>  GEN_VXFORM(vaddshs, 0, 13),
On 09/28/2016 07:07 PM, David Gibson wrote:
>> +    tcg_gen_mulu2_i64(t0, t1, cpu_avrl[rA(ctx->opcode)], val); \
>
> Do you really want to be using an actual mul op, rather than (in << 3)
> + (in << 1)? Obviously working out all the carries correctly will be a
> bit fiddly.

I think it's fine. Modern hardware will do the double-word multiply in 3-5
cycles, which is probably equal to what we could do by hand with
shifts.

r~
On Wed, Sep 28, 2016 at 09:00:51PM -0700, Richard Henderson wrote:
> On 09/28/2016 07:07 PM, David Gibson wrote:
> > > +    tcg_gen_mulu2_i64(t0, t1, cpu_avrl[rA(ctx->opcode)], val); \
> >
> > Do you really want to be using an actual mul op, rather than (in << 3)
> > + (in << 1)? Obviously working out all the carries correctly will be a
> > bit fiddly.
>
> I think it's fine. Modern hardware will do the double-word multiply in 3-5
> cycles, which is probably equal to what we could do by hand with
> shifts.

Fair enough. And it will make for less dicking around with the carry in
and carry out.
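For comparison, a minimal C sketch of the shift-and-add route discussed here, again split into 64-bit halves. It is not from the thread, and the names are illustrative; each shifted term needs the bits that cross the doubleword boundary, and the final add needs its own carry, which is the fiddly part a single mulu2 avoids.

    #include <stdint.h>

    /* 10 * x computed as (x << 3) + (x << 1) on a 128-bit value held in two
     * 64-bit halves; any carry beyond 128 bits is discarded, as vmul10uq does. */
    static void mul10_by_shifts(uint64_t hi, uint64_t lo,
                                uint64_t *rhi, uint64_t *rlo)
    {
        uint64_t lo8 = lo << 3, hi8 = (hi << 3) | (lo >> 61);  /* x * 8 */
        uint64_t lo2 = lo << 1, hi2 = (hi << 1) | (lo >> 63);  /* x * 2 */

        *rlo = lo8 + lo2;
        *rhi = hi8 + hi2 + (*rlo < lo8);  /* carry out of the low halves */
    }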
On 09/28/2016 10:12 PM, Richard Henderson wrote:
> On 09/27/2016 10:45 PM, Rajalakshmi Srinivasaraghavan wrote:
>> +    val = tcg_const_i64(10); \
> Rename this "ten" for clarity?
>
>> +    z = tcg_const_i64(0); \
>> +    \
>> +    if (add_cin) { \
>> +        tcg_gen_andi_i64(cin, cpu_avrl[rB(ctx->opcode)], 0xF); \
>> +        tcg_gen_movcond_i64(TCG_COND_LTU, cin, cin, val, cin, z); \
> What is the purpose of this movcond? The docs specifically say that values
> greater than 9 are undefined.
>
>> +    } else { \
>> +        tcg_gen_movi_i64(cin, 0); \
>> +    } \
>> +    \
>> +    tcg_gen_mulu2_i64(t0, t1, cpu_avrl[rA(ctx->opcode)], val); \
>> +    tcg_gen_add2_i64(cpu_avrl[rD(ctx->opcode)], t2, t0, z, cin, z); \
>> +    tcg_gen_add2_i64(t2, t0, t1, z, t2, z); \
> These two additions are unused if !add_cin, and the second appears to be
> mergeable with the first -- don't use so many z's. I think this simplifies to
>
>   if (add_cin) {
>     tcg_gen_mulu2_i64(t0, t1, cpu_avrl[rA(ctx->opcode)], ten);
>     tcg_gen_andi_i64(t2, cpu_avrl[rB(ctx->opcode)], 0xF);
>     tcg_gen_add2_i64(cpu_avrl[rD(ctx->opcode)], t2, t0, t1, t2, z);
>   } else {
>     tcg_gen_mulu2_i64(cpu_avrl[rD(ctx->opcode)], t2,
>                       cpu_avrl[rA(ctx->opcode)], ten);
>   }
>
>> +    tcg_gen_mulu2_i64(t0, t1, cpu_avrh[rA(ctx->opcode)], val); \
>> +    tcg_gen_add2_i64(cpu_avrh[rD(ctx->opcode)], t2, t0, z, t2, z); \
>> +    \
>> +    if (ret_carry) { \
>> +        tcg_gen_add2_i64(cpu_avrl[rD(ctx->opcode)], t0, t1, z, t2, z); \
>> +        tcg_gen_movi_i64(cpu_avrh[rD(ctx->opcode)], 0); \
> Likewise simplifies to
>
>   if (ret_carry) {
>     tcg_gen_mulu2_i64(t0, t1, cpu_avrh[rA(ctx->opcode)], ten);
>     tcg_gen_add2_i64(t0, cpu_avrl[rD(ctx->opcode)], t0, t1, t2, z);
>     tcg_gen_movi_i64(cpu_avrh[rD(ctx->opcode)], 0);
>   } else {
>     tcg_gen_mul_i64(t0, cpu_avrh[rA(ctx->opcode)], ten);
>     tcg_gen_add_i64(cpu_avrh[rD(ctx->opcode)], t0, t2);
>   }
>

Will check and send updated patch.

> r~
>
>
diff --git a/target-ppc/translate/vmx-impl.inc.c b/target-ppc/translate/vmx-impl.inc.c
index 3ce374d..abfde27 100644
--- a/target-ppc/translate/vmx-impl.inc.c
+++ b/target-ppc/translate/vmx-impl.inc.c
@@ -182,6 +182,54 @@ static void gen_mtvscr(DisasContext *ctx)
     tcg_temp_free_ptr(p);
 }
 
+#define GEN_VX_VMUL10(name, add_cin, ret_carry) \
+static void glue(gen_, name)(DisasContext *ctx) \
+{ \
+    TCGv_i64 t0 = tcg_temp_new_i64(); \
+    TCGv_i64 t1 = tcg_temp_new_i64(); \
+    TCGv_i64 t2 = tcg_temp_new_i64(); \
+    TCGv_i64 cin = tcg_temp_new_i64(); \
+    TCGv_i64 val, z; \
+    \
+    if (unlikely(!ctx->altivec_enabled)) { \
+        gen_exception(ctx, POWERPC_EXCP_VPU); \
+        return; \
+    } \
+    \
+    val = tcg_const_i64(10); \
+    z = tcg_const_i64(0); \
+    \
+    if (add_cin) { \
+        tcg_gen_andi_i64(cin, cpu_avrl[rB(ctx->opcode)], 0xF); \
+        tcg_gen_movcond_i64(TCG_COND_LTU, cin, cin, val, cin, z); \
+    } else { \
+        tcg_gen_movi_i64(cin, 0); \
+    } \
+    \
+    tcg_gen_mulu2_i64(t0, t1, cpu_avrl[rA(ctx->opcode)], val); \
+    tcg_gen_add2_i64(cpu_avrl[rD(ctx->opcode)], t2, t0, z, cin, z); \
+    tcg_gen_add2_i64(t2, t0, t1, z, t2, z); \
+    tcg_gen_mulu2_i64(t0, t1, cpu_avrh[rA(ctx->opcode)], val); \
+    tcg_gen_add2_i64(cpu_avrh[rD(ctx->opcode)], t2, t0, z, t2, z); \
+    \
+    if (ret_carry) { \
+        tcg_gen_add2_i64(cpu_avrl[rD(ctx->opcode)], t0, t1, z, t2, z); \
+        tcg_gen_movi_i64(cpu_avrh[rD(ctx->opcode)], 0); \
+    } \
+    \
+    tcg_temp_free_i64(t0); \
+    tcg_temp_free_i64(t1); \
+    tcg_temp_free_i64(t2); \
+    tcg_temp_free_i64(val); \
+    tcg_temp_free_i64(cin); \
+    tcg_temp_free_i64(z); \
+} \
+
+GEN_VX_VMUL10(vmul10uq, 0, 0);
+GEN_VX_VMUL10(vmul10euq, 1, 0);
+GEN_VX_VMUL10(vmul10cuq, 0, 1);
+GEN_VX_VMUL10(vmul10ecuq, 1, 1);
+
 /* Logical operations */
 #define GEN_VX_LOGICAL(name, tcg_op, opc2, opc3) \
 static void glue(gen_, name)(DisasContext *ctx) \
@@ -276,8 +324,30 @@ static void glue(gen_, name0##_##name1)(DisasContext *ctx) \
     } \
 }
 
+/* Adds support to provide invalid mask */
+#define GEN_VXFORM_DUAL_EXT(name0, flg0, flg2_0, inval0, \
+                            name1, flg1, flg2_1, inval1) \
+static void glue(gen_, name0##_##name1)(DisasContext *ctx) \
+{ \
+    if ((Rc(ctx->opcode) == 0) && \
+        ((ctx->insns_flags & flg0) || (ctx->insns_flags2 & flg2_0)) && \
+        !(ctx->opcode & inval0)) { \
+        gen_##name0(ctx); \
+    } else if ((Rc(ctx->opcode) == 1) && \
+        ((ctx->insns_flags & flg1) || (ctx->insns_flags2 & flg2_1)) && \
+        !(ctx->opcode & inval1)) { \
+        gen_##name1(ctx); \
+    } else { \
+        gen_inval_exception(ctx, POWERPC_EXCP_INVAL_INVAL); \
+    } \
+}
+
 GEN_VXFORM(vaddubm, 0, 0);
+GEN_VXFORM_DUAL_EXT(vaddubm, PPC_NONE, PPC2_ALTIVEC_207, 0, \
+                    vmul10cuq, PPC_NONE, PPC2_ISA300, 0x0000F800)
 GEN_VXFORM(vadduhm, 0, 1);
+GEN_VXFORM_DUAL(vadduhm, PPC_NONE, PPC2_ALTIVEC_207, \
+                vmul10ecuq, PPC_NONE, PPC2_ISA300)
 GEN_VXFORM(vadduwm, 0, 2);
 GEN_VXFORM(vaddudm, 0, 3);
 GEN_VXFORM(vsububm, 0, 16);
@@ -390,7 +460,11 @@ GEN_VXFORM(vsro, 6, 17);
 GEN_VXFORM(vaddcuw, 0, 6);
 GEN_VXFORM(vsubcuw, 0, 22);
 GEN_VXFORM_ENV(vaddubs, 0, 8);
+GEN_VXFORM_DUAL_EXT(vaddubs, PPC_NONE, PPC2_ALTIVEC_207, 0, \
+                    vmul10uq, PPC_NONE, PPC2_ISA300, 0x0000F800)
 GEN_VXFORM_ENV(vadduhs, 0, 9);
+GEN_VXFORM_DUAL(vadduhs, PPC_NONE, PPC2_ALTIVEC_207, \
+                vmul10euq, PPC_NONE, PPC2_ISA300)
 GEN_VXFORM_ENV(vadduws, 0, 10);
 GEN_VXFORM_ENV(vaddsbs, 0, 12);
 GEN_VXFORM_ENV(vaddshs, 0, 13);
diff --git a/target-ppc/translate/vmx-ops.inc.c b/target-ppc/translate/vmx-ops.inc.c
index a7022a0..5d47b0f 100644
--- a/target-ppc/translate/vmx-ops.inc.c
+++ b/target-ppc/translate/vmx-ops.inc.c
@@ -55,8 +55,8 @@ GEN_HANDLER_E(name0##_##name1, 0x4, opc2, opc3, 0x00000000, type0, type1)
 GEN_HANDLER_E(name0##_##name1, 0x4, opc2, opc3, 0x00000000, tp0, tp1), \
 GEN_HANDLER_E(name0##_##name1, 0x4, opc2, (opc3 | 0x10), 0x00000000, tp0, tp1),
 
-GEN_VXFORM(vaddubm, 0, 0),
-GEN_VXFORM(vadduhm, 0, 1),
+GEN_VXFORM_DUAL(vaddubm, vmul10cuq, 0, 0, PPC_ALTIVEC, PPC_NONE),
+GEN_VXFORM_DUAL(vadduhm, vmul10ecuq, 0, 1, PPC_ALTIVEC, PPC_NONE),
 GEN_VXFORM(vadduwm, 0, 2),
 GEN_VXFORM_207(vaddudm, 0, 3),
 GEN_VXFORM_DUAL(vsububm, bcdadd, 0, 16, PPC_ALTIVEC, PPC_NONE),
@@ -123,8 +123,8 @@ GEN_VXFORM(vslo, 6, 16),
 GEN_VXFORM(vsro, 6, 17),
 GEN_VXFORM(vaddcuw, 0, 6),
 GEN_VXFORM(vsubcuw, 0, 22),
-GEN_VXFORM(vaddubs, 0, 8),
-GEN_VXFORM(vadduhs, 0, 9),
+GEN_VXFORM_DUAL(vaddubs, vmul10uq, 0, 8, PPC_ALTIVEC, PPC_NONE),
+GEN_VXFORM_DUAL(vadduhs, vmul10euq, 0, 9, PPC_ALTIVEC, PPC_NONE),
 GEN_VXFORM(vadduws, 0, 10),
 GEN_VXFORM(vaddsbs, 0, 12),
 GEN_VXFORM(vaddshs, 0, 13),
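A closing note on the opcode sharing visible in the tables above (an observation added here, not from the thread): the dual handlers separate the paired encodings with Rc(ctx->opcode), the low bit of the VX extended opcode, and the extra 0x0000F800 invalid mask used for vmul10uq/vmul10cuq appears to cover the VRB field, which those one-source forms leave as zero. A rough sketch of that test, with an illustrative helper name and assuming decode has already narrowed to the vaddubm/vmul10cuq slot:

    #include <stdbool.h>
    #include <stdint.h>

    /* Illustrative only -- not the QEMU decoder. */
    static bool is_vmul10cuq(uint32_t insn)
    {
        bool rc_bit    = insn & 1;                  /* low extended-opcode bit     */
        bool vrb_clear = (insn & 0x0000F800) == 0;  /* 0x0000F800 masks the VRB field */

        /* Rc = 1 with a zero VRB field selects vmul10cuq; Rc = 0 selects vaddubm;
         * Rc = 1 with a nonzero VRB field is rejected as an invalid form. */
        return rc_bit && vrb_clear;
    }

The extended forms vmul10euq/vmul10ecuq do use VRB (for the carry-in digit), which is presumably why they are registered through the plain GEN_VXFORM_DUAL without an extra mask.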