Message ID | 1477463189-26971-6-git-send-email-nikunj@linux.vnet.ibm.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Wed, Oct 26, 2016 at 11:56:28AM +0530, Nikunj A Dadhania wrote: > From: Ankit Kumar <ankit@linux.vnet.ibm.com> > > Add following POWER ISA 3.0 instructions. > vprtybw: Vector Parity Byte Word > vprtybd: Vector Parity Byte Double Word > vprtybq: Vector Parity Byte Quad Word > > Signed-off-by: Ankit Kumar <ankit@linux.vnet.ibm.com> > Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com> > --- > target-ppc/helper.h | 3 +++ > target-ppc/int_helper.c | 31 +++++++++++++++++++++++++++++++ > target-ppc/translate/vmx-impl.inc.c | 3 +++ > target-ppc/translate/vmx-ops.inc.c | 4 ++++ > 4 files changed, 41 insertions(+) > > diff --git a/target-ppc/helper.h b/target-ppc/helper.h > index d6ee26e..7d42f99 100644 > --- a/target-ppc/helper.h > +++ b/target-ppc/helper.h > @@ -223,6 +223,9 @@ DEF_HELPER_3(vsro, void, avr, avr, avr) > DEF_HELPER_3(vsrv, void, avr, avr, avr) > DEF_HELPER_3(vslv, void, avr, avr, avr) > DEF_HELPER_3(vaddcuw, void, avr, avr, avr) > +DEF_HELPER_2(vprtybw, void, avr, avr) > +DEF_HELPER_2(vprtybd, void, avr, avr) > +DEF_HELPER_2(vprtybq, void, avr, avr) > DEF_HELPER_3(vsubcuw, void, avr, avr, avr) > DEF_HELPER_2(lvsl, void, avr, tl) > DEF_HELPER_2(lvsr, void, avr, tl) > diff --git a/target-ppc/int_helper.c b/target-ppc/int_helper.c > index 0fd92ed..358ffff 100644 > --- a/target-ppc/int_helper.c > +++ b/target-ppc/int_helper.c > @@ -527,6 +527,37 @@ void helper_vaddcuw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) > } > } > > +/* vprtyb[w/d] */ > +#define VPRTYB(name, element) \ > +void glue(helper_, name)(ppc_avr_t *r, ppc_avr_t *b) \ > +{ \ > + int i, j; \ > + uint8_t s; \ > + int nr_b = sizeof(b->element[0]) / sizeof(b->u8[0]); \ > + for (i = 0; i < ARRAY_SIZE(r->element); i++) { \ > + s = 0; \ > + for (j = i * nr_b; j < (i + 1) * nr_b; j++) { \ > + s ^= (b->u8[j] & 1); \ > + } \ > + r->element[i] = (!s) ? 0 : 1; \ > + } \ > +} > +VPRTYB(vprtybw, u32) > +VPRTYB(vprtybd, u64) > +#undef VPTRYB > + > +/* vprtybq */ > +void helper_vprtybq(ppc_avr_t *r, ppc_avr_t *b) > +{ > + int i; > + uint8_t s = 0; > + for (i = 0; i < 16; i++) { > + s ^= (b->u8[i] & 1); > + } > + r->u64[LO_IDX] = (!s) ? 0 : 1; > + r->u64[HI_IDX] = 0; > +} > + I think you can implement these better. First mask with 0x01010101 (of the appropriate length) to extract the LSB bits of each byte. Then XOR the two halves together, then quarters and so forth, ln2(size) times to arrive at the parity. This is similar to the usual Hamming weight implementation. > #define VARITH_DO(name, op, element) \ > void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \ > { \ > diff --git a/target-ppc/translate/vmx-impl.inc.c b/target-ppc/translate/vmx-impl.inc.c > index 500c43f..e1d0897 100644 > --- a/target-ppc/translate/vmx-impl.inc.c > +++ b/target-ppc/translate/vmx-impl.inc.c > @@ -705,6 +705,9 @@ GEN_VXFORM_NOA_ENV(vrfim, 5, 11); > GEN_VXFORM_NOA_ENV(vrfin, 5, 8); > GEN_VXFORM_NOA_ENV(vrfip, 5, 10); > GEN_VXFORM_NOA_ENV(vrfiz, 5, 9); > +GEN_VXFORM_NOA(vprtybw, 1, 24); > +GEN_VXFORM_NOA(vprtybd, 1, 24); > +GEN_VXFORM_NOA(vprtybq, 1, 24); > > #define GEN_VXFORM_SIMM(name, opc2, opc3) \ > static void glue(gen_, name)(DisasContext *ctx) \ > diff --git a/target-ppc/translate/vmx-ops.inc.c b/target-ppc/translate/vmx-ops.inc.c > index a5ad4d4..c631780 100644 > --- a/target-ppc/translate/vmx-ops.inc.c > +++ b/target-ppc/translate/vmx-ops.inc.c > @@ -122,6 +122,10 @@ GEN_VXFORM_300(vslv, 2, 29), > GEN_VXFORM(vslo, 6, 16), > GEN_VXFORM(vsro, 6, 17), > GEN_VXFORM(vaddcuw, 0, 6), > +GEN_HANDLER_E_2(vprtybw, 0x4, 0x1, 0x18, 8, 0, PPC_NONE, PPC2_ISA300), > +GEN_HANDLER_E_2(vprtybd, 0x4, 0x1, 0x18, 9, 0, PPC_NONE, PPC2_ISA300), > +GEN_HANDLER_E_2(vprtybq, 0x4, 0x1, 0x18, 10, 0, PPC_NONE, PPC2_ISA300), > + > GEN_VXFORM(vsubcuw, 0, 22), > GEN_VXFORM_DUAL(vaddubs, vmul10uq, 0, 8, PPC_ALTIVEC, PPC_NONE), > GEN_VXFORM_DUAL(vadduhs, vmul10euq, 0, 9, PPC_ALTIVEC, PPC_NONE),
On 10/26/2016 08:47 PM, David Gibson wrote: >> > +void helper_vprtybq(ppc_avr_t *r, ppc_avr_t *b) >> > +{ >> > + int i; >> > + uint8_t s = 0; >> > + for (i = 0; i < 16; i++) { >> > + s ^= (b->u8[i] & 1); >> > + } >> > + r->u64[LO_IDX] = (!s) ? 0 : 1; >> > + r->u64[HI_IDX] = 0; >> > +} >> > + > I think you can implement these better. First mask with 0x01010101 > (of the appropriate length) to extract the LSB bits of each byte. > Then XOR the two halves together, then quarters and so forth, > ln2(size) times to arrive at the parity. This is similar to the usual > Hamming weight implementation. > You don't even have to mask with 0x01010101 to start. Just fold halves til you get to the byte level and then mask with 1. r~
Richard Henderson <rth@twiddle.net> writes: > On 10/26/2016 08:47 PM, David Gibson wrote: >>> > +void helper_vprtybq(ppc_avr_t *r, ppc_avr_t *b) >>> > +{ >>> > + int i; >>> > + uint8_t s = 0; >>> > + for (i = 0; i < 16; i++) { >>> > + s ^= (b->u8[i] & 1); >>> > + } >>> > + r->u64[LO_IDX] = (!s) ? 0 : 1; >>> > + r->u64[HI_IDX] = 0; >>> > +} >>> > + >> I think you can implement these better. First mask with 0x01010101 >> (of the appropriate length) to extract the LSB bits of each byte. >> Then XOR the two halves together, then quarters and so forth, >> ln2(size) times to arrive at the parity. This is similar to the usual >> Hamming weight implementation. >> > > You don't even have to mask with 0x01010101 to start. Just fold halves til you > get to the byte level and then mask with 1. Right, it does reduce number of operations: +#define SIZE_MASK(x) ((1ULL << (x)) - 1) +static uint64_t vparity(uint64_t f1, uint64_t f2, int size) +{ + uint64_t res = f1 ^ f2; + if (size == 8) return res; + return vparity(res & SIZE_MASK(size/2), res >> (size/2), size/2); +} + Regards Nikunj
On Wed, Oct 26, 2016 at 10:22:10PM -0700, Richard Henderson wrote: > On 10/26/2016 08:47 PM, David Gibson wrote: > > > > +void helper_vprtybq(ppc_avr_t *r, ppc_avr_t *b) > > > > +{ > > > > + int i; > > > > + uint8_t s = 0; > > > > + for (i = 0; i < 16; i++) { > > > > + s ^= (b->u8[i] & 1); > > > > + } > > > > + r->u64[LO_IDX] = (!s) ? 0 : 1; > > > > + r->u64[HI_IDX] = 0; > > > > +} > > > > + > > I think you can implement these better. First mask with 0x01010101 > > (of the appropriate length) to extract the LSB bits of each byte. > > Then XOR the two halves together, then quarters and so forth, > > ln2(size) times to arrive at the parity. This is similar to the usual > > Hamming weight implementation. > > > > You don't even have to mask with 0x01010101 to start. Just fold halves til > you get to the byte level and then mask with 1. Good point.
On 10/27/2016 01:36 AM, Nikunj A Dadhania wrote: > Right, it does reduce number of operations: > > +#define SIZE_MASK(x) ((1ULL << (x)) - 1) > +static uint64_t vparity(uint64_t f1, uint64_t f2, int size) > +{ > + uint64_t res = f1 ^ f2; > + if (size == 8) return res; > + return vparity(res & SIZE_MASK(size/2), res >> (size/2), size/2); > +} Why are you using recursion for something that should be 5 operations? You're making this more complicated than it needs to be. uint64_t res = b->u64[0] ^ b->u64[1]; res ^= res >> 32; res ^= res >> 16; res ^= res >> 8; r->u64[LO_IDX] = res & 1; r~
On Thu, Oct 27, 2016 at 07:16:24AM -0700, Richard Henderson wrote: > On 10/27/2016 01:36 AM, Nikunj A Dadhania wrote: > > Right, it does reduce number of operations: > > > > +#define SIZE_MASK(x) ((1ULL << (x)) - 1) > > +static uint64_t vparity(uint64_t f1, uint64_t f2, int size) > > +{ > > + uint64_t res = f1 ^ f2; > > + if (size == 8) return res; > > + return vparity(res & SIZE_MASK(size/2), res >> (size/2), size/2); > > +} > > Why are you using recursion for something that should be 5 operations? > You're making this more complicated than it needs to be. > > uint64_t res = b->u64[0] ^ b->u64[1]; > res ^= res >> 32; > res ^= res >> 16; > res ^= res >> 8; > r->u64[LO_IDX] = res & 1; We do need to implement it at multiple sizes, which makes it a bit more complex. But I wonder if it makes sense to do this without a helper, something like gen_vprty() { ... gen(shift right 8) gen(xor) if (size > 2) gen(shift right 16) gen(xor) if (size > 4) gen(shift right 32) gen(xor) if (size > 8) gen(xor hi and low words) gen(mask result bits) }
diff --git a/target-ppc/helper.h b/target-ppc/helper.h index d6ee26e..7d42f99 100644 --- a/target-ppc/helper.h +++ b/target-ppc/helper.h @@ -223,6 +223,9 @@ DEF_HELPER_3(vsro, void, avr, avr, avr) DEF_HELPER_3(vsrv, void, avr, avr, avr) DEF_HELPER_3(vslv, void, avr, avr, avr) DEF_HELPER_3(vaddcuw, void, avr, avr, avr) +DEF_HELPER_2(vprtybw, void, avr, avr) +DEF_HELPER_2(vprtybd, void, avr, avr) +DEF_HELPER_2(vprtybq, void, avr, avr) DEF_HELPER_3(vsubcuw, void, avr, avr, avr) DEF_HELPER_2(lvsl, void, avr, tl) DEF_HELPER_2(lvsr, void, avr, tl) diff --git a/target-ppc/int_helper.c b/target-ppc/int_helper.c index 0fd92ed..358ffff 100644 --- a/target-ppc/int_helper.c +++ b/target-ppc/int_helper.c @@ -527,6 +527,37 @@ void helper_vaddcuw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) } } +/* vprtyb[w/d] */ +#define VPRTYB(name, element) \ +void glue(helper_, name)(ppc_avr_t *r, ppc_avr_t *b) \ +{ \ + int i, j; \ + uint8_t s; \ + int nr_b = sizeof(b->element[0]) / sizeof(b->u8[0]); \ + for (i = 0; i < ARRAY_SIZE(r->element); i++) { \ + s = 0; \ + for (j = i * nr_b; j < (i + 1) * nr_b; j++) { \ + s ^= (b->u8[j] & 1); \ + } \ + r->element[i] = (!s) ? 0 : 1; \ + } \ +} +VPRTYB(vprtybw, u32) +VPRTYB(vprtybd, u64) +#undef VPTRYB + +/* vprtybq */ +void helper_vprtybq(ppc_avr_t *r, ppc_avr_t *b) +{ + int i; + uint8_t s = 0; + for (i = 0; i < 16; i++) { + s ^= (b->u8[i] & 1); + } + r->u64[LO_IDX] = (!s) ? 0 : 1; + r->u64[HI_IDX] = 0; +} + #define VARITH_DO(name, op, element) \ void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b) \ { \ diff --git a/target-ppc/translate/vmx-impl.inc.c b/target-ppc/translate/vmx-impl.inc.c index 500c43f..e1d0897 100644 --- a/target-ppc/translate/vmx-impl.inc.c +++ b/target-ppc/translate/vmx-impl.inc.c @@ -705,6 +705,9 @@ GEN_VXFORM_NOA_ENV(vrfim, 5, 11); GEN_VXFORM_NOA_ENV(vrfin, 5, 8); GEN_VXFORM_NOA_ENV(vrfip, 5, 10); GEN_VXFORM_NOA_ENV(vrfiz, 5, 9); +GEN_VXFORM_NOA(vprtybw, 1, 24); +GEN_VXFORM_NOA(vprtybd, 1, 24); +GEN_VXFORM_NOA(vprtybq, 1, 24); #define GEN_VXFORM_SIMM(name, opc2, opc3) \ static void glue(gen_, name)(DisasContext *ctx) \ diff --git a/target-ppc/translate/vmx-ops.inc.c b/target-ppc/translate/vmx-ops.inc.c index a5ad4d4..c631780 100644 --- a/target-ppc/translate/vmx-ops.inc.c +++ b/target-ppc/translate/vmx-ops.inc.c @@ -122,6 +122,10 @@ GEN_VXFORM_300(vslv, 2, 29), GEN_VXFORM(vslo, 6, 16), GEN_VXFORM(vsro, 6, 17), GEN_VXFORM(vaddcuw, 0, 6), +GEN_HANDLER_E_2(vprtybw, 0x4, 0x1, 0x18, 8, 0, PPC_NONE, PPC2_ISA300), +GEN_HANDLER_E_2(vprtybd, 0x4, 0x1, 0x18, 9, 0, PPC_NONE, PPC2_ISA300), +GEN_HANDLER_E_2(vprtybq, 0x4, 0x1, 0x18, 10, 0, PPC_NONE, PPC2_ISA300), + GEN_VXFORM(vsubcuw, 0, 22), GEN_VXFORM_DUAL(vaddubs, vmul10uq, 0, 8, PPC_ALTIVEC, PPC_NONE), GEN_VXFORM_DUAL(vadduhs, vmul10euq, 0, 9, PPC_ALTIVEC, PPC_NONE),