diff mbox

[v2,5/6] target-ppc: add vprtyb[w/d/q] instructions

Message ID 1477463189-26971-6-git-send-email-nikunj@linux.vnet.ibm.com (mailing list archive)
State New, archived
Headers show

Commit Message

Nikunj A. Dadhania Oct. 26, 2016, 6:26 a.m. UTC
From: Ankit Kumar <ankit@linux.vnet.ibm.com>

Add following POWER ISA 3.0 instructions.
vprtybw: Vector Parity Byte Word
vprtybd: Vector Parity Byte Double Word
vprtybq: Vector Parity Byte Quad Word

Signed-off-by: Ankit Kumar <ankit@linux.vnet.ibm.com>
Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
---
 target-ppc/helper.h                 |  3 +++
 target-ppc/int_helper.c             | 31 +++++++++++++++++++++++++++++++
 target-ppc/translate/vmx-impl.inc.c |  3 +++
 target-ppc/translate/vmx-ops.inc.c  |  4 ++++
 4 files changed, 41 insertions(+)

Comments

David Gibson Oct. 27, 2016, 3:47 a.m. UTC | #1
On Wed, Oct 26, 2016 at 11:56:28AM +0530, Nikunj A Dadhania wrote:
> From: Ankit Kumar <ankit@linux.vnet.ibm.com>
> 
> Add following POWER ISA 3.0 instructions.
> vprtybw: Vector Parity Byte Word
> vprtybd: Vector Parity Byte Double Word
> vprtybq: Vector Parity Byte Quad Word
> 
> Signed-off-by: Ankit Kumar <ankit@linux.vnet.ibm.com>
> Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
> ---
>  target-ppc/helper.h                 |  3 +++
>  target-ppc/int_helper.c             | 31 +++++++++++++++++++++++++++++++
>  target-ppc/translate/vmx-impl.inc.c |  3 +++
>  target-ppc/translate/vmx-ops.inc.c  |  4 ++++
>  4 files changed, 41 insertions(+)
> 
> diff --git a/target-ppc/helper.h b/target-ppc/helper.h
> index d6ee26e..7d42f99 100644
> --- a/target-ppc/helper.h
> +++ b/target-ppc/helper.h
> @@ -223,6 +223,9 @@ DEF_HELPER_3(vsro, void, avr, avr, avr)
>  DEF_HELPER_3(vsrv, void, avr, avr, avr)
>  DEF_HELPER_3(vslv, void, avr, avr, avr)
>  DEF_HELPER_3(vaddcuw, void, avr, avr, avr)
> +DEF_HELPER_2(vprtybw, void, avr, avr)
> +DEF_HELPER_2(vprtybd, void, avr, avr)
> +DEF_HELPER_2(vprtybq, void, avr, avr)
>  DEF_HELPER_3(vsubcuw, void, avr, avr, avr)
>  DEF_HELPER_2(lvsl, void, avr, tl)
>  DEF_HELPER_2(lvsr, void, avr, tl)
> diff --git a/target-ppc/int_helper.c b/target-ppc/int_helper.c
> index 0fd92ed..358ffff 100644
> --- a/target-ppc/int_helper.c
> +++ b/target-ppc/int_helper.c
> @@ -527,6 +527,37 @@ void helper_vaddcuw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
>      }
>  }
>  
> +/* vprtyb[w/d] */
> +#define VPRTYB(name, element)                               \
> +void glue(helper_, name)(ppc_avr_t *r, ppc_avr_t *b)        \
> +{                                                           \
> +    int i, j;                                               \
> +    uint8_t s;                                              \
> +    int nr_b = sizeof(b->element[0]) / sizeof(b->u8[0]);    \
> +    for (i = 0; i < ARRAY_SIZE(r->element); i++) {          \
> +        s = 0;                                              \
> +        for (j = i * nr_b; j < (i + 1) * nr_b; j++) {       \
> +            s ^= (b->u8[j] & 1);                            \
> +        }                                                   \
> +        r->element[i] = (!s) ? 0 : 1;                       \
> +    }                                                       \
> +}
> +VPRTYB(vprtybw, u32)
> +VPRTYB(vprtybd, u64)
> +#undef VPTRYB
> +
> +/* vprtybq */
> +void helper_vprtybq(ppc_avr_t *r, ppc_avr_t *b)
> +{
> +    int i;
> +    uint8_t s = 0;
> +    for (i = 0; i < 16; i++) {
> +        s ^= (b->u8[i] & 1);
> +    }
> +    r->u64[LO_IDX] = (!s) ? 0 : 1;
> +    r->u64[HI_IDX] = 0;
> +}
> +

I think you can implement these better.  First mask with 0x01010101
(of the appropriate length) to extract the LSB bits of each byte.
Then XOR the two halves together, then quarters and so forth,
ln2(size) times to arrive at the parity.  This is similar to the usual
Hamming weight implementation.

>  #define VARITH_DO(name, op, element)                                    \
>      void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)       \
>      {                                                                   \
> diff --git a/target-ppc/translate/vmx-impl.inc.c b/target-ppc/translate/vmx-impl.inc.c
> index 500c43f..e1d0897 100644
> --- a/target-ppc/translate/vmx-impl.inc.c
> +++ b/target-ppc/translate/vmx-impl.inc.c
> @@ -705,6 +705,9 @@ GEN_VXFORM_NOA_ENV(vrfim, 5, 11);
>  GEN_VXFORM_NOA_ENV(vrfin, 5, 8);
>  GEN_VXFORM_NOA_ENV(vrfip, 5, 10);
>  GEN_VXFORM_NOA_ENV(vrfiz, 5, 9);
> +GEN_VXFORM_NOA(vprtybw, 1, 24);
> +GEN_VXFORM_NOA(vprtybd, 1, 24);
> +GEN_VXFORM_NOA(vprtybq, 1, 24);
>  
>  #define GEN_VXFORM_SIMM(name, opc2, opc3)                               \
>  static void glue(gen_, name)(DisasContext *ctx)                                 \
> diff --git a/target-ppc/translate/vmx-ops.inc.c b/target-ppc/translate/vmx-ops.inc.c
> index a5ad4d4..c631780 100644
> --- a/target-ppc/translate/vmx-ops.inc.c
> +++ b/target-ppc/translate/vmx-ops.inc.c
> @@ -122,6 +122,10 @@ GEN_VXFORM_300(vslv, 2, 29),
>  GEN_VXFORM(vslo, 6, 16),
>  GEN_VXFORM(vsro, 6, 17),
>  GEN_VXFORM(vaddcuw, 0, 6),
> +GEN_HANDLER_E_2(vprtybw, 0x4, 0x1, 0x18, 8, 0, PPC_NONE, PPC2_ISA300),
> +GEN_HANDLER_E_2(vprtybd, 0x4, 0x1, 0x18, 9, 0, PPC_NONE, PPC2_ISA300),
> +GEN_HANDLER_E_2(vprtybq, 0x4, 0x1, 0x18, 10, 0, PPC_NONE, PPC2_ISA300),
> +
>  GEN_VXFORM(vsubcuw, 0, 22),
>  GEN_VXFORM_DUAL(vaddubs, vmul10uq, 0, 8, PPC_ALTIVEC, PPC_NONE),
>  GEN_VXFORM_DUAL(vadduhs, vmul10euq, 0, 9, PPC_ALTIVEC, PPC_NONE),
Richard Henderson Oct. 27, 2016, 5:22 a.m. UTC | #2
On 10/26/2016 08:47 PM, David Gibson wrote:
>> > +void helper_vprtybq(ppc_avr_t *r, ppc_avr_t *b)
>> > +{
>> > +    int i;
>> > +    uint8_t s = 0;
>> > +    for (i = 0; i < 16; i++) {
>> > +        s ^= (b->u8[i] & 1);
>> > +    }
>> > +    r->u64[LO_IDX] = (!s) ? 0 : 1;
>> > +    r->u64[HI_IDX] = 0;
>> > +}
>> > +
> I think you can implement these better.  First mask with 0x01010101
> (of the appropriate length) to extract the LSB bits of each byte.
> Then XOR the two halves together, then quarters and so forth,
> ln2(size) times to arrive at the parity.  This is similar to the usual
> Hamming weight implementation.
>

You don't even have to mask with 0x01010101 to start.  Just fold halves til you 
get to the byte level and then mask with 1.


r~
Nikunj A. Dadhania Oct. 27, 2016, 8:36 a.m. UTC | #3
Richard Henderson <rth@twiddle.net> writes:

> On 10/26/2016 08:47 PM, David Gibson wrote:
>>> > +void helper_vprtybq(ppc_avr_t *r, ppc_avr_t *b)
>>> > +{
>>> > +    int i;
>>> > +    uint8_t s = 0;
>>> > +    for (i = 0; i < 16; i++) {
>>> > +        s ^= (b->u8[i] & 1);
>>> > +    }
>>> > +    r->u64[LO_IDX] = (!s) ? 0 : 1;
>>> > +    r->u64[HI_IDX] = 0;
>>> > +}
>>> > +
>> I think you can implement these better.  First mask with 0x01010101
>> (of the appropriate length) to extract the LSB bits of each byte.
>> Then XOR the two halves together, then quarters and so forth,
>> ln2(size) times to arrive at the parity.  This is similar to the usual
>> Hamming weight implementation.
>>
>
> You don't even have to mask with 0x01010101 to start.  Just fold halves til you 
> get to the byte level and then mask with 1.

Right, it does reduce number of operations:

+#define SIZE_MASK(x) ((1ULL << (x)) - 1)
+static uint64_t vparity(uint64_t f1, uint64_t f2, int size)
+{
+    uint64_t res = f1 ^ f2;
+    if (size == 8) return res;
+    return vparity(res & SIZE_MASK(size/2), res >> (size/2), size/2);
+}
+

Regards
Nikunj
David Gibson Oct. 27, 2016, 1:28 p.m. UTC | #4
On Wed, Oct 26, 2016 at 10:22:10PM -0700, Richard Henderson wrote:
> On 10/26/2016 08:47 PM, David Gibson wrote:
> > > > +void helper_vprtybq(ppc_avr_t *r, ppc_avr_t *b)
> > > > +{
> > > > +    int i;
> > > > +    uint8_t s = 0;
> > > > +    for (i = 0; i < 16; i++) {
> > > > +        s ^= (b->u8[i] & 1);
> > > > +    }
> > > > +    r->u64[LO_IDX] = (!s) ? 0 : 1;
> > > > +    r->u64[HI_IDX] = 0;
> > > > +}
> > > > +
> > I think you can implement these better.  First mask with 0x01010101
> > (of the appropriate length) to extract the LSB bits of each byte.
> > Then XOR the two halves together, then quarters and so forth,
> > ln2(size) times to arrive at the parity.  This is similar to the usual
> > Hamming weight implementation.
> > 
> 
> You don't even have to mask with 0x01010101 to start.  Just fold halves til
> you get to the byte level and then mask with 1.

Good point.
Richard Henderson Oct. 27, 2016, 2:16 p.m. UTC | #5
On 10/27/2016 01:36 AM, Nikunj A Dadhania wrote:
> Right, it does reduce number of operations:
>
> +#define SIZE_MASK(x) ((1ULL << (x)) - 1)
> +static uint64_t vparity(uint64_t f1, uint64_t f2, int size)
> +{
> +    uint64_t res = f1 ^ f2;
> +    if (size == 8) return res;
> +    return vparity(res & SIZE_MASK(size/2), res >> (size/2), size/2);
> +}

Why are you using recursion for something that should be 5 operations?  You're 
making this more complicated than it needs to be.

   uint64_t res = b->u64[0] ^ b->u64[1];
   res ^= res >> 32;
   res ^= res >> 16;
   res ^= res >> 8;
   r->u64[LO_IDX] = res & 1;


r~
David Gibson Oct. 28, 2016, 1:34 a.m. UTC | #6
On Thu, Oct 27, 2016 at 07:16:24AM -0700, Richard Henderson wrote:
> On 10/27/2016 01:36 AM, Nikunj A Dadhania wrote:
> > Right, it does reduce number of operations:
> > 
> > +#define SIZE_MASK(x) ((1ULL << (x)) - 1)
> > +static uint64_t vparity(uint64_t f1, uint64_t f2, int size)
> > +{
> > +    uint64_t res = f1 ^ f2;
> > +    if (size == 8) return res;
> > +    return vparity(res & SIZE_MASK(size/2), res >> (size/2), size/2);
> > +}
> 
> Why are you using recursion for something that should be 5 operations?
> You're making this more complicated than it needs to be.
> 
>   uint64_t res = b->u64[0] ^ b->u64[1];
>   res ^= res >> 32;
>   res ^= res >> 16;
>   res ^= res >> 8;
>   r->u64[LO_IDX] = res & 1;

We do need to implement it at multiple sizes, which makes it a bit
more complex.  But I wonder if it makes sense to do this without a
helper, something like

gen_vprty()
{
    ...
    gen(shift right 8)
    gen(xor)
    if (size > 2)
       gen(shift right 16)
       gen(xor)
    if (size > 4)
       gen(shift right 32)
       gen(xor)
    if (size > 8)
       gen(xor hi and low words)
    gen(mask result bits)
}
diff mbox

Patch

diff --git a/target-ppc/helper.h b/target-ppc/helper.h
index d6ee26e..7d42f99 100644
--- a/target-ppc/helper.h
+++ b/target-ppc/helper.h
@@ -223,6 +223,9 @@  DEF_HELPER_3(vsro, void, avr, avr, avr)
 DEF_HELPER_3(vsrv, void, avr, avr, avr)
 DEF_HELPER_3(vslv, void, avr, avr, avr)
 DEF_HELPER_3(vaddcuw, void, avr, avr, avr)
+DEF_HELPER_2(vprtybw, void, avr, avr)
+DEF_HELPER_2(vprtybd, void, avr, avr)
+DEF_HELPER_2(vprtybq, void, avr, avr)
 DEF_HELPER_3(vsubcuw, void, avr, avr, avr)
 DEF_HELPER_2(lvsl, void, avr, tl)
 DEF_HELPER_2(lvsr, void, avr, tl)
diff --git a/target-ppc/int_helper.c b/target-ppc/int_helper.c
index 0fd92ed..358ffff 100644
--- a/target-ppc/int_helper.c
+++ b/target-ppc/int_helper.c
@@ -527,6 +527,37 @@  void helper_vaddcuw(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)
     }
 }
 
+/* vprtyb[w/d] */
+#define VPRTYB(name, element)                               \
+void glue(helper_, name)(ppc_avr_t *r, ppc_avr_t *b)        \
+{                                                           \
+    int i, j;                                               \
+    uint8_t s;                                              \
+    int nr_b = sizeof(b->element[0]) / sizeof(b->u8[0]);    \
+    for (i = 0; i < ARRAY_SIZE(r->element); i++) {          \
+        s = 0;                                              \
+        for (j = i * nr_b; j < (i + 1) * nr_b; j++) {       \
+            s ^= (b->u8[j] & 1);                            \
+        }                                                   \
+        r->element[i] = (!s) ? 0 : 1;                       \
+    }                                                       \
+}
+VPRTYB(vprtybw, u32)
+VPRTYB(vprtybd, u64)
+#undef VPTRYB
+
+/* vprtybq */
+void helper_vprtybq(ppc_avr_t *r, ppc_avr_t *b)
+{
+    int i;
+    uint8_t s = 0;
+    for (i = 0; i < 16; i++) {
+        s ^= (b->u8[i] & 1);
+    }
+    r->u64[LO_IDX] = (!s) ? 0 : 1;
+    r->u64[HI_IDX] = 0;
+}
+
 #define VARITH_DO(name, op, element)                                    \
     void helper_v##name(ppc_avr_t *r, ppc_avr_t *a, ppc_avr_t *b)       \
     {                                                                   \
diff --git a/target-ppc/translate/vmx-impl.inc.c b/target-ppc/translate/vmx-impl.inc.c
index 500c43f..e1d0897 100644
--- a/target-ppc/translate/vmx-impl.inc.c
+++ b/target-ppc/translate/vmx-impl.inc.c
@@ -705,6 +705,9 @@  GEN_VXFORM_NOA_ENV(vrfim, 5, 11);
 GEN_VXFORM_NOA_ENV(vrfin, 5, 8);
 GEN_VXFORM_NOA_ENV(vrfip, 5, 10);
 GEN_VXFORM_NOA_ENV(vrfiz, 5, 9);
+GEN_VXFORM_NOA(vprtybw, 1, 24);
+GEN_VXFORM_NOA(vprtybd, 1, 24);
+GEN_VXFORM_NOA(vprtybq, 1, 24);
 
 #define GEN_VXFORM_SIMM(name, opc2, opc3)                               \
 static void glue(gen_, name)(DisasContext *ctx)                                 \
diff --git a/target-ppc/translate/vmx-ops.inc.c b/target-ppc/translate/vmx-ops.inc.c
index a5ad4d4..c631780 100644
--- a/target-ppc/translate/vmx-ops.inc.c
+++ b/target-ppc/translate/vmx-ops.inc.c
@@ -122,6 +122,10 @@  GEN_VXFORM_300(vslv, 2, 29),
 GEN_VXFORM(vslo, 6, 16),
 GEN_VXFORM(vsro, 6, 17),
 GEN_VXFORM(vaddcuw, 0, 6),
+GEN_HANDLER_E_2(vprtybw, 0x4, 0x1, 0x18, 8, 0, PPC_NONE, PPC2_ISA300),
+GEN_HANDLER_E_2(vprtybd, 0x4, 0x1, 0x18, 9, 0, PPC_NONE, PPC2_ISA300),
+GEN_HANDLER_E_2(vprtybq, 0x4, 0x1, 0x18, 10, 0, PPC_NONE, PPC2_ISA300),
+
 GEN_VXFORM(vsubcuw, 0, 22),
 GEN_VXFORM_DUAL(vaddubs, vmul10uq, 0, 8, PPC_ALTIVEC, PPC_NONE),
 GEN_VXFORM_DUAL(vadduhs, vmul10euq, 0, 9, PPC_ALTIVEC, PPC_NONE),