Message ID | 20220210123447.3933301-19-matheus.ferst@eldorado.org.br
---|---
State | New, archived |
Series | target/ppc: PowerISA Vector/VSX instruction batch
On 2/10/22 23:34, matheus.ferst@eldorado.org.br wrote:
> +    for (int dw = 1; dw >= 0; dw--) {
> +        get_avr64(vrb, a->vrb, dw);
> +        for (; in >= 0; in -= a->n, out--) {
> +            if (in > out) {
> +                tcg_gen_shri_i64(tmp, vrb, in - out);
> +            } else {
> +                tcg_gen_shli_i64(tmp, vrb, out - in);
> +            }
> +            tcg_gen_andi_i64(tmp, tmp, 1ULL << out);
> +            tcg_gen_or_i64(rt, rt, tmp);
> +        }
> +        in += 64;
> +    }

This is going to produce up to 3*64 operations (n=2).

You can produce more than one output pairing per shift, and produce the
same result in 3*lg2(64) operations.  I've given an example like this on
the list before, recently.  I think it was in the context of some riscv
bit manipulation.

> N = 2
>
> AxBxCxDxExFxGxHxIxJxKxLxMxNxOxPxQxRxSxTxUxVxWxXxYxZx0x1x2x3x4x5x
> & rep(0b10)
> A.B.C.D.E.F.G.H.I.J.K.L.M.N.O.P.Q.R.S.T.U.V.W.X.Y.Z.0.1.2.3.4.5.
> << 1
> .B.C.D.E.F.G.H.I.J.K.L.M.N.O.P.Q.R.S.T.U.V.W.X.Y.Z.0.1.2.3.4.5..
> |
> ABBCCDDEEFFGGHHIIJJKKLLMMNNOOPPQQRRSSTTUUVVWWXXYYZZ001122334455.
> & rep(0b1100)
> AB..CD..EF..GH..IJ..KL..MN..OP..QR..ST..UV..WX..YZ..01..23..45..
> << 2
> ..CD..EF..GH..IJ..KL..MN..OP..QR..ST..UV..WX..YZ..01..23..45....
> |
> ABCDCDEFEFGHGHIJIJKLKLMNMNOPOPQRQRSTSTUVUVWXWXYZYZ010123234545..
> & rep(0xf0)
> ABCD....EFGH....IJKL....MNOP....QRST....UVWX....YZ01....2345....
> << 4
> ....EFGH....IJKL....MNOP....QRST....UVWX....YZ01....2345........
> |
> ABCDEFGHEFGHIJKLIJKLMNOPMNOPQRSTQRSTUVWXUVWXYZ01YZ0123452345....
> & rep(0xff00)
> ABCDEFGH........IJKLMNOP........QRSTUVWX........YZ012345........
> << 8
> ........IJKLMNOP........QRSTUVWX........YZ012345................
> |
> ABCDEFGHIJKLMNOPIJKLMNOPQRSTUVWXQRSTUVWXYZ012345YZ012345........
> & rep(0xffff0000)
> ABCDEFGHIJKLMNOP................QRSTUVWXYZ012345................
> deposit(t, 32, 16)
> ABCDEFGHIJKLMNOPQRSTUVWXYZ012345................................

and similarly for larger N.

For N >= 4, I believe that half of the masking may be elided, because
there are already zeros in which to place bits.

> N = 5
>
> AxxxxBxxxxCxxxxDxxxxExxxxFxxxxGxxxxHxxxxIxxxxJxxxxKxxxxLxxxxMxxx
> & rep(0b10000)
> A....B....C....D....E....F....G....H....I....J....K....L....M...
> << (5 - 1)
> .B....C....D....E....F....G....H....I....J....K....L....M.......
> |
> AB...BC...CD...DE...EF...FG...GH...HI...IJ...JK...KL...LM...M...
> << (10 - 2)
> ..CD...DE...EF...FG...GH...HI...IJ...JK...KL...LM...M...........
> |
> ABCD.BCDE.CDEF.DEFG.EFGH.FGHI.GHIJ.HIJK.IJKL.JKLM.KLM..LM...M...
> & rep(0xf0000)
> ABCD................EFGH................IJKL................M...
> << (20 - 4)
> ....EFGH................IJKL................M...................
> |
> ABCDEFGH............EFGHIJKL............IJKLM...............M...
> << (40 - 8)
> ........IJKLM...............M...................................
> |
> ABCDEFGHIJKLM.......EFGHIJKLM...........IJKLM...............M...
> & 0xfff8_0000_0000_0000
> ABCDEFGHIJKLM...................................................

It's probably worth working through the various N to make sure you know
which masking is required.

r~
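For readers following along, here is a minimal C sketch of the N = 2
doubling gather described above, written with plain uint64_t arithmetic
rather than TCG ops. The function name is ours, and the final
deposit(t, 32, 16) step is expressed as one more shift/or/mask round:

#include <stdint.h>

/*
 * Gather the most significant bit of each 2-bit group of x into bits
 * [63:32] of the result, MSB-first, mirroring the N = 2 diagram above:
 * each mask/shift/or round doubles the size of the gathered groups, so
 * the whole doubleword is done in ~3*lg2(64) operations instead of ~3*64.
 */
static uint64_t gather_n2(uint64_t x)
{
    x &= 0xAAAAAAAAAAAAAAAAull;  /* rep(0b10): keep every 2nd bit */
    x |= x << 1;
    x &= 0xCCCCCCCCCCCCCCCCull;  /* rep(0b1100) */
    x |= x << 2;
    x &= 0xF0F0F0F0F0F0F0F0ull;  /* rep(0xf0) */
    x |= x << 4;
    x &= 0xFF00FF00FF00FF00ull;  /* rep(0xff00) */
    x |= x << 8;
    x &= 0xFFFF0000FFFF0000ull;  /* rep(0xffff0000) */
    x |= x << 16;                /* stands in for deposit(t, 32, 16) */
    return x & 0xFFFFFFFF00000000ull;
}

For the full 128-bit VGNB source, the same routine would run once per
doubleword, with the low doubleword's 32 gathered bits landing in the
low half of the result, e.g. rt = gather_n2(hi) | (gather_n2(lo) >> 32).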
diff --git a/target/ppc/insn32.decode b/target/ppc/insn32.decode
index 0a3e39f3e9..7b629e81af 100644
--- a/target/ppc/insn32.decode
+++ b/target/ppc/insn32.decode
@@ -66,6 +66,9 @@
 &VX_mp          rt mp:bool vrb
 @VX_mp          ...... rt:5 .... mp:1 vrb:5 ........... &VX_mp
 
+&VX_n           rt vrb n
+@VX_n           ...... rt:5 .. n:3 vrb:5 ........... &VX_n
+
 &VX_tb_rc       vrt vrb rc:bool
 @VX_tb_rc       ...... vrt:5 ..... vrb:5 rc:1 .......... &VX_tb_rc
 
@@ -418,6 +421,8 @@ VCMPUQ          000100 ... -- ..... ..... 00100000001 @VX_bf
 
 ## Vector Bit Manipulation Instruction
 
+VGNB            000100 ..... -- ... ..... 10011001100 @VX_n
+
 VCFUGED         000100 ..... ..... ..... 10101001101 @VX
 VCLZDM          000100 ..... ..... ..... 11110000100 @VX
 VCTZDM          000100 ..... ..... ..... 11111000100 @VX
diff --git a/target/ppc/translate/vmx-impl.c.inc b/target/ppc/translate/vmx-impl.c.inc
index 78b277466a..43eb7ab70c 100644
--- a/target/ppc/translate/vmx-impl.c.inc
+++ b/target/ppc/translate/vmx-impl.c.inc
@@ -1438,6 +1438,50 @@ GEN_VXFORM_DUAL(vsplth, PPC_ALTIVEC, PPC_NONE,
 GEN_VXFORM_DUAL(vspltw, PPC_ALTIVEC, PPC_NONE,
                 vextractuw, PPC_NONE, PPC2_ISA300);
 
+static bool trans_VGNB(DisasContext *ctx, arg_VX_n *a)
+{
+    TCGv_i64 vrb, tmp, rt;
+    int in = 63, out = 63;
+
+    REQUIRE_INSNS_FLAGS2(ctx, ISA310);
+    REQUIRE_VECTOR(ctx);
+
+    if (a->n < 2) {
+        /*
+         * "N can be any value between 2 and 7, inclusive." Otherwise, the
+         * result is undefined, so we don't need to change RT. Also, N > 7 is
+         * impossible since the immediate field is 3 bits only.
+         */
+        return true;
+    }
+
+    vrb = tcg_temp_new_i64();
+    tmp = tcg_temp_new_i64();
+    rt = tcg_const_i64(0);
+
+    for (int dw = 1; dw >= 0; dw--) {
+        get_avr64(vrb, a->vrb, dw);
+        for (; in >= 0; in -= a->n, out--) {
+            if (in > out) {
+                tcg_gen_shri_i64(tmp, vrb, in - out);
+            } else {
+                tcg_gen_shli_i64(tmp, vrb, out - in);
+            }
+            tcg_gen_andi_i64(tmp, tmp, 1ULL << out);
+            tcg_gen_or_i64(rt, rt, tmp);
+        }
+        in += 64;
+    }
+
+    tcg_gen_trunc_i64_tl(cpu_gpr[a->rt], rt);
+
+    tcg_temp_free_i64(vrb);
+    tcg_temp_free_i64(tmp);
+    tcg_temp_free_i64(rt);
+
+    return true;
+}
+
 static bool do_vextdx(DisasContext *ctx, arg_VA *a, int size, bool right,
                       void (*gen_helper)(TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv_ptr, TCGv))
 {
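As a cross-check of the trans_VGNB loop above, the instruction's effect
for the defined range 2 <= N <= 7 can be written as a host-side
reference model. This is hypothetical code, not part of the patch; the
function name is ours:

#include <stdint.h>

/*
 * Hypothetical reference model of VGNB: starting at the most
 * significant bit of the 128-bit VRB value {hi, lo}, take every n-th
 * bit and pack the results MSB-first into the 64-bit result. The
 * in/out walk matches the translation loop in trans_VGNB, with "in"
 * here covering the full 128-bit index range instead of restarting
 * per doubleword.
 */
static uint64_t vgnb_ref(uint64_t hi, uint64_t lo, int n)
{
    uint64_t rt = 0;
    int out = 63;

    for (int in = 127; in >= 0; in -= n, out--) {
        uint64_t bit = in >= 64 ? (hi >> (in - 64)) & 1
                                : (lo >> in) & 1;
        rt |= bit << out;
    }
    return rt;
}

Since n >= 2 gives at most ceil(128/n) <= 64 iterations, "out" never
goes negative; for n < 2 the architecture leaves the result undefined,
which is why the patch simply returns without touching RT.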