diff mbox series

[RFC,19/42] target/mips/tx79: Introduce PCEQ* opcodes (Parallel Compare for Equal)

Message ID 20210214175912.732946-20-f4bug@amsat.org (mailing list archive)
State New, archived
Headers show
Series target/mips: Reintroduce the R5900 CPU (with more testing) | expand

Commit Message

Philippe Mathieu-Daudé Feb. 14, 2021, 5:58 p.m. UTC
Introduce the 'Parallel Compare for Equal' opcodes:

 - PCEQB (Parallel Compare for Equal Byte)
 - PCEQH (Parallel Compare for Equal Halfword)
 - PCEQW (Parallel Compare for Equal Word)

Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
---
 target/mips/tx79.decode      |  3 ++
 target/mips/tx79_translate.c | 66 ++++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+)

Comments

Richard Henderson Feb. 15, 2021, 8:32 p.m. UTC | #1
On 2/14/21 9:58 AM, Philippe Mathieu-Daudé wrote:
> +static bool trans_parallel_compare(DisasContext *ctx, arg_rtype *a,
> +                                   TCGCond cond, unsigned wlen)
> +{
> +    TCGv_i64 c0, c1, ax, bx, t0, t1, t2;
> +
> +    if (a->rd == 0) {
> +        /* nop */
> +        return true;
> +    }
> +
> +    c0 = tcg_const_tl(0);
> +    c1 = tcg_const_tl(0xffffffff);

Cheaper for most hosts to load -1 than a 32-bit value zero-extended to 64 bits.
 That said, you could also use

  setcond(t0, t0, t1, cond);
  neg(t0, t0);

> +    for (int i = 0; i < (64 / wlen); i++) {
> +        tcg_gen_sextract_i64(t0, ax, wlen * i, wlen);
> +        tcg_gen_sextract_i64(t1, bx, wlen * i, wlen);
> +        tcg_gen_movcond_i64(cond, t2, t1, t0, c1, c0);
> +        tcg_gen_deposit_i64(cpu_gpr[a->rd], cpu_gpr[a->rd], t2, wlen * i, wlen);
> +    }

For an accumulate loop like this, we'll get better results if the length of the
insert is the remaining length of the register.  That way, the first insert is
width 64, which turns into a move, so that the old value of rd is not used.
Further, we can use extract2 to replace the remaining length when deposit is
not available.

Also, while you will need this compare loop for GT, there's a cheaper way to
compute EQ, which we use in several places in QEMU.

/*
 * Parallel compare-for-equal on packed elements.
 *
 * For every element of size @esz in the 64-bit values @s and @t, set all
 * bits of the corresponding element of @d when the two elements are equal,
 * and clear them otherwise.  @d may be the same temporary as @s or @t: both
 * inputs are consumed into a private temporary before @d is written.
 *
 * The comparison is reduced to a per-element zero test on x = s ^ t.  The
 * classic haszero() form, (x - 0x01..01) & ~x & 0x80..80, only answers
 * "is any element zero" exactly: the borrow produced by a zero element
 * ripples into the element above it, so an element holding 1 directly above
 * a zero element would be reported as zero too.  The borrow-free variant
 * below judges every element independently.
 */
void gen_pceq(TCGv_i64 d, TCGv_i64 s, TCGv_i64 t, MemOp esz)
{
  const unsigned bits = 8 << esz;
  TCGv_i64 one = tcg_constant_i64(dup_const(esz, 1));
  TCGv_i64 msb = tcg_constant_i64(dup_const(esz, 1ull << (bits - 1)));
  TCGv_i64 low = tcg_constant_i64(dup_const(esz,
                                            MAKE_64BIT_MASK(0, bits - 1)));
  TCGv_i64 x = tcg_temp_new_i64();

  /* Turn s == t into x == 0. */
  tcg_gen_xor_i64(x, s, t);

  /*
   * Per element, msb - (x & low) lies in [1, msb], so the 64-bit
   * subtraction never borrows across an element boundary, and the
   * element's msb survives only when all of its low bits are zero.
   */
  tcg_gen_and_i64(d, x, low);
  tcg_gen_sub_i64(d, msb, d);

  /* The element is zero only if its own msb is clear as well. */
  tcg_gen_andc_i64(d, d, x);

  /*
   * Shift the msb down, then use muli to replicate
   * the one bit across the vector element.
   */
  tcg_gen_shri_i64(d, d, bits - 1);
  tcg_gen_and_i64(d, d, one);
  tcg_gen_muli_i64(d, d, MAKE_64BIT_MASK(0, bits));

  tcg_temp_free_i64(x);
}

In both cases, I think you should pull out helper functions and then use
trans_parallel_logic.


r~
diff mbox series

Patch

diff --git a/target/mips/tx79.decode b/target/mips/tx79.decode
index 98f21d33e3f..cfe721755ca 100644
--- a/target/mips/tx79.decode
+++ b/target/mips/tx79.decode
@@ -40,6 +40,9 @@  PEXTLB          011100 ..... ..... ..... 11010 001000   @rs_rt_rd
 
 # MMI1
 
+PCEQW           011100 ..... ..... ..... 00010 101000   @rs_rt_rd
+PCEQH           011100 ..... ..... ..... 00110 101000   @rs_rt_rd
+PCEQB           011100 ..... ..... ..... 01010 101000   @rs_rt_rd
 PEXTUW          011100 ..... ..... ..... 10010 101000   @rs_rt_rd
 
 # MMI2
diff --git a/target/mips/tx79_translate.c b/target/mips/tx79_translate.c
index 11968d6edab..f084faa48a7 100644
--- a/target/mips/tx79_translate.c
+++ b/target/mips/tx79_translate.c
@@ -279,6 +279,72 @@  static bool trans_PNOR(DisasContext *ctx, arg_rtype *a)
  * PCEQW   rd, rs, rt        Parallel Compare for Equal Word
  */
 
+static bool trans_parallel_compare(DisasContext *ctx, arg_rtype *a,
+                                   TCGCond cond, unsigned wlen)
+{
+    TCGv_i64 c0, c1, ax, bx, t0, t1, t2;
+
+    if (a->rd == 0) {
+        /* nop: destination is register 0, so no code is emitted */
+        return true;
+    }
+
+    c0 = tcg_const_tl(0);
+    c1 = tcg_const_tl(0xffffffff);
+    ax = tcg_temp_new_i64();
+    bx = tcg_temp_new_i64();
+    t0 = tcg_temp_new_i64();
+    t1 = tcg_temp_new_i64();
+    t2 = tcg_temp_new_i64();
+
+    /* Lower half (cpu_gpr): compare rs and rt one wlen-bit lane at a time */
+    gen_load_gpr(ax, a->rs);
+    gen_load_gpr(bx, a->rt);
+    for (int i = 0; i < (64 / wlen); i++) {
+        tcg_gen_sextract_i64(t0, ax, wlen * i, wlen);
+        tcg_gen_sextract_i64(t1, bx, wlen * i, wlen);
+        tcg_gen_movcond_i64(cond, t2, t1, t0, c1, c0);
+        tcg_gen_deposit_i64(cpu_gpr[a->rd], cpu_gpr[a->rd], t2, wlen * i, wlen);
+    }
+    /* Upper half (cpu_gpr_hi): same per-lane compare on the high parts */
+    gen_load_gpr_hi(ax, a->rs);
+    gen_load_gpr_hi(bx, a->rt);
+    for (int i = 0; i < (64 / wlen); i++) {
+        tcg_gen_sextract_i64(t0, ax, wlen * i, wlen);
+        tcg_gen_sextract_i64(t1, bx, wlen * i, wlen);
+        tcg_gen_movcond_i64(cond, t2, t1, t0, c1, c0);
+        tcg_gen_deposit_i64(cpu_gpr_hi[a->rd], cpu_gpr_hi[a->rd], t2, wlen * i, wlen);
+    }
+
+    tcg_temp_free(t2);
+    tcg_temp_free(t1);
+    tcg_temp_free(t0);
+    tcg_temp_free(bx);
+    tcg_temp_free(ax);
+    tcg_temp_free(c1);
+    tcg_temp_free(c0);
+
+    return true;
+}
+
+/* PCEQB: Parallel Compare for Equal Byte (TCG_COND_EQ on 8-bit lanes) */
+static bool trans_PCEQB(DisasContext *ctx, arg_rtype *a)
+{
+    return trans_parallel_compare(ctx, a, TCG_COND_EQ, 8);
+}
+
+/* PCEQH: Parallel Compare for Equal Halfword (TCG_COND_EQ on 16-bit lanes) */
+static bool trans_PCEQH(DisasContext *ctx, arg_rtype *a)
+{
+    return trans_parallel_compare(ctx, a, TCG_COND_EQ, 16);
+}
+
+/* PCEQW: Parallel Compare for Equal Word (TCG_COND_EQ on 32-bit lanes) */
+static bool trans_PCEQW(DisasContext *ctx, arg_rtype *a)
+{
+    return trans_parallel_compare(ctx, a, TCG_COND_EQ, 32);
+}
+
 /*
  *     LZC (1 instruction)
  *     -------------------