diff mbox series

[v3,09/30] tcg/loongarch64: Implement tcg_out_mov and tcg_out_movi

Message ID 20210922180927.666273-10-git@xen0n.name (mailing list archive)
State New, archived
Headers show
Series LoongArch64 port of QEMU TCG | expand

Commit Message

WANG Xuerui Sept. 22, 2021, 6:09 p.m. UTC
Signed-off-by: WANG Xuerui <git@xen0n.name>
---
 tcg/loongarch64/tcg-target.c.inc | 89 ++++++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)

Comments

Richard Henderson Sept. 22, 2021, 6:39 p.m. UTC | #1
On 9/22/21 11:09 AM, WANG Xuerui wrote:
> +    if (pc_offset == (int32_t)pc_offset) {
> +        tcg_target_long lo = sextreg(pc_offset, 0, 12);
> +        tcg_target_long hi = pc_offset - lo;
> +        tcg_out_opc_pcaddu12i(s, rd, hi >> 12);
> +        tcg_out_opc_addi_d(s, rd, rd, lo);

pc_offset = 0x7ffff800 will fail:

lo = 0xfffffffffffff800
hi = 0x0000000080000000

but hi will be interpreted as negative by pcaddu12i.

This is the same problem I pointed out with tcg_out_call, but with different constants.


r~
Richard Henderson Sept. 22, 2021, 6:51 p.m. UTC | #2
On 9/22/21 11:09 AM, WANG Xuerui wrote:
> +    if (sextreg(val, 0, 52) == val) {
> +        /*
> +         * Fits in 52-bits, upper bits are already properly sign-extended by
> +         * cu32i.d.
> +         */
> +        return;
> +    }
> +    tcg_out_opc_cu52i_d(s, rd, rd, top);
> +}

Oh, future improvement: constants with 52 low zero bits can be loaded with cu52i(rd, zero, 
val >> 52).

Which means there's a set of interesting constants:

   abc0_0000_0000_0def

	ori	rd, zero, 0xdef
	cu52i	rd, rd, 0xabc

   abcf_ffff_ffff_fdef

	cu52i	rd, zero, 0xabc - 1
	addi.d	rd, rd, 0xdef

Also,

> +    tcg_out_opc_lu12i_w(s, rd, upper);
> +    if (low != 0) {
> +        tcg_out_opc_ori(s, rd, rd, low & 0xfff);
> +    }

when upper == 0 and low != 0, we can omit the lu12i.


r~
WANG Xuerui Sept. 22, 2021, 7:02 p.m. UTC | #3
Hi Richard,

On 9/23/21 02:39, Richard Henderson wrote:
> On 9/22/21 11:09 AM, WANG Xuerui wrote:
>> +    if (pc_offset == (int32_t)pc_offset) {
>> +        tcg_target_long lo = sextreg(pc_offset, 0, 12);
>> +        tcg_target_long hi = pc_offset - lo;
>> +        tcg_out_opc_pcaddu12i(s, rd, hi >> 12);
>> +        tcg_out_opc_addi_d(s, rd, rd, lo);
>
> pc_offset = 0x7ffff800 will fail:
>
> lo = 0xfffffffffffff800
> hi = 0x0000000080000000
>
> but hi will be interpreted as negative by pcaddu12i.
>
> This is the same problem I pointed out with tcg_out_call, but with 
> different constants.
>
Hmm, I think I'll have to look into this, but not now (it's again 3 am 
here)... I'll try to come up with something tomorrow.
>
> r~
WANG Xuerui Sept. 23, 2021, 3:38 p.m. UTC | #4
Hi Richard,

On 9/23/21 02:51, Richard Henderson wrote:
> On 9/22/21 11:09 AM, WANG Xuerui wrote:
>> +    if (sextreg(val, 0, 52) == val) {
>> +        /*
>> +         * Fits in 52-bits, upper bits are already properly 
>> sign-extended by
>> +         * cu32i.d.
>> +         */
>> +        return;
>> +    }
>> +    tcg_out_opc_cu52i_d(s, rd, rd, top);
>> +}
>
> Oh, future improvement: constants with 52 low zero bits can be loaded 
> with cu52i(rd, zero, val >> 52).
>
> Which means there's a set of interesting constants:
>
>   abc0_0000_0000_0def
>
>     ori    rd, zero, 0xdef
>     cu52i    rd, rd, 0xabc
>
>   abcf_ffff_ffff_fdef
>
>     cu52i    rd, zero, 0xabc - 1
>     addi.d    rd, rd, 0xdef
I think I'll try to implement this in some kind of follow-up patch, yeah.
>
> Also,
>
>> +    tcg_out_opc_lu12i_w(s, rd, upper);
>> +    if (low != 0) {
>> +        tcg_out_opc_ori(s, rd, rd, low & 0xfff);
>> +    }
>
> when upper == 0 and low != 0, we can omit the lu12i.
>
This optimization can't be blindly implemented though; because cu32i.d 
only takes one input register, in case upper == low == 0 but higher != 
0, we would have no proper input. So in that case something like "move 
rd, zero" is necessary, and logic is a little bit complicated. I'll 
include this in v4 though.
>
> r~
Richard Henderson Sept. 23, 2021, 4:50 p.m. UTC | #5
On 9/22/21 11:09 AM, WANG Xuerui wrote:

Following up on previous, I suggest:

> +static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
> +                         tcg_target_long val)
> +{
> +    if (type == TCG_TYPE_I32) {
> +        val = (int32_t)val;
> +    }
> +
> +    /* Single-instruction cases.  */
> +    tcg_target_long low = sextreg(val, 0, 12);
> +    if (low == val) {
> +        /* val fits in simm12: addi.w rd, zero, val */
> +        tcg_out_opc_addi_w(s, rd, TCG_REG_ZERO, val);
> +        return;
> +    }
> +    if (0x800 <= val && val <= 0xfff) {
> +        /* val fits in uimm12: ori rd, zero, val */
> +        tcg_out_opc_ori(s, rd, TCG_REG_ZERO, val);
> +        return;
> +    }

> +    /* Test for PC-relative values that can be loaded faster.  */
> +    intptr_t pc_offset = tcg_pcrel_diff(s, (void *)val);
> +    if (pc_offset == sextreg(pc_offset, 0, 22) && (pc_offset & 3) == 0) {
> +        tcg_out_opc_pcaddu2i(s, rd, pc_offset >> 2);
> +        return;
> +    }

     /* Handle all 32-bit constants. */
     if (val == (int32_t)val) {
         tcg_out_opc_lu12i(s, rd, val >> 12);
         if (low) {
             tcg_out_opc_ori(s, rd, rd, val & 0xfff);
         }
         return;
     }

     /* Handle pc-relative values requiring 2 instructions. */
     intptr_t pc_lo = sextract64(pc_offset, 0, 12);
     intptr_t pc_hi = pc_offset - pc_low;
     if (pc_hi == (int32_t)pc_hi) {
         tcg_out_opc_pcaddu12i(s, rd, pc_hi >> 12);
         tcg_out_opc_addi_d(s, rd, rd, pc_lo);
         return;
     }

     /*
      * Choose signed low part if bit 13 is also set,
      * which gives us a chance of making more zeros.
      * Otherwise, let low be unsigned.
      */
     if ((val & 0x1800) != 0x1800) {
         low = val & 0xfff;
     }
     val -= low;

     tcg_target_long hi20 = sextract64(val, 12, 20);
     tcg_target_long hi32 = sextract64(val, 32, 20);
     tcg_target_long hi52 = sextract64(val, 52, 12);

     /*
      * If we can use the sign-extension of a previous
      * operation, suppress higher -1.
      */
     if (hi32 < 0 && hi52 == -1) {
         hi52 = 0;
     }
     if (hi20 < 0 && hi32 == -1) {
         hi32 = 0;
     }

     /* Initialize RD with the least non-zero component. */
     if (hi20) {
         tcg_out_opc_lu12i_w(s, rd, hi20 >> 12);
     } else if (hi32) {
         /* CU32I_D is modify in place, so RD must be initialized. */
         if (low < 0) {
             tcg_out_opc_addi_w(s, rd, TCG_REG_ZERO, low);
         } else {
             tcg_out_opc_ori(s, rd, TCG_REG_ZERO, low);
         }
         low = 0;
     } else {
         tcg_out_opc_cu52i_d(s, rd, TCG_REG_ZERO, hi52);
         hi52 = 0;
     }

     /* Assume that lu12i + ori are fusable */
     if (low > 0) {
         tcg_out_opc_ori(s, rd, rd, low);
     }

     /* Set the high 32 bits */
     if (hi32) {
         tcg_out_opc_cu32i_d(s, rd, hi32);
     }
     if (hi52) {
         tcg_out_opc_cu52i(s, rd, rd, hi52);
     }

     /*
      * Note that any subtraction must come last,
      * because cu32i and cu52i overwrite high bits,
      * and we have computed them as val - low.
      */
     if (low < 0) {
         tcg_out_opc_addi_d(s, rd, rd, low);
     }

Untested, and all bugs are mine, of course.

Try "qemu-system-ppc64 -D z -d in_asm,op_opt,out_asm".
You should see some masking constants like

  ---- 000000001daf2898
  and_i64 CA,r9,$0x7fffffffffffffff        dead: 2  pref=0xffff

   cu52i.d rd, zero, 0x800
   addi.d  rd, rd, -1

  ---- 000000001db0775c
  mov_i64 r26,$0x300000002                 sync: 0  dead: 0 1  pref=0xffff

   ori     rd, zero, 2
   cu32i   rd, 3


r~
WANG Xuerui Sept. 24, 2021, 3:08 p.m. UTC | #6
Hi Richard,

On 9/24/21 00:50, Richard Henderson wrote:
> On 9/22/21 11:09 AM, WANG Xuerui wrote:
>
> Following up on previous, I suggest:
>
>> +static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
>> +                         tcg_target_long val)
>> +{
>> +    if (type == TCG_TYPE_I32) {
>> +        val = (int32_t)val;
>> +    }
>> +
>> +    /* Single-instruction cases.  */
>> +    tcg_target_long low = sextreg(val, 0, 12);
>> +    if (low == val) {
>> +        /* val fits in simm12: addi.w rd, zero, val */
>> +        tcg_out_opc_addi_w(s, rd, TCG_REG_ZERO, val);
>> +        return;
>> +    }
>> +    if (0x800 <= val && val <= 0xfff) {
>> +        /* val fits in uimm12: ori rd, zero, val */
>> +        tcg_out_opc_ori(s, rd, TCG_REG_ZERO, val);
>> +        return;
>> +    }
>
>> +    /* Test for PC-relative values that can be loaded faster.  */
>> +    intptr_t pc_offset = tcg_pcrel_diff(s, (void *)val);
>> +    if (pc_offset == sextreg(pc_offset, 0, 22) && (pc_offset & 3) == 
>> 0) {
>> +        tcg_out_opc_pcaddu2i(s, rd, pc_offset >> 2);
>> +        return;
>> +    }
>
>     /* Handle all 32-bit constants. */
>     if (val == (int32_t)val) {
>         tcg_out_opc_lu12i(s, rd, val >> 12);
>         if (low) {
>             tcg_out_opc_ori(s, rd, rd, val & 0xfff);
>         }
>         return;
>     }
>
>     /* Handle pc-relative values requiring 2 instructions. */
>     intptr_t pc_lo = sextract64(pc_offset, 0, 12);
>     intptr_t pc_hi = pc_offset - pc_low;
>     if (pc_hi == (int32_t)pc_hi) {
>         tcg_out_opc_pcaddu12i(s, rd, pc_hi >> 12);
>         tcg_out_opc_addi_d(s, rd, rd, pc_lo);
>         return;
>     }
>
>     /*
>      * Choose signed low part if bit 13 is also set,
>      * which gives us a chance of making more zeros.
>      * Otherwise, let low be unsigned.
>      */
>     if ((val & 0x1800) != 0x1800) {
>         low = val & 0xfff;
>     }
>     val -= low;
>
>     tcg_target_long hi20 = sextract64(val, 12, 20);
>     tcg_target_long hi32 = sextract64(val, 32, 20);
>     tcg_target_long hi52 = sextract64(val, 52, 12);
>
>     /*
>      * If we can use the sign-extension of a previous
>      * operation, suppress higher -1.
>      */
>     if (hi32 < 0 && hi52 == -1) {
>         hi52 = 0;
>     }
>     if (hi20 < 0 && hi32 == -1) {
>         hi32 = 0;
>     }
>
>     /* Initialize RD with the least non-zero component. */
>     if (hi20) {
>         tcg_out_opc_lu12i_w(s, rd, hi20 >> 12);
>     } else if (hi32) {
>         /* CU32I_D is modify in place, so RD must be initialized. */
>         if (low < 0) {
>             tcg_out_opc_addi_w(s, rd, TCG_REG_ZERO, low);
>         } else {
>             tcg_out_opc_ori(s, rd, TCG_REG_ZERO, low);
>         }
>         low = 0;
>     } else {
>         tcg_out_opc_cu52i_d(s, rd, TCG_REG_ZERO, hi52);
>         hi52 = 0;
>     }
>
>     /* Assume that lu12i + ori are fusable */
>     if (low > 0) {
>         tcg_out_opc_ori(s, rd, rd, low);
>     }
>
>     /* Set the high 32 bits */
>     if (hi32) {
>         tcg_out_opc_cu32i_d(s, rd, hi32);
>     }
>     if (hi52) {
>         tcg_out_opc_cu52i(s, rd, rd, hi52);
>     }
>
>     /*
>      * Note that any subtraction must come last,
>      * because cu32i and cu52i overwrite high bits,
>      * and we have computed them as val - low.
>      */
>     if (low < 0) {
>         tcg_out_opc_addi_d(s, rd, rd, low);
>     }
>
> Untested, and all bugs are mine, of course.
>
> Try "qemu-system-ppc64 -D z -d in_asm,op_opt,out_asm".
> You should see some masking constants like
>
>  ---- 000000001daf2898
>  and_i64 CA,r9,$0x7fffffffffffffff        dead: 2  pref=0xffff
>
>   cu52i.d rd, zero, 0x800
>   addi.d  rd, rd, -1
>
>  ---- 000000001db0775c
>  mov_i64 r26,$0x300000002                 sync: 0  dead: 0 1 pref=0xffff
>
>   ori     rd, zero, 2
>   cu32i   rd, 3
>
Oops, for some reason I only received this at about 8 pm... I'll of 
course take advantage of the Saturday and compare the generated code for 
the cases, hopefully incorporating some of your ideas presented here. 
Thanks for the detailed reply!
>
> r~
Richard Henderson Sept. 24, 2021, 3:53 p.m. UTC | #7
On 9/24/21 11:08 AM, WANG Xuerui wrote:
> Oops, for some reason I only received this at about 8 pm...

That was my fault.  I wrote a bunch of stuff off-line yesterday while traveling, and the 
mail queue only flushed this morning.

I'll note there's a bug in my example code wrt initializing rd with addi, then overwriting 
with cu32i.d.

I like your v4 version of movi, with the high-bit-set predicate.  The only case I can 
think of that you miss is e.g. 0x7fffffffffffffff, which can be

	addi.w	rd, zero, -1
	cu52i.d	rd, rd, 0x7ff

One possibility is to extract a subroutine:

static void tcg_out_movi_i32(TCGContext *s, TCGReg rd, int32_t val)
{
     /* Single instruction cases */
     /* else lu12i.w + ori */
}

static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
                          tcg_target_long val)
{
     if (type == TCG_TYPE_I32 || val == (int32_t)val) {
         tcg_out_movi_i32(s, rd, val);
         return;
     }

     /* PC-relative cases */

     if (ctz64(val) >= 52) {
         tcg_out_opc_cu52i_d(s, rd, TCG_REG_ZERO, val >> 52);
         return;
     }

     /* Slow path.  Initialize the low 32-bits, then concat high bits. */
     tcg_out_movi_i32(s, rd, val);

     rd_high_bits_are_ones = (int32_t)val < 0);

     /* Your imm_part_needs_loading checks; rd is always written. */
}


r~
WANG Xuerui Sept. 24, 2021, 4:26 p.m. UTC | #8
Hi Richard,

On 9/24/21 23:53, Richard Henderson wrote:
> On 9/24/21 11:08 AM, WANG Xuerui wrote:
>> Oops, for some reason I only received this at about 8 pm...
>
> That was my fault.  I wrote a bunch of stuff off-line yesterday while 
> traveling, and the mail queue only flushed this morning.
>
> I'll note there's a bug in my example code wrt initializing rd with 
> addi, then overwriting with cu32i.d.
>
> I like your v4 version of movi, with the high-bit-set predicate. The 
> only case I can think of that you miss is e.g. 0x7fffffffffffffff, 
> which can be
>
>     addi.w    rd, zero, -1
>     cu52i.d    rd, rd, 0x7ff
>
> One possibility is to extract a subroutine:
>
> static void tcg_out_movi_i32(TCGContext *s, TCGReg rd, int32_t val)
> {
>     /* Single instruction cases */
>     /* else lu12i.w + ori */
> }
>
> static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
>                          tcg_target_long val)
> {
>     if (type == TCG_TYPE_I32 || val == (int32_t)val) {
>         tcg_out_movi_i32(s, rd, val);
>         return;
>     }
>
>     /* PC-relative cases */
>
>     if (ctz64(val) >= 52) {
>         tcg_out_opc_cu52i_d(s, rd, TCG_REG_ZERO, val >> 52);
>         return;
>     }
>
>     /* Slow path.  Initialize the low 32-bits, then concat high bits. */
>     tcg_out_movi_i32(s, rd, val);
>
>     rd_high_bits_are_ones = (int32_t)val < 0);
>
>     /* Your imm_part_needs_loading checks; rd is always written. */
> }
>
This is so impressive understanding of the LoongArch assembly, working 
around cu32i.d's single operand limitation so nicely. I'll just 
(shamelessly) take this for v5, thanks again!
>
> r~
diff mbox series

Patch

diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc
index 338b772732..6d28a29070 100644
--- a/tcg/loongarch64/tcg-target.c.inc
+++ b/tcg/loongarch64/tcg-target.c.inc
@@ -247,6 +247,93 @@  static void tcg_out_mb(TCGContext *s, TCGArg a0)
     tcg_out_opc_dbar(s, 0);
 }
 
+static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
+{
+    if (ret == arg) {
+        return true;
+    }
+    switch (type) {
+    case TCG_TYPE_I32:
+    case TCG_TYPE_I64:
+        /*
+         * Conventional register-register move used in LoongArch is
+         * `or dst, src, zero`.
+         */
+        tcg_out_opc_or(s, ret, arg, TCG_REG_ZERO);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+    return true;
+}
+
+static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
+                         tcg_target_long val)
+{
+    if (type == TCG_TYPE_I32) {
+        val = (int32_t)val;
+    }
+
+    /* Single-instruction cases.  */
+    tcg_target_long low = sextreg(val, 0, 12);
+    if (low == val) {
+        /* val fits in simm12: addi.w rd, zero, val */
+        tcg_out_opc_addi_w(s, rd, TCG_REG_ZERO, val);
+        return;
+    }
+    if (0x800 <= val && val <= 0xfff) {
+        /* val fits in uimm12: ori rd, zero, val */
+        tcg_out_opc_ori(s, rd, TCG_REG_ZERO, val);
+        return;
+    }
+
+    /* Test for PC-relative values that can be loaded faster.  */
+    intptr_t pc_offset = tcg_pcrel_diff(s, (void *)val);
+    if (pc_offset == sextreg(pc_offset, 0, 22) && (pc_offset & 3) == 0) {
+        tcg_out_opc_pcaddu2i(s, rd, pc_offset >> 2);
+        return;
+    }
+    if (pc_offset == (int32_t)pc_offset) {
+        tcg_target_long lo = sextreg(pc_offset, 0, 12);
+        tcg_target_long hi = pc_offset - lo;
+        tcg_out_opc_pcaddu12i(s, rd, hi >> 12);
+        tcg_out_opc_addi_d(s, rd, rd, lo);
+        return;
+    }
+
+    /*
+     * Slow path: at most lu12i.w + ori + cu32i.d + cu52i.d.
+     *
+     * Chop upper bits into 3 immediate-field-sized segments respectively.
+     */
+    tcg_target_long upper = sextreg(val, 12, 20);
+    tcg_target_long higher = sextreg(val, 32, 20);
+    tcg_target_long top = sextreg(val, 52, 12);
+
+    tcg_out_opc_lu12i_w(s, rd, upper);
+    if (low != 0) {
+        tcg_out_opc_ori(s, rd, rd, low & 0xfff);
+    }
+
+    if (sextreg(val, 0, 32) == val) {
+        /*
+         * Fits in 32-bits, upper bits are already properly sign-extended by
+         * lu12i.w.
+         */
+        return;
+    }
+    tcg_out_opc_cu32i_d(s, rd, higher);
+
+    if (sextreg(val, 0, 52) == val) {
+        /*
+         * Fits in 52-bits, upper bits are already properly sign-extended by
+         * cu32i.d.
+         */
+        return;
+    }
+    tcg_out_opc_cu52i_d(s, rd, rd, top);
+}
+
 /*
  * Entry-points
  */
@@ -262,6 +349,8 @@  static void tcg_out_op(TCGContext *s, TCGOpcode opc,
         tcg_out_mb(s, a0);
         break;
 
+    case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
+    case INDEX_op_mov_i64:
     default:
         g_assert_not_reached();
     }