[14/15] tcg-mips: Use mipsr6 instructions in branches
diff mbox

Message ID 1455014403-10742-15-git-send-email-rth@twiddle.net
State New
Headers show

Commit Message

Richard Henderson Feb. 9, 2016, 10:40 a.m. UTC
Using compact branches, when possible, avoids a delay slot nop.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 include/elf.h         |   4 +
 tcg/mips/tcg-target.c | 216 +++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 172 insertions(+), 48 deletions(-)

Comments

James Hogan Feb. 9, 2016, 4:22 p.m. UTC | #1
Hi Richard,

On Tue, Feb 09, 2016 at 09:40:02PM +1100, Richard Henderson wrote:
> Using compact branches, when possible, avoids a delay slot nop.
> 
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  include/elf.h         |   4 +
>  tcg/mips/tcg-target.c | 216 +++++++++++++++++++++++++++++++++++++++-----------
>  2 files changed, 172 insertions(+), 48 deletions(-)
> 
> diff --git a/include/elf.h b/include/elf.h
> index 1098d21..6e52ba0 100644
> --- a/include/elf.h
> +++ b/include/elf.h
> @@ -352,6 +352,10 @@ typedef struct {
>  #define R_MIPS_CALLHI16		30
>  #define R_MIPS_CALLLO16		31
>  /*
> + * Incomplete list of MIPS R6 relocation types.
> + */
> +#define R_MIPS_PC26_S2          61
> +/*
>   * This range is reserved for vendor specific relocations.
>   */
>  #define R_MIPS_LOVENDOR		100
> diff --git a/tcg/mips/tcg-target.c b/tcg/mips/tcg-target.c
> index e0972ba..06e15d4 100644
> --- a/tcg/mips/tcg-target.c
> +++ b/tcg/mips/tcg-target.c
> @@ -152,6 +152,19 @@ static inline void reloc_pc16(tcg_insn_unit *pc, tcg_insn_unit *target)
>      *pc = deposit32(*pc, 0, 16, reloc_pc16_val(pc, target));
>  }
>  
> +static inline uint32_t reloc_pc26_val(tcg_insn_unit *pc, tcg_insn_unit *target)
> +{
> +    /* Let the compiler perform the right-shift as part of the arithmetic.  */
> +    ptrdiff_t disp = target - (pc + 1);
> +    assert(disp == sextract32(disp, 0, 26));
> +    return disp & 0x1ffffff;
> +}
> +
> +static inline void reloc_pc26(tcg_insn_unit *pc, tcg_insn_unit *target)
> +{
> +    *pc = deposit32(*pc, 0, 26, reloc_pc16_val(pc, target));
> +}
> +
>  static inline uint32_t reloc_26_val(tcg_insn_unit *pc, tcg_insn_unit *target)
>  {
>      assert((((uintptr_t)pc ^ (uintptr_t)target) & 0xf0000000) == 0);
> @@ -166,9 +179,17 @@ static inline void reloc_26(tcg_insn_unit *pc, tcg_insn_unit *target)
>  static void patch_reloc(tcg_insn_unit *code_ptr, int type,
>                          intptr_t value, intptr_t addend)
>  {
> -    assert(type == R_MIPS_PC16);
>      assert(addend == 0);
> -    reloc_pc16(code_ptr, (tcg_insn_unit *)value);
> +    switch (type) {
> +    case R_MIPS_PC16:
> +        reloc_pc16(code_ptr, (tcg_insn_unit *)value);
> +	break;
> +    case R_MIPS_PC26_S2:
> +        reloc_pc26(code_ptr, (tcg_insn_unit *)value);
> +	break;
> +    default:
> +        tcg_abort();
> +    }
>  }
>  
>  #define TCG_CT_CONST_ZERO 0x100
> @@ -309,7 +330,10 @@ typedef enum {
>      OPC_BEQ      = 004 << 26,
>      OPC_BNE      = 005 << 26,
>      OPC_BLEZ     = 006 << 26,
> +    OPC_BGEUC    = OPC_BLEZ,    /* R6: rs != 0, rt != 0, rs != rt */
>      OPC_BGTZ     = 007 << 26,
> +    OPC_BLTUC    = OPC_BGTZ,    /* R6: rs != 0, rt != 0, rs != rt */
> +    OPC_BEQC     = 010 << 26,   /* R6: rs > rt */
>      OPC_ADDIU    = 011 << 26,
>      OPC_SLTI     = 012 << 26,
>      OPC_SLTIU    = 013 << 26,
> @@ -318,6 +342,9 @@ typedef enum {
>      OPC_XORI     = 016 << 26,
>      OPC_LUI      = 017 << 26,
>      OPC_AUI      = OPC_LUI,
> +    OPC_BGEC     = 026 << 26,
> +    OPC_BLTC     = 027 << 26,
> +    OPC_BNEC     = 030 << 26,   /* R6: rs > rt */
>      OPC_DADDIU   = 031 << 26,
>      OPC_DAUI     = 035 << 26,
>      OPC_LB       = 040 << 26,
> @@ -329,6 +356,7 @@ typedef enum {
>      OPC_SB       = 050 << 26,
>      OPC_SH       = 051 << 26,
>      OPC_SW       = 053 << 26,
> +    OPC_BC       = 062 << 26,
>      OPC_LD       = 067 << 26,
>      OPC_SD       = 077 << 26,
>  
> @@ -527,6 +555,17 @@ static inline void tcg_out_opc_br(TCGContext *s, MIPSInsn opc,
>      tcg_out_opc_imm(s, opc, rt, rs, offset);
>  }
>  
> +static void tcg_out_opc_br_pc16(TCGContext *s, MIPSInsn opc,
> +                                TCGReg arg1, TCGReg arg2, TCGLabel *l)
> +{
> +    tcg_out_opc_br(s, opc, arg1, arg2);
> +    if (l->has_value) {
> +        reloc_pc16(s->code_ptr - 1, l->u.value_ptr);
> +    } else {
> +        tcg_out_reloc(s, s->code_ptr - 1, R_MIPS_PC16, l, 0);
> +    }
> +}
> +
>  /*
>   * Type sa
>   */
> @@ -1002,59 +1041,129 @@ static void tcg_out_brcond(TCGContext *s, TCGCond cond, TCGReg arg1,
>          [TCG_COND_GE] = OPC_BGEZ,
>      };
>  
> -    MIPSInsn s_opc = OPC_SLTU;
> -    MIPSInsn b_opc;
> -    int cmp_map;
> +    MIPSInsn b_opc = 0;
> +    bool compact = false;
> +    int cmp_map, t;
> +
> +    /* We shouldn't expect to have arg1 == arg2, as the TCG optimizer
> +       should have eliminated all such.  However, the R6 encodings do
> +       not allow this situation, so e.g. if the optimizer is disabled
> +       we must fall back to normal compares.  */
> +    if (use_mips32r6_instructions && arg1 != arg2) {
> +        switch (cond) {
> +        case TCG_COND_EQ:
> +        case TCG_COND_NE:
> +            if (arg1 < arg2) {
> +                t = arg1, arg1 = arg2, arg2 = t;
> +            }
> +            b_opc = cond == TCG_COND_EQ ? OPC_BEQC : OPC_BNEC;
> +            compact = true;
> +            break;
>  
> -    switch (cond) {
> -    case TCG_COND_EQ:
> -        b_opc = OPC_BEQ;
> -        break;
> -    case TCG_COND_NE:
> -        b_opc = OPC_BNE;
> -        break;
> +        case TCG_COND_LE:
> +        case TCG_COND_GT:
> +            if (arg1 == TCG_REG_ZERO) {
> +                break;
> +            }
> +            /* Swap arguments to turn LE to GE or GT to LT.
> +               This also produces BLEZC/BGTZC when arg2 = 0.  */
> +            t = arg1, arg1 = arg2, arg2 = t;
> +            b_opc = cond == TCG_COND_LE ? OPC_BGEC : OPC_BLTC;
> +            compact = true;
> +            break;
>  
> -    case TCG_COND_LT:
> -    case TCG_COND_GT:
> -    case TCG_COND_LE:
> -    case TCG_COND_GE:
> -        if (arg2 == 0) {
> -            b_opc = b_zero[cond];
> -            arg2 = arg1;
> -            arg1 = 0;
> +        case TCG_COND_GE:
> +        case TCG_COND_LT:
> +            if (arg1 == TCG_REG_ZERO) {
> +                break;
> +            }
> +            /* The encoding of BGEZC/BLTZC requires rs = rt.  */
> +            if (arg2 == TCG_REG_ZERO) {
> +                arg2 = arg1;
> +            }
> +            b_opc = cond == TCG_COND_GE ? OPC_BGEC : OPC_BLTC;
> +            compact = true;
>              break;
> -        }
> -        s_opc = OPC_SLT;
> -        /* FALLTHRU */
>  
> -    case TCG_COND_LTU:
> -    case TCG_COND_GTU:
> -    case TCG_COND_LEU:
> -    case TCG_COND_GEU:
> -        cmp_map = mips_cmp_map[cond];
> -        if (cmp_map & MIPS_CMP_SWAP) {
> -            TCGReg t = arg1;
> -            arg1 = arg2;
> -            arg2 = t;
> +        case TCG_COND_LEU:
> +            /* Swap arguments to turn LE to GE.  */
> +            t = arg1, arg1 = arg2, arg2 = t;
> +            /* FALLTHRU */
> +        case TCG_COND_GEU:
> +            b_opc = OPC_BGEUC;
> +            compact = true;
> +            break;
> +
> +        case TCG_COND_GTU:
> +            /* Swap arguments to turn GT to LT.  */
> +            t = arg1, arg1 = arg2, arg2 = t;
> +            /* FALLTHRU */
> +        case TCG_COND_LTU:
> +            b_opc = OPC_BLTUC;
> +            compact = true;
> +            break;
> +
> +        default:
> +            tcg_abort();
> +            break;
>          }
> -        tcg_out_opc_reg(s, s_opc, TCG_TMP0, arg1, arg2);
> -        b_opc = (cmp_map & MIPS_CMP_INV ? OPC_BEQ : OPC_BNE);
> -        arg1 = TCG_TMP0;
> -        arg2 = TCG_REG_ZERO;
> -        break;
> +    }
>  
> -    default:
> -        tcg_abort();
> -        break;
> +    if (b_opc == 0) {
> +        MIPSInsn s_opc = OPC_SLTU;
> +
> +        switch (cond) {
> +        case TCG_COND_EQ:
> +            b_opc = OPC_BEQ;
> +            break;
> +        case TCG_COND_NE:
> +            b_opc = OPC_BNE;
> +            break;
> +
> +        case TCG_COND_LT:
> +        case TCG_COND_GT:
> +        case TCG_COND_LE:
> +        case TCG_COND_GE:
> +            if (arg2 == 0) {
> +                b_opc = b_zero[cond];
> +                arg2 = arg1;
> +                arg1 = 0;
> +                break;
> +            }
> +            s_opc = OPC_SLT;
> +            /* FALLTHRU */
> +
> +        case TCG_COND_LTU:
> +        case TCG_COND_GTU:
> +        case TCG_COND_LEU:
> +        case TCG_COND_GEU:
> +            cmp_map = mips_cmp_map[cond];
> +            if (cmp_map & MIPS_CMP_SWAP) {
> +                TCGReg t = arg1;
> +                arg1 = arg2;
> +                arg2 = t;
> +            }
> +            tcg_out_opc_reg(s, s_opc, TCG_TMP0, arg1, arg2);
> +            if (use_mips32r6_instructions) {
> +                b_opc = (cmp_map & MIPS_CMP_INV ? OPC_BEQC : OPC_BNEC);
> +                compact = true;
> +            } else {
> +                b_opc = (cmp_map & MIPS_CMP_INV ? OPC_BEQ : OPC_BNE);
> +            }
> +            arg1 = TCG_TMP0;
> +            arg2 = TCG_REG_ZERO;
> +            break;
> +
> +        default:
> +            tcg_abort();
> +            break;
> +        }
>      }
>  
> -    tcg_out_opc_br(s, b_opc, arg1, arg2);
> -    if (l->has_value) {
> -        reloc_pc16(s->code_ptr - 1, l->u.value_ptr);
> -    } else {
> -        tcg_out_reloc(s, s->code_ptr - 1, R_MIPS_PC16, l, 0);
> +    tcg_out_opc_br_pc16(s, b_opc, arg1, arg2, l);
> +    if (!compact) {
> +        tcg_out_nop(s);

Unfortunately this isn't quite right. As far as I understand them,
conditional compact branches have a forbidden slot after them which
isn't permitted to contain a control transfer instruction (CTI).
Executing a conditional compact branch with a CTI in the forbidden slot
is required to signal a reserved instruction, but only if the branch is
not taken (giving user process a SIGILL).

E.g. 

Program received signal SIGILL, Illegal instruction.
[Switching to Thread 0xfff1c32e00 (LWP 204)]
0x000000fff30b0068 in code_gen_buffer ()
(gdb) disas/r
Dump of assembler code for function code_gen_buffer:
   0x000000fff30b0064 <+0>:     f8 ff 11 8e     lw      s1,-8(s0)
=> 0x000000fff30b0068 <+4>:     08 00 11 60     bnezalc s1,0xfff30b008c <code_gen_buffer+40>
   0x000000fff30b006c <+8>:     1d c0 c2 08     j       0xfff30b0074 <code_gen_buffer+16>
   0x000000fff30b0070 <+12>:    00 00 00 00     nop

(gdb) set *0x000000fff30b006c=0
(gdb) disas/r
Dump of assembler code for function code_gen_buffer:
   0x000000fff30b0064 <+0>:     f8 ff 11 8e     lw      s1,-8(s0)
=> 0x000000fff30b0068 <+4>:     08 00 11 60     bnezalc s1,0xfff30b008c <code_gen_buffer+40>
   0x000000fff30b006c <+8>:     00 00 00 00     nop
   0x000000fff30b0070 <+12>:    00 00 00 00     nop

(gdb) stepi
0x000000fff30b0070 in code_gen_buffer ()
(gdb) disas/r
Dump of assembler code for function code_gen_buffer:
   0x000000fff30b0064 <+0>:     f8 ff 11 8e     lw      s1,-8(s0)
   0x000000fff30b0068 <+4>:     08 00 11 60     bnezalc s1,0xfff30b008c <code_gen_buffer+40>
   0x000000fff30b006c <+8>:     00 00 00 00     nop
=> 0x000000fff30b0070 <+12>:    00 00 00 00     nop

So to be correct + efficient, it should only put the nop in if the next
generated instruction is a CTI. I imagine that would be a bit messy /
fragile, but maybe doable? I haven't looked too deeply.

Cheers
James


>      }
> -    tcg_out_nop(s);
>  }
>  
>  static TCGReg tcg_out_reduce_eq2(TCGContext *s, TCGReg tmp0, TCGReg tmp1,
> @@ -1826,8 +1935,19 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
>          s->tb_next_offset[a0] = tcg_current_code_size(s);
>          break;
>      case INDEX_op_br:
> -        tcg_out_brcond(s, TCG_COND_EQ, TCG_REG_ZERO, TCG_REG_ZERO,
> -                       arg_label(a0));
> +        {
> +            TCGLabel *l = arg_label(a0);
> +            if (use_mips32r6_instructions) {
> +                tcg_out32(s, OPC_BC);
> +                if (l->has_value) {
> +                    reloc_pc26(s->code_ptr - 1, l->u.value_ptr);
> +                } else {
> +                    tcg_out_reloc(s, s->code_ptr - 1, R_MIPS_PC26_S2, l, 0);
> +                }
> +            } else {
> +                tcg_out_opc_br_pc16(s, OPC_BEQ, TCG_REG_ZERO, TCG_REG_ZERO, l);
> +            }
> +        }
>          break;
>  
>      case INDEX_op_ld8u_i32:
> -- 
> 2.5.0
>
Richard Henderson Feb. 9, 2016, 5:13 p.m. UTC | #2
On 02/10/2016 03:22 AM, James Hogan wrote:
> So to be correct + efficient, it should only put the nop in if the next
> generated instruction is a CTI. I imagine that would be a bit messy /
> fragile, but maybe doable? I haven't looked too deeply.

Ouch, I didn't notice this about these insns.

I suppose this might be rare enough that it's still worth thinking about.  Off 
the top of my head I can't think of any way to save extra state, but perhaps 
just looking back at the previous insn's major opcode is enough when emitting 
any forbidden insn.

For the moment, let's just drop this patch (and probably the one for calls too, 
for the same reason?)


r~
Maciej W. Rozycki Feb. 9, 2016, 6:46 p.m. UTC | #3
On Tue, 9 Feb 2016, Richard Henderson wrote:

> > So to be correct + efficient, it should only put the nop in if the next
> > generated instruction is a CTI. I imagine that would be a bit messy /
> > fragile, but maybe doable? I haven't looked too deeply.
> 
> Ouch, I didn't notice this about these insns.
> 
> I suppose this might be rare enough that it's still worth thinking about.  Off
> the top of my head I can't think of any way to save extra state, but perhaps
> just looking back at the previous insn's major opcode is enough when emitting
> any forbidden insn.

 FWIW I think this is a reasonable approach, applying to the regular MIPS 
ISA where the size of the instruction word is fixed so you can look at the 
preceding instruction in a reproducible manner.  And in the microMIPSr6 
ISA there are no forbidden slots, so no issue there in the first place.

  Maciej
James Hogan Feb. 10, 2016, 12:20 a.m. UTC | #4
Hi Richard,

On Tue, Feb 09, 2016 at 04:22:34PM +0000, James Hogan wrote:
> (gdb) disas/r
> Dump of assembler code for function code_gen_buffer:
>    0x000000fff30b0064 <+0>:     f8 ff 11 8e     lw      s1,-8(s0)
> => 0x000000fff30b0068 <+4>:     08 00 11 60     bnezalc s1,0xfff30b008c <code_gen_buffer+40>

Note also that this seems to be the wrong encoding anyway. It is
encoding the "and-link" variation which overwrites $31 with PC+4.

Cheers
James


>    0x000000fff30b006c <+8>:     1d c0 c2 08     j       0xfff30b0074 <code_gen_buffer+16>
>    0x000000fff30b0070 <+12>:    00 00 00 00     nop
> 
> (gdb) set *0x000000fff30b006c=0
> (gdb) disas/r
> Dump of assembler code for function code_gen_buffer:
>    0x000000fff30b0064 <+0>:     f8 ff 11 8e     lw      s1,-8(s0)
> => 0x000000fff30b0068 <+4>:     08 00 11 60     bnezalc s1,0xfff30b008c <code_gen_buffer+40>
>    0x000000fff30b006c <+8>:     00 00 00 00     nop
>    0x000000fff30b0070 <+12>:    00 00 00 00     nop

Patch
diff mbox

diff --git a/include/elf.h b/include/elf.h
index 1098d21..6e52ba0 100644
--- a/include/elf.h
+++ b/include/elf.h
@@ -352,6 +352,10 @@  typedef struct {
 #define R_MIPS_CALLHI16		30
 #define R_MIPS_CALLLO16		31
 /*
+ * Incomplete list of MIPS R6 relocation types.
+ */
+#define R_MIPS_PC26_S2          61
+/*
  * This range is reserved for vendor specific relocations.
  */
 #define R_MIPS_LOVENDOR		100
diff --git a/tcg/mips/tcg-target.c b/tcg/mips/tcg-target.c
index e0972ba..06e15d4 100644
--- a/tcg/mips/tcg-target.c
+++ b/tcg/mips/tcg-target.c
@@ -152,6 +152,19 @@  static inline void reloc_pc16(tcg_insn_unit *pc, tcg_insn_unit *target)
     *pc = deposit32(*pc, 0, 16, reloc_pc16_val(pc, target));
 }
 
+static inline uint32_t reloc_pc26_val(tcg_insn_unit *pc, tcg_insn_unit *target)
+{
+    /* Let the compiler perform the right-shift as part of the arithmetic.  */
+    ptrdiff_t disp = target - (pc + 1);
+    assert(disp == sextract32(disp, 0, 26));
+    return disp & 0x1ffffff;
+}
+
+static inline void reloc_pc26(tcg_insn_unit *pc, tcg_insn_unit *target)
+{
+    *pc = deposit32(*pc, 0, 26, reloc_pc16_val(pc, target));
+}
+
 static inline uint32_t reloc_26_val(tcg_insn_unit *pc, tcg_insn_unit *target)
 {
     assert((((uintptr_t)pc ^ (uintptr_t)target) & 0xf0000000) == 0);
@@ -166,9 +179,17 @@  static inline void reloc_26(tcg_insn_unit *pc, tcg_insn_unit *target)
 static void patch_reloc(tcg_insn_unit *code_ptr, int type,
                         intptr_t value, intptr_t addend)
 {
-    assert(type == R_MIPS_PC16);
     assert(addend == 0);
-    reloc_pc16(code_ptr, (tcg_insn_unit *)value);
+    switch (type) {
+    case R_MIPS_PC16:
+        reloc_pc16(code_ptr, (tcg_insn_unit *)value);
+	break;
+    case R_MIPS_PC26_S2:
+        reloc_pc26(code_ptr, (tcg_insn_unit *)value);
+	break;
+    default:
+        tcg_abort();
+    }
 }
 
 #define TCG_CT_CONST_ZERO 0x100
@@ -309,7 +330,10 @@  typedef enum {
     OPC_BEQ      = 004 << 26,
     OPC_BNE      = 005 << 26,
     OPC_BLEZ     = 006 << 26,
+    OPC_BGEUC    = OPC_BLEZ,    /* R6: rs != 0, rt != 0, rs != rt */
     OPC_BGTZ     = 007 << 26,
+    OPC_BLTUC    = OPC_BGTZ,    /* R6: rs != 0, rt != 0, rs != rt */
+    OPC_BEQC     = 010 << 26,   /* R6: rs > rt */
     OPC_ADDIU    = 011 << 26,
     OPC_SLTI     = 012 << 26,
     OPC_SLTIU    = 013 << 26,
@@ -318,6 +342,9 @@  typedef enum {
     OPC_XORI     = 016 << 26,
     OPC_LUI      = 017 << 26,
     OPC_AUI      = OPC_LUI,
+    OPC_BGEC     = 026 << 26,
+    OPC_BLTC     = 027 << 26,
+    OPC_BNEC     = 030 << 26,   /* R6: rs > rt */
     OPC_DADDIU   = 031 << 26,
     OPC_DAUI     = 035 << 26,
     OPC_LB       = 040 << 26,
@@ -329,6 +356,7 @@  typedef enum {
     OPC_SB       = 050 << 26,
     OPC_SH       = 051 << 26,
     OPC_SW       = 053 << 26,
+    OPC_BC       = 062 << 26,
     OPC_LD       = 067 << 26,
     OPC_SD       = 077 << 26,
 
@@ -527,6 +555,17 @@  static inline void tcg_out_opc_br(TCGContext *s, MIPSInsn opc,
     tcg_out_opc_imm(s, opc, rt, rs, offset);
 }
 
+static void tcg_out_opc_br_pc16(TCGContext *s, MIPSInsn opc,
+                                TCGReg arg1, TCGReg arg2, TCGLabel *l)
+{
+    tcg_out_opc_br(s, opc, arg1, arg2);
+    if (l->has_value) {
+        reloc_pc16(s->code_ptr - 1, l->u.value_ptr);
+    } else {
+        tcg_out_reloc(s, s->code_ptr - 1, R_MIPS_PC16, l, 0);
+    }
+}
+
 /*
  * Type sa
  */
@@ -1002,59 +1041,129 @@  static void tcg_out_brcond(TCGContext *s, TCGCond cond, TCGReg arg1,
         [TCG_COND_GE] = OPC_BGEZ,
     };
 
-    MIPSInsn s_opc = OPC_SLTU;
-    MIPSInsn b_opc;
-    int cmp_map;
+    MIPSInsn b_opc = 0;
+    bool compact = false;
+    int cmp_map, t;
+
+    /* We shouldn't expect to have arg1 == arg2, as the TCG optimizer
+       should have eliminated all such.  However, the R6 encodings do
+       not allow this situation, so e.g. if the optimizer is disabled
+       we must fall back to normal compares.  */
+    if (use_mips32r6_instructions && arg1 != arg2) {
+        switch (cond) {
+        case TCG_COND_EQ:
+        case TCG_COND_NE:
+            if (arg1 < arg2) {
+                t = arg1, arg1 = arg2, arg2 = t;
+            }
+            b_opc = cond == TCG_COND_EQ ? OPC_BEQC : OPC_BNEC;
+            compact = true;
+            break;
 
-    switch (cond) {
-    case TCG_COND_EQ:
-        b_opc = OPC_BEQ;
-        break;
-    case TCG_COND_NE:
-        b_opc = OPC_BNE;
-        break;
+        case TCG_COND_LE:
+        case TCG_COND_GT:
+            if (arg1 == TCG_REG_ZERO) {
+                break;
+            }
+            /* Swap arguments to turn LE to GE or GT to LT.
+               This also produces BLEZC/BGTZC when arg2 = 0.  */
+            t = arg1, arg1 = arg2, arg2 = t;
+            b_opc = cond == TCG_COND_LE ? OPC_BGEC : OPC_BLTC;
+            compact = true;
+            break;
 
-    case TCG_COND_LT:
-    case TCG_COND_GT:
-    case TCG_COND_LE:
-    case TCG_COND_GE:
-        if (arg2 == 0) {
-            b_opc = b_zero[cond];
-            arg2 = arg1;
-            arg1 = 0;
+        case TCG_COND_GE:
+        case TCG_COND_LT:
+            if (arg1 == TCG_REG_ZERO) {
+                break;
+            }
+            /* The encoding of BGEZC/BLTZC requires rs = rt.  */
+            if (arg2 == TCG_REG_ZERO) {
+                arg2 = arg1;
+            }
+            b_opc = cond == TCG_COND_GE ? OPC_BGEC : OPC_BLTC;
+            compact = true;
             break;
-        }
-        s_opc = OPC_SLT;
-        /* FALLTHRU */
 
-    case TCG_COND_LTU:
-    case TCG_COND_GTU:
-    case TCG_COND_LEU:
-    case TCG_COND_GEU:
-        cmp_map = mips_cmp_map[cond];
-        if (cmp_map & MIPS_CMP_SWAP) {
-            TCGReg t = arg1;
-            arg1 = arg2;
-            arg2 = t;
+        case TCG_COND_LEU:
+            /* Swap arguments to turn LE to GE.  */
+            t = arg1, arg1 = arg2, arg2 = t;
+            /* FALLTHRU */
+        case TCG_COND_GEU:
+            b_opc = OPC_BGEUC;
+            compact = true;
+            break;
+
+        case TCG_COND_GTU:
+            /* Swap arguments to turn GT to LT.  */
+            t = arg1, arg1 = arg2, arg2 = t;
+            /* FALLTHRU */
+        case TCG_COND_LTU:
+            b_opc = OPC_BLTUC;
+            compact = true;
+            break;
+
+        default:
+            tcg_abort();
+            break;
         }
-        tcg_out_opc_reg(s, s_opc, TCG_TMP0, arg1, arg2);
-        b_opc = (cmp_map & MIPS_CMP_INV ? OPC_BEQ : OPC_BNE);
-        arg1 = TCG_TMP0;
-        arg2 = TCG_REG_ZERO;
-        break;
+    }
 
-    default:
-        tcg_abort();
-        break;
+    if (b_opc == 0) {
+        MIPSInsn s_opc = OPC_SLTU;
+
+        switch (cond) {
+        case TCG_COND_EQ:
+            b_opc = OPC_BEQ;
+            break;
+        case TCG_COND_NE:
+            b_opc = OPC_BNE;
+            break;
+
+        case TCG_COND_LT:
+        case TCG_COND_GT:
+        case TCG_COND_LE:
+        case TCG_COND_GE:
+            if (arg2 == 0) {
+                b_opc = b_zero[cond];
+                arg2 = arg1;
+                arg1 = 0;
+                break;
+            }
+            s_opc = OPC_SLT;
+            /* FALLTHRU */
+
+        case TCG_COND_LTU:
+        case TCG_COND_GTU:
+        case TCG_COND_LEU:
+        case TCG_COND_GEU:
+            cmp_map = mips_cmp_map[cond];
+            if (cmp_map & MIPS_CMP_SWAP) {
+                TCGReg t = arg1;
+                arg1 = arg2;
+                arg2 = t;
+            }
+            tcg_out_opc_reg(s, s_opc, TCG_TMP0, arg1, arg2);
+            if (use_mips32r6_instructions) {
+                b_opc = (cmp_map & MIPS_CMP_INV ? OPC_BEQC : OPC_BNEC);
+                compact = true;
+            } else {
+                b_opc = (cmp_map & MIPS_CMP_INV ? OPC_BEQ : OPC_BNE);
+            }
+            arg1 = TCG_TMP0;
+            arg2 = TCG_REG_ZERO;
+            break;
+
+        default:
+            tcg_abort();
+            break;
+        }
     }
 
-    tcg_out_opc_br(s, b_opc, arg1, arg2);
-    if (l->has_value) {
-        reloc_pc16(s->code_ptr - 1, l->u.value_ptr);
-    } else {
-        tcg_out_reloc(s, s->code_ptr - 1, R_MIPS_PC16, l, 0);
+    tcg_out_opc_br_pc16(s, b_opc, arg1, arg2, l);
+    if (!compact) {
+        tcg_out_nop(s);
     }
-    tcg_out_nop(s);
 }
 
 static TCGReg tcg_out_reduce_eq2(TCGContext *s, TCGReg tmp0, TCGReg tmp1,
@@ -1826,8 +1935,19 @@  static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         s->tb_next_offset[a0] = tcg_current_code_size(s);
         break;
     case INDEX_op_br:
-        tcg_out_brcond(s, TCG_COND_EQ, TCG_REG_ZERO, TCG_REG_ZERO,
-                       arg_label(a0));
+        {
+            TCGLabel *l = arg_label(a0);
+            if (use_mips32r6_instructions) {
+                tcg_out32(s, OPC_BC);
+                if (l->has_value) {
+                    reloc_pc26(s->code_ptr - 1, l->u.value_ptr);
+                } else {
+                    tcg_out_reloc(s, s->code_ptr - 1, R_MIPS_PC26_S2, l, 0);
+                }
+            } else {
+                tcg_out_opc_br_pc16(s, OPC_BEQ, TCG_REG_ZERO, TCG_REG_ZERO, l);
+            }
+        }
         break;
 
     case INDEX_op_ld8u_i32: