[10/15] tcg-mips: Move bswap code to subroutines
diff mbox

Message ID 1455014403-10742-11-git-send-email-rth@twiddle.net
State New
Headers show

Commit Message

Richard Henderson Feb. 9, 2016, 10:39 a.m. UTC
Without the mips32r2 / mips64r2 instructions to perform swapping,
the 32-bit and 64-bit bswap sequences are quite large.  Move them to
subroutines in the prologue block to minimize code bloat.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/mips/tcg-target.c | 389 ++++++++++++++++++++++++++++++++++----------------
 tcg/mips/tcg-target.h |   6 +-
 2 files changed, 271 insertions(+), 124 deletions(-)

Patch
diff mbox

diff --git a/tcg/mips/tcg-target.c b/tcg/mips/tcg-target.c
index b8c5d90..97f9251 100644
--- a/tcg/mips/tcg-target.c
+++ b/tcg/mips/tcg-target.c
@@ -135,6 +135,9 @@  static const TCGReg tcg_target_call_oarg_regs[2] = {
 };
 
 static tcg_insn_unit *tb_ret_addr;
+static tcg_insn_unit *bswap32s_addr;
+static tcg_insn_unit *bswap32u_addr;
+static tcg_insn_unit *bswap64_addr;
 
 static inline uint32_t reloc_pc16_val(tcg_insn_unit *pc, tcg_insn_unit *target)
 {
@@ -187,6 +190,7 @@  static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
     ct_str = *pct_str;
     switch(ct_str[0]) {
     case 'r':
+    do_default:
         ct->ct |= TCG_CT_REG;
         tcg_regset_set(ct->u.regs, 0xffffffff);
         break;
@@ -208,6 +212,7 @@  static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
     case 'S': /* qemu_st constraint */
         ct->ct |= TCG_CT_REG;
         tcg_regset_set(ct->u.regs, 0xffffffff);
+        tcg_regset_reset_reg(ct->u.regs, TCG_REG_V0);
         tcg_regset_reset_reg(ct->u.regs, TCG_REG_A0);
 #if defined(CONFIG_SOFTMMU)
         if (TCG_TARGET_REG_BITS < TARGET_LONG_BITS) {
@@ -218,6 +223,22 @@  static int target_parse_constraint(TCGArgConstraint *ct, const char **pct_str)
         }
 #endif
         break;
+    case 'v': /* bswap output constraint */
+        if (use_mips32r2_instructions) {
+            goto do_default;
+        }
+        ct->ct |= TCG_CT_REG;
+        tcg_regset_clear(ct->u.regs);
+        tcg_regset_set_reg(ct->u.regs, TCG_REG_V0);
+        break;
+    case 'a': /* bswap input constraint */
+        if (use_mips32r2_instructions) {
+            goto do_default;
+        }
+        ct->ct |= TCG_CT_REG;
+        tcg_regset_clear(ct->u.regs);
+        tcg_regset_set_reg(ct->u.regs, TCG_REG_A0);
+        break;
     case 'I':
         ct->ct |= TCG_CT_CONST_U16;
         break;
@@ -618,29 +639,23 @@  static inline void tcg_out_bswap16s(TCGContext *s, TCGReg ret, TCGReg arg)
     }
 }
 
+static void tcg_out_bswap_subr(TCGContext *s, tcg_insn_unit *sub)
+{
+    if (!tcg_out_opc_jmp(s, OPC_JAL, sub)) {
+        tcg_abort();
+    }
+}
+
 static inline void tcg_out_bswap32(TCGContext *s, TCGReg ret, TCGReg arg)
 {
     if (use_mips32r2_instructions) {
         tcg_out_opc_reg(s, OPC_WSBH, ret, 0, arg);
         tcg_out_opc_sa(s, OPC_ROTR, ret, ret, 16);
     } else {
-        /* ret and arg must be different and can't be register at */
-        if (ret == arg || ret == TCG_TMP0 || arg == TCG_TMP0) {
-            tcg_abort();
-        }
-
-        tcg_out_opc_sa(s, OPC_SLL, ret, arg, 24);
-
-        tcg_out_opc_sa(s, OPC_SRL, TCG_TMP0, arg, 24);
-        tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_TMP0);
-
-        tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP0, arg, 0xff00);
-        tcg_out_opc_sa(s, OPC_SLL, TCG_TMP0, TCG_TMP0, 8);
-        tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_TMP0);
-
-        tcg_out_opc_sa(s, OPC_SRL, TCG_TMP0, arg, 8);
-        tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP0, TCG_TMP0, 0xff00);
-        tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_TMP0);
+        assert(ret == TCG_REG_V0);
+        tcg_out_bswap_subr(s, bswap32s_addr);
+        /* delay slot */
+        tcg_out_opc_reg(s, OPC_OR, TCG_REG_A0, arg, TCG_REG_ZERO);
     }
 }
 
@@ -648,26 +663,13 @@  static inline void tcg_out_bswap32u(TCGContext *s, TCGReg ret, TCGReg arg)
 {
     if (use_mips32r2_instructions) {
         tcg_out_opc_reg(s, OPC_DSBH, ret, 0, arg);
-        tcg_out_opc_reg(s, OPC_DSHD, ret, 0, arg);
+        tcg_out_opc_reg(s, OPC_DSHD, ret, 0, ret);
         tcg_out_dsrl(s, ret, ret, 32);
     } else {
-        /* ret and arg must be different and can't be register at */
-        if (ret == arg || ret == TCG_TMP0 || arg == TCG_TMP0) {
-            tcg_abort();
-        }
-
-        tcg_out_dsll(s, ret, arg, 24);
-
-        tcg_out_dsrl(s, TCG_TMP0, arg, 24);
-        tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_TMP0);
-
-        tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP0, arg, 0xff00);
-        tcg_out_dsll(s, TCG_TMP0, TCG_TMP0, 8);
-        tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_TMP0);
-
-        tcg_out_dsrl(s, TCG_TMP0, arg, 8);
-        tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP0, TCG_TMP0, 0xff00);
-        tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_TMP0);
+        assert(ret == TCG_REG_V0);
+        tcg_out_bswap_subr(s, bswap32u_addr);
+        /* delay slot */
+        tcg_out_opc_reg(s, OPC_OR, TCG_REG_A0, arg, TCG_REG_ZERO);
     }
 }
 
@@ -677,44 +679,10 @@  static void tcg_out_bswap64(TCGContext *s, TCGReg ret, TCGReg arg)
         tcg_out_opc_reg(s, OPC_DSBH, ret, 0, arg);
         tcg_out_opc_reg(s, OPC_DSHD, ret, 0, arg);
     } else {
-        /* ret and arg must be different and can't be either tmp reg.  */
-        if (ret == arg || ret == TCG_TMP0 || arg == TCG_TMP0
-            || ret == TCG_TMP1 || arg == TCG_TMP1) {
-            tcg_abort();
-        }
-
-        /* ??? Consider just making this a subroutine.  */
-
-        /* A... ...H -> H... ...A */
-        tcg_out_dsll(s, ret, arg, 56);
-        tcg_out_dsrl(s, TCG_TMP0, arg, 56);
-        tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_TMP0);
-
-        /* .B.. ..G. -> .G.. ..B. */
-        tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP0, arg, 0xff00);
-        tcg_out_dsrl(s, TCG_TMP1, arg, 40);
-        tcg_out_dsll(s, TCG_TMP0, TCG_TMP0, 40);
-        tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP1, TCG_TMP1, 0xff00);
-        tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_TMP0);
-        tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_TMP1);
-
-        /* ..CD .... -> .... DC.. */
-        tcg_out_dsrl(s, TCG_TMP0, arg, 32);
-        tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP1, TCG_TMP0, 0xff00);
-        tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP0, TCG_TMP0, 0x00ff);
-        tcg_out_dsll(s, TCG_TMP1, TCG_TMP1, 8);
-        tcg_out_dsll(s, TCG_TMP0, TCG_TMP0, 24);
-        tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_TMP1);
-        tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_TMP0);
-
-        /* .... EF.. -> ..FE .... */
-        tcg_out_dsrl(s, TCG_TMP0, arg, 16);
-        tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP1, TCG_TMP0, 0xff00);
-        tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP0, TCG_TMP0, 0x00ff);
-        tcg_out_dsll(s, TCG_TMP1, TCG_TMP1, 24);
-        tcg_out_dsll(s, TCG_TMP0, TCG_TMP0, 40);
-        tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_TMP1);
-        tcg_out_opc_reg(s, OPC_OR, ret, ret, TCG_TMP0);
+        assert(ret == TCG_REG_V0);
+        tcg_out_bswap_subr(s, bswap64_addr);
+        /* delay slot */
+        tcg_out_opc_reg(s, OPC_OR, TCG_REG_A0, arg, TCG_REG_ZERO);
     }
 }
 
@@ -1425,72 +1393,111 @@  static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
 }
 #endif
 
-static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
+static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg lo, TCGReg hi,
                                    TCGReg base, TCGMemOp opc, bool is_64)
 {
+    bool hi_first = MIPS_BE ? hi != base : lo == base;
+
     switch (opc & (MO_SSIZE | MO_BSWAP)) {
     case MO_UB:
-        tcg_out_opc_imm(s, OPC_LBU, datalo, base, 0);
+        tcg_out_opc_imm(s, OPC_LBU, lo, base, 0);
         break;
     case MO_SB:
-        tcg_out_opc_imm(s, OPC_LB, datalo, base, 0);
+        tcg_out_opc_imm(s, OPC_LB, lo, base, 0);
         break;
 
     case MO_UW | MO_BSWAP:
         tcg_out_opc_imm(s, OPC_LHU, TCG_TMP1, base, 0);
-        tcg_out_bswap16(s, datalo, TCG_TMP1);
+        tcg_out_bswap16(s, lo, TCG_TMP1);
         break;
     case MO_UW:
-        tcg_out_opc_imm(s, OPC_LHU, datalo, base, 0);
+        tcg_out_opc_imm(s, OPC_LHU, lo, base, 0);
         break;
 
     case MO_SW | MO_BSWAP:
         tcg_out_opc_imm(s, OPC_LHU, TCG_TMP1, base, 0);
-        tcg_out_bswap16s(s, datalo, TCG_TMP1);
+        tcg_out_bswap16s(s, lo, TCG_TMP1);
         break;
     case MO_SW:
-        tcg_out_opc_imm(s, OPC_LH, datalo, base, 0);
+        tcg_out_opc_imm(s, OPC_LH, lo, base, 0);
         break;
 
     case MO_UL | MO_BSWAP:
         if (TCG_TARGET_REG_BITS == 64 && is_64) {
-            tcg_out_opc_imm(s, OPC_LWU, TCG_TMP1, base, 0);
-            tcg_out_bswap32u(s, datalo, TCG_TMP1);
+            if (use_mips32r2_instructions) {
+                tcg_out_opc_imm(s, OPC_LWU, lo, base, 0);
+                tcg_out_bswap32u(s, lo, lo);
+            } else {
+                tcg_out_bswap_subr(s, bswap32u_addr);
+                /* delay slot */
+                tcg_out_opc_imm(s, OPC_LWU, TCG_REG_A0, base, 0);
+                tcg_out_mov(s, TCG_TYPE_I64, lo, TCG_REG_V0);
+            }
             break;
         }
         /* FALLTHRU */
     case MO_SL | MO_BSWAP:
-        tcg_out_opc_imm(s, OPC_LW, TCG_TMP1, base, 0);
-        tcg_out_bswap32(s, datalo, TCG_TMP1);
+        if (use_mips32r2_instructions) {
+            tcg_out_opc_imm(s, OPC_LW, lo, base, 0);
+            tcg_out_bswap32(s, lo, lo);
+        } else {
+            tcg_out_bswap_subr(s, bswap32s_addr);
+            /* delay slot */
+            tcg_out_opc_imm(s, OPC_LW, TCG_REG_A0, base, 0);
+            tcg_out_mov(s, TCG_TYPE_I32, lo, TCG_REG_V0);
+        }
         break;
 
     case MO_UL:
         if (TCG_TARGET_REG_BITS == 64 && is_64) {
-            tcg_out_opc_imm(s, OPC_LWU, datalo, base, 0);
+            tcg_out_opc_imm(s, OPC_LWU, lo, base, 0);
             break;
         }
         /* FALLTHRU */
     case MO_SL:
-        tcg_out_opc_imm(s, OPC_LW, datalo, base, 0);
+        tcg_out_opc_imm(s, OPC_LW, lo, base, 0);
         break;
 
     case MO_Q | MO_BSWAP:
-        if (TCG_TARGET_REG_BITS == 32) {
-            tcg_out_opc_imm(s, OPC_LW, TCG_TMP1, base, HI_OFF);
-            tcg_out_bswap32(s, datalo, TCG_TMP1);
-            tcg_out_opc_imm(s, OPC_LW, TCG_TMP1, base, LO_OFF);
-            tcg_out_bswap32(s, datahi, TCG_TMP1);
+        if (TCG_TARGET_REG_BITS == 64 && use_mips32r2_instructions) {
+            tcg_out_opc_imm(s, OPC_LD, lo, base, 0);
+            tcg_out_bswap64(s, lo, lo);
+        } else if (TCG_TARGET_REG_BITS == 64) {
+            tcg_out_bswap_subr(s, bswap64_addr);
+            /* delay slot */
+            tcg_out_opc_imm(s, OPC_LD, TCG_REG_A0, base, 0);
+            tcg_out_mov(s, TCG_TYPE_I64, lo, TCG_REG_V0);
+        } else if (use_mips32r2_instructions) {
+            tcg_out_opc_imm(s, OPC_LW, TCG_TMP0, base, 0);
+            tcg_out_opc_imm(s, OPC_LW, TCG_TMP1, base, 4);
+            tcg_out_opc_reg(s, OPC_WSBH, TCG_TMP0, 0, TCG_TMP0);
+            tcg_out_opc_reg(s, OPC_WSBH, TCG_TMP1, 0, TCG_TMP1);
+            tcg_out_opc_sa(s, OPC_ROTR, MIPS_BE ? lo : hi, TCG_TMP0, 16);
+            tcg_out_opc_sa(s, OPC_ROTR, MIPS_BE ? hi : lo, TCG_TMP1, 16);
         } else {
-            tcg_out_opc_imm(s, OPC_LD, TCG_REG_V0, base, 0);
-            tcg_out_bswap64(s, datalo, TCG_REG_V0);
+            tcg_out_bswap_subr(s, bswap32s_addr);
+            /* delay slot: load the first word as the subroutine input */
+            tcg_out_opc_imm(s, OPC_LW, TCG_REG_A0, base,
+                            hi_first ? LO_OFF : HI_OFF);
+            tcg_out_mov(s, TCG_TYPE_I32, TCG_REG_A2, TCG_REG_V0);
+
+            tcg_out_bswap_subr(s, bswap32s_addr);
+            /* delay slot: load the other word, not the same one again */
+            tcg_out_opc_imm(s, OPC_LW, TCG_REG_A0, base,
+                            hi_first ? HI_OFF : LO_OFF);
+            tcg_out_mov(s, TCG_TYPE_I32, hi_first ? lo : hi, TCG_REG_V0);
+            tcg_out_mov(s, TCG_TYPE_I32, hi_first ? hi : lo, TCG_REG_A2);
         }
         break;
     case MO_Q:
-        if (TCG_TARGET_REG_BITS == 32) {
-            tcg_out_opc_imm(s, OPC_LW, datalo, base, LO_OFF);
-            tcg_out_opc_imm(s, OPC_LW, datahi, base, HI_OFF);
+        if (TCG_TARGET_REG_BITS == 64) {
+            tcg_out_opc_imm(s, OPC_LD, lo, base, 0);
+        } else if (hi_first) {
+            tcg_out_opc_imm(s, OPC_LW, hi, base, HI_OFF);
+            tcg_out_opc_imm(s, OPC_LW, lo, base, LO_OFF);
         } else {
-            tcg_out_opc_imm(s, OPC_LD, datalo, base, 0);
+            tcg_out_opc_imm(s, OPC_LW, lo, base, LO_OFF);
+            tcg_out_opc_imm(s, OPC_LW, hi, base, HI_OFF);
         }
         break;
     default:
@@ -1540,54 +1547,62 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is_64)
 #endif
 }
 
-static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
+static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg lo, TCGReg hi,
                                    TCGReg base, TCGMemOp opc)
 {
-    if ((datalo | datahi) == 0) {
+    /* Don't clutter the code below with checks to avoid bswapping ZERO.  */
+    if ((lo | hi) == 0) {
         opc &= ~MO_BSWAP;
     }
 
     switch (opc & (MO_SIZE | MO_BSWAP)) {
     case MO_8:
-        tcg_out_opc_imm(s, OPC_SB, datalo, base, 0);
+        tcg_out_opc_imm(s, OPC_SB, lo, base, 0);
         break;
 
     case MO_16 | MO_BSWAP:
-        tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP1, datalo, 0xffff);
+        tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP1, lo, 0xffff);
         tcg_out_bswap16(s, TCG_TMP1, TCG_TMP1);
-        datalo = TCG_TMP1;
+        lo = TCG_TMP1;
         /* FALLTHRU */
     case MO_16:
-        tcg_out_opc_imm(s, OPC_SH, datalo, base, 0);
+        tcg_out_opc_imm(s, OPC_SH, lo, base, 0);
         break;
 
     case MO_32 | MO_BSWAP:
-        tcg_out_bswap32(s, TCG_TMP1, datalo);
-        datalo = TCG_TMP1;
+        tcg_out_bswap32(s, TCG_REG_V0, lo);
+        lo = TCG_REG_V0;
         /* FALLTHRU */
     case MO_32:
-        tcg_out_opc_imm(s, OPC_SW, datalo, base, 0);
+        tcg_out_opc_imm(s, OPC_SW, lo, base, 0);
         break;
 
     case MO_64 | MO_BSWAP:
-        if (TCG_TARGET_REG_BITS == 32) {
-            tcg_out_bswap32(s, TCG_TMP1, datalo);
-            datalo = TCG_TMP1;
-            tcg_out_opc_imm(s, OPC_SW, datalo, base, HI_OFF);
-            tcg_out_bswap32(s, TCG_TMP1, datahi);
-            datahi = TCG_TMP1;
-            tcg_out_opc_imm(s, OPC_SW, datahi, base, LO_OFF);
+        if (TCG_TARGET_REG_BITS == 64) {
+            tcg_out_bswap64(s, TCG_REG_V0, lo);
+            lo = TCG_REG_V0;
+        } else if (use_mips32r2_instructions) {
+            tcg_out_opc_reg(s, OPC_WSBH, TCG_TMP0, 0, MIPS_BE ? lo : hi);
+            tcg_out_opc_reg(s, OPC_WSBH, TCG_TMP1, 0, MIPS_BE ? hi : lo);
+            tcg_out_opc_sa(s, OPC_ROTR, TCG_TMP0, TCG_TMP0, 16);
+            tcg_out_opc_sa(s, OPC_ROTR, TCG_TMP1, TCG_TMP1, 16);
+            tcg_out_opc_imm(s, OPC_SW, TCG_TMP0, base, 0);
+            tcg_out_opc_imm(s, OPC_SW, TCG_TMP1, base, 4);
+            break;
+        } else {
+            tcg_out_bswap32(s, TCG_REG_V0, lo);
+            tcg_out_opc_imm(s, OPC_SW, TCG_REG_V0, base, HI_OFF);
+            tcg_out_bswap32(s, TCG_REG_V0, hi);
+            tcg_out_opc_imm(s, OPC_SW, TCG_REG_V0, base, LO_OFF);
             break;
         }
-        tcg_out_bswap64(s, TCG_REG_A1, datalo);
-        datalo = TCG_REG_A1;
         /* FALLTHRU */
     case MO_64:
         if (TCG_TARGET_REG_BITS == 32) {
-            tcg_out_opc_imm(s, OPC_SW, datalo, base, LO_OFF);
-            tcg_out_opc_imm(s, OPC_SW, datahi, base, HI_OFF);
+            tcg_out_opc_imm(s, OPC_SW, MIPS_BE ? hi : lo, base, 0);
+            tcg_out_opc_imm(s, OPC_SW, MIPS_BE ? lo : hi, base, 4);
         } else {
-            tcg_out_opc_imm(s, OPC_SD, datalo, base, 0);
+            tcg_out_opc_imm(s, OPC_SD, lo, base, 0);
         }
         break;
 
@@ -2117,7 +2132,7 @@  static const TCGTargetOpDef mips_op_defs[] = {
     { INDEX_op_rotl_i32, { "r", "rZ", "ri" } },
 
     { INDEX_op_bswap16_i32, { "r", "r" } },
-    { INDEX_op_bswap32_i32, { "r", "r" } },
+    { INDEX_op_bswap32_i32, { "v", "a" } },
 
     { INDEX_op_ext8s_i32, { "r", "rZ" } },
     { INDEX_op_ext16s_i32, { "r", "rZ" } },
@@ -2179,8 +2194,8 @@  static const TCGTargetOpDef mips_op_defs[] = {
     { INDEX_op_rotl_i64, { "r", "rZ", "ri" } },
 
     { INDEX_op_bswap16_i64, { "r", "r" } },
-    { INDEX_op_bswap32_i64, { "r", "r" } },
-    { INDEX_op_bswap64_i64, { "r", "r" } },
+    { INDEX_op_bswap32_i64, { "v", "a" } },
+    { INDEX_op_bswap64_i64, { "v", "a" } },
 
     { INDEX_op_ext8s_i64, { "r", "rZ" } },
     { INDEX_op_ext16s_i64, { "r", "rZ" } },
@@ -2324,6 +2339,16 @@  static void tcg_target_detect_isa(void)
 /* We're expecting to be able to use an immediate for frame allocation.  */
 QEMU_BUILD_BUG_ON(FRAME_SIZE > 0x7fff);
 
+static tcg_insn_unit *align_code_ptr(TCGContext *s)
+{
+    uintptr_t p = (uintptr_t)s->code_ptr;
+    if (p & 15) {
+        p = (p + 15) & -16;
+        s->code_ptr = (void *)p;
+    }
+    return s->code_ptr;
+}
+
 /* Generate global QEMU prologue and epilogue code */
 static void tcg_target_qemu_prologue(TCGContext *s)
 {
@@ -2353,6 +2378,128 @@  static void tcg_target_qemu_prologue(TCGContext *s)
     tcg_out_opc_reg(s, OPC_JR, 0, TCG_REG_RA, 0);
     /* delay slot */
     tcg_out_opc_imm(s, ALIAS_PADDI, TCG_REG_SP, TCG_REG_SP, FRAME_SIZE);
+
+    if (use_mips32r2_instructions) {
+        return;
+    }
+
+    /* Bswap subroutines: Input in TCG_REG_A0, output in TCG_REG_V0;
+       clobbers TCG_TMP1, TCG_TMP0.  */
+
+    bswap32s_addr = align_code_ptr(s);
+
+    /*
+     * bswap32s -- signed 32-bit swap.  a0 = abcd.
+     */
+    /* v0 = (ssss)d000 */
+    tcg_out_opc_sa(s, OPC_SLL, TCG_REG_V0, TCG_REG_A0, 24);
+    /* t1 = 000a */
+    tcg_out_opc_sa(s, OPC_SRL, TCG_TMP1, TCG_REG_A0, 24);
+    /* t0 = 00c0 */
+    tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP0, TCG_REG_A0, 0xff00);
+    /* v0 = d00a */
+    tcg_out_opc_reg(s, OPC_OR, TCG_REG_V0, TCG_REG_V0, TCG_TMP1);
+    /* t1 = 0abc */
+    tcg_out_opc_sa(s, OPC_SRL, TCG_TMP1, TCG_REG_A0, 8);
+    /* t0 = 0c00 */
+    tcg_out_opc_sa(s, OPC_SLL, TCG_TMP0, TCG_TMP0, 8);
+    /* t1 = 00b0 */
+    tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP1, TCG_TMP1, 0xff00);
+    /* v0 = dc0a */
+    tcg_out_opc_reg(s, OPC_OR, TCG_REG_V0, TCG_REG_V0, TCG_TMP0);
+    tcg_out_opc_reg(s, OPC_JR, 0, TCG_REG_RA, 0);
+    /* v0 = dcba -- delay slot */
+    tcg_out_opc_reg(s, OPC_OR, TCG_REG_V0, TCG_REG_V0, TCG_TMP1);
+
+    if (TCG_TARGET_REG_BITS == 32) {
+        return;
+    }
+
+    bswap32u_addr = align_code_ptr(s);
+
+    /*
+     * bswap32u -- unsigned 32-bit swap.  a0 = ....abcd.
+     */
+    /* t1 = (0000)000d */
+    tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP1, TCG_REG_A0, 0xff);
+    /* v0 = 000a */
+    tcg_out_opc_sa(s, OPC_SRL, TCG_REG_V0, TCG_REG_A0, 24);
+    /* t1 = (0000)d000 */
+    tcg_out_dsll(s, TCG_TMP1, TCG_TMP1, 24);
+    /* t0 = 00c0 */
+    tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP0, TCG_REG_A0, 0xff00);
+    /* v0 = d00a */
+    tcg_out_opc_reg(s, OPC_OR, TCG_REG_V0, TCG_REG_V0, TCG_TMP1);
+    /* t1 = 0abc */
+    tcg_out_opc_sa(s, OPC_SRL, TCG_TMP1, TCG_REG_A0, 8);
+    /* t0 = 0c00 */
+    tcg_out_opc_sa(s, OPC_SLL, TCG_TMP0, TCG_TMP0, 8);
+    /* t1 = 00b0 */
+    tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP1, TCG_TMP1, 0xff00);
+    /* v0 = dc0a */
+    tcg_out_opc_reg(s, OPC_OR, TCG_REG_V0, TCG_REG_V0, TCG_TMP0);
+    tcg_out_opc_reg(s, OPC_JR, 0, TCG_REG_RA, 0);
+    /* v0 = dcba -- delay slot */
+    tcg_out_opc_reg(s, OPC_OR, TCG_REG_V0, TCG_REG_V0, TCG_TMP1);
+
+    bswap64_addr = align_code_ptr(s);
+
+    /*
+     * bswap64 -- 64-bit swap.  a0 = abcdefgh
+     */
+    /* v0 = h0000000 */
+    tcg_out_dsll(s, TCG_REG_V0, TCG_REG_A0, 56);
+    /* t1 = 0000000a */
+    tcg_out_dsrl(s, TCG_TMP1, TCG_REG_A0, 56);
+
+    /* t0 = 000000g0 */
+    tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP0, TCG_REG_A0, 0xff00);
+    /* v0 = h000000a */
+    tcg_out_opc_reg(s, OPC_OR, TCG_REG_V0, TCG_REG_V0, TCG_TMP1);
+    /* t1 = 00000abc */
+    tcg_out_dsrl(s, TCG_TMP1, TCG_REG_A0, 40);
+    /* t0 = 0g000000 */
+    tcg_out_dsll(s, TCG_TMP0, TCG_TMP0, 40);
+    /* t1 = 000000b0 */
+    tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP1, TCG_TMP1, 0xff00);
+
+    /* v0 = hg00000a */
+    tcg_out_opc_reg(s, OPC_OR, TCG_REG_V0, TCG_REG_V0, TCG_TMP0);
+    /* t0 = 0000abcd */
+    tcg_out_dsrl(s, TCG_TMP0, TCG_REG_A0, 32);
+    /* v0 = hg0000ba */
+    tcg_out_opc_reg(s, OPC_OR, TCG_REG_V0, TCG_REG_V0, TCG_TMP1);
+
+    /* t1 = 000000c0 */
+    tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP1, TCG_TMP0, 0xff00);
+    /* t0 = 0000000d */
+    tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP0, TCG_TMP0, 0x00ff);
+    /* t1 = 00000c00 */
+    tcg_out_dsll(s, TCG_TMP1, TCG_TMP1, 8);
+    /* t0 = 0000d000 */
+    tcg_out_dsll(s, TCG_TMP0, TCG_TMP0, 24);
+
+    /* v0 = hg000cba */
+    tcg_out_opc_reg(s, OPC_OR, TCG_REG_V0, TCG_REG_V0, TCG_TMP1);
+    /* t1 = 00abcdef */
+    tcg_out_dsrl(s, TCG_TMP1, TCG_REG_A0, 16);
+    /* v0 = hg00dcba */
+    tcg_out_opc_reg(s, OPC_OR, TCG_REG_V0, TCG_REG_V0, TCG_TMP0);
+
+    /* t0 = 0000000f */
+    tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP0, TCG_TMP1, 0x00ff);
+    /* t1 = 000000e0 */
+    tcg_out_opc_imm(s, OPC_ANDI, TCG_TMP1, TCG_TMP1, 0xff00);
+    /* t0 = 00f00000 */
+    tcg_out_dsll(s, TCG_TMP0, TCG_TMP0, 40);
+    /* t1 = 000e0000 */
+    tcg_out_dsll(s, TCG_TMP1, TCG_TMP1, 24);
+
+    /* v0 = hgf0dcba */
+    tcg_out_opc_reg(s, OPC_OR, TCG_REG_V0, TCG_REG_V0, TCG_TMP0);
+    tcg_out_opc_reg(s, OPC_JR, 0, TCG_REG_RA, 0);
+    /* v0 = hgfedcba -- delay slot */
+    tcg_out_opc_reg(s, OPC_OR, TCG_REG_V0, TCG_REG_V0, TCG_TMP1);
 }
 
 static void tcg_target_init(TCGContext *s)
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
index 0dab62b..374d803 100644
--- a/tcg/mips/tcg-target.h
+++ b/tcg/mips/tcg-target.h
@@ -128,6 +128,7 @@  extern bool use_mips32r2_instructions;
 #define TCG_TARGET_HAS_muls2_i32        (!use_mips32r6_instructions)
 #define TCG_TARGET_HAS_muluh_i32        1
 #define TCG_TARGET_HAS_mulsh_i32        1
+#define TCG_TARGET_HAS_bswap32_i32      1
 
 #if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_add2_i32         0
@@ -150,12 +151,13 @@  extern bool use_mips32r2_instructions;
 #define TCG_TARGET_HAS_mulsh_i64        1
 #define TCG_TARGET_HAS_ext32s_i64       1
 #define TCG_TARGET_HAS_ext32u_i64       1
+#define TCG_TARGET_HAS_bswap32_i64      1
+#define TCG_TARGET_HAS_bswap64_i64      1
 #endif
 
 /* optional instructions detected at runtime */
 #define TCG_TARGET_HAS_movcond_i32      use_movnz_instructions
 #define TCG_TARGET_HAS_bswap16_i32      use_mips32r2_instructions
-#define TCG_TARGET_HAS_bswap32_i32      use_mips32r2_instructions
 #define TCG_TARGET_HAS_deposit_i32      use_mips32r2_instructions
 #define TCG_TARGET_HAS_ext8s_i32        use_mips32r2_instructions
 #define TCG_TARGET_HAS_ext16s_i32       use_mips32r2_instructions
@@ -164,8 +166,6 @@  extern bool use_mips32r2_instructions;
 #if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_movcond_i64      use_movnz_instructions
 #define TCG_TARGET_HAS_bswap16_i64      use_mips32r2_instructions
-#define TCG_TARGET_HAS_bswap32_i64      use_mips32r2_instructions
-#define TCG_TARGET_HAS_bswap64_i64      use_mips32r2_instructions
 #define TCG_TARGET_HAS_deposit_i64      use_mips32r2_instructions
 #define TCG_TARGET_HAS_ext8s_i64        use_mips32r2_instructions
 #define TCG_TARGET_HAS_ext16s_i64       use_mips32r2_instructions