diff mbox series

[v2,11/21] Hexagon (target/hexagon) Short-circuit packet register writes

Message ID 20230427230012.3800327-12-tsimpson@quicinc.com (mailing list archive)
State New, archived
Headers show
Series Hexagon (target/hexagon) short-circuit and move to DisasContext | expand

Commit Message

Taylor Simpson April 27, 2023, 11 p.m. UTC
In certain cases, we can avoid the overhead of writing to hex_new_value
and write directly to hex_gpr.  We add need_commit field to DisasContext
indicating if the end-of-packet commit is needed.  If it is not needed,
get_result_gpr() and get_result_gpr_pair() can return hex_gpr.

We pass the ctx->need_commit to helpers when needed.

Finally, we can early-exit from gen_reg_writes during packet commit.

There are a few instructions whose semantics write to the result before
reading all the inputs.  Therefore, the idef-parser generated code is
incompatible with short-circuit.  We tell idef-parser to skip them.

For debugging purposes, we add a cpu property to turn off short-circuit.
When the short-circuit property is false, we skip the analysis and force
the end-of-packet commit.

Here's a simple example of the TCG generated for
0x004000b4:  0x7800c020 {       R0 = #0x1 }

BEFORE:
 ---- 004000b4
 movi_i32 new_r0,$0x1
 mov_i32 r0,new_r0

AFTER:
 ---- 004000b4
 movi_i32 r0,$0x1

This patch reintroduces a use of check_for_attrib, so we remove the
G_GNUC_UNUSED added earlier in this series.

Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/hexagon/cpu.h                    |  1 +
 target/hexagon/gen_tcg.h                |  3 +-
 target/hexagon/genptr.h                 |  2 +
 target/hexagon/helper.h                 |  2 +-
 target/hexagon/macros.h                 | 13 ++++-
 target/hexagon/translate.h              |  2 +
 target/hexagon/arch.c                   |  3 +-
 target/hexagon/cpu.c                    |  5 +-
 target/hexagon/genptr.c                 | 30 ++++-------
 target/hexagon/op_helper.c              |  5 +-
 target/hexagon/translate.c              | 67 ++++++++++++++++++++++++-
 target/hexagon/gen_helper_funcs.py      |  2 +
 target/hexagon/gen_helper_protos.py     | 10 +++-
 target/hexagon/gen_idef_parser_funcs.py |  7 +++
 target/hexagon/gen_tcg_funcs.py         |  5 ++
 target/hexagon/hex_common.py            |  3 ++
 16 files changed, 129 insertions(+), 31 deletions(-)

Comments

Brian Cain April 28, 2023, 2:47 a.m. UTC | #1
> -----Original Message-----
> From: Taylor Simpson <tsimpson@quicinc.com>
> Sent: Thursday, April 27, 2023 6:00 PM
> To: qemu-devel@nongnu.org
> Cc: Taylor Simpson <tsimpson@quicinc.com>; richard.henderson@linaro.org;
> philmd@linaro.org; ale@rev.ng; anjo@rev.ng; Brian Cain
> <bcain@quicinc.com>; Matheus Bernardino (QUIC)
> <quic_mathbern@quicinc.com>
> Subject: [PATCH v2 11/21] Hexagon (target/hexagon) Short-circuit packet
> register writes
> 
> In certain cases, we can avoid the overhead of writing to hex_new_value
> and write directly to hex_gpr.  We add need_commit field to DisasContext
> indicating if the end-of-packet commit is needed.  If it is not needed,
> get_result_gpr() and get_result_gpr_pair() can return hex_gpr.
> 
> We pass the ctx->need_commit to helpers when needed.
> 
> Finally, we can early-exit from gen_reg_writes during packet commit.
> 
> There are a few instructions whose semantics write to the result before
> reading all the inputs.  Therefore, the idef-parser generated code is
> incompatible with short-circuit.  We tell idef-parser to skip them.
> 
> For debugging purposes, we add a cpu property to turn off short-circuit.
> When the short-circuit property is false, we skip the analysis and force
> the end-of-packet commit.
> 
> Here's a simple example of the TCG generated for
> 0x004000b4:  0x7800c020 {       R0 = #0x1 }
> 
> BEFORE:
>  ---- 004000b4
>  movi_i32 new_r0,$0x1
>  mov_i32 r0,new_r0
> 
> AFTER:
>  ---- 004000b4
>  movi_i32 r0,$0x1
> 
> This patch reintroduces a use of check_for_attrib, so we remove the
> G_GNUC_UNUSED added earlier in this series.
> 
> Signed-off-by: Taylor Simpson <tsimpson@quicinc.com>
> Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  target/hexagon/cpu.h                    |  1 +
>  target/hexagon/gen_tcg.h                |  3 +-
>  target/hexagon/genptr.h                 |  2 +
>  target/hexagon/helper.h                 |  2 +-
>  target/hexagon/macros.h                 | 13 ++++-
>  target/hexagon/translate.h              |  2 +
>  target/hexagon/arch.c                   |  3 +-
>  target/hexagon/cpu.c                    |  5 +-
>  target/hexagon/genptr.c                 | 30 ++++-------
>  target/hexagon/op_helper.c              |  5 +-
>  target/hexagon/translate.c              | 67 ++++++++++++++++++++++++-
>  target/hexagon/gen_helper_funcs.py      |  2 +
>  target/hexagon/gen_helper_protos.py     | 10 +++-
>  target/hexagon/gen_idef_parser_funcs.py |  7 +++
>  target/hexagon/gen_tcg_funcs.py         |  5 ++
>  target/hexagon/hex_common.py            |  3 ++
>  16 files changed, 129 insertions(+), 31 deletions(-)
> 
> diff --git a/target/hexagon/cpu.h b/target/hexagon/cpu.h
> index 81b663ecfb..9252055a38 100644
> --- a/target/hexagon/cpu.h
> +++ b/target/hexagon/cpu.h
> @@ -146,6 +146,7 @@ struct ArchCPU {
> 
>      bool lldb_compat;
>      target_ulong lldb_stack_adjust;
> +    bool short_circuit;
>  };
> 
>  #include "cpu_bits.h"
> diff --git a/target/hexagon/gen_tcg.h b/target/hexagon/gen_tcg.h
> index 2b2a6175a5..1f7e535300 100644
> --- a/target/hexagon/gen_tcg.h
> +++ b/target/hexagon/gen_tcg.h
> @@ -592,7 +592,8 @@
>  #define fGEN_TCG_A5_ACS(SHORTCODE) \
>      do { \
>          gen_helper_vacsh_pred(PeV, cpu_env, RxxV, RssV, RttV); \
> -        gen_helper_vacsh_val(RxxV, cpu_env, RxxV, RssV, RttV); \
> +        gen_helper_vacsh_val(RxxV, cpu_env, RxxV, RssV, RttV, \
> +                             tcg_constant_tl(ctx->need_commit)); \
>      } while (0)
> 
>  #define fGEN_TCG_S2_cabacdecbin(SHORTCODE) \
> diff --git a/target/hexagon/genptr.h b/target/hexagon/genptr.h
> index 75d0fc262d..420867f934 100644
> --- a/target/hexagon/genptr.h
> +++ b/target/hexagon/genptr.h
> @@ -58,4 +58,6 @@ void gen_set_half(int N, TCGv result, TCGv src);
>  void gen_set_half_i64(int N, TCGv_i64 result, TCGv src);
>  void probe_noshuf_load(TCGv va, int s, int mi);
> 
> +extern const target_ulong reg_immut_masks[TOTAL_PER_THREAD_REGS];
> +
>  #endif
> diff --git a/target/hexagon/helper.h b/target/hexagon/helper.h
> index 73849e3d49..4b750d0351 100644
> --- a/target/hexagon/helper.h
> +++ b/target/hexagon/helper.h
> @@ -29,7 +29,7 @@ DEF_HELPER_FLAGS_4(fcircadd, TCG_CALL_NO_RWG_SE,
> s32, s32, s32, s32, s32)
>  DEF_HELPER_FLAGS_1(fbrev, TCG_CALL_NO_RWG_SE, i32, i32)
>  DEF_HELPER_3(sfrecipa, i64, env, f32, f32)
>  DEF_HELPER_2(sfinvsqrta, i64, env, f32)
> -DEF_HELPER_4(vacsh_val, s64, env, s64, s64, s64)
> +DEF_HELPER_5(vacsh_val, s64, env, s64, s64, s64, i32)
>  DEF_HELPER_FLAGS_4(vacsh_pred, TCG_CALL_NO_RWG_SE, s32, env, s64,
> s64, s64)
>  DEF_HELPER_FLAGS_2(cabacdecbin_val, TCG_CALL_NO_RWG_SE, s64, s64,
> s64)
>  DEF_HELPER_FLAGS_2(cabacdecbin_pred, TCG_CALL_NO_RWG_SE, s32, s64,
> s64)
> diff --git a/target/hexagon/macros.h b/target/hexagon/macros.h
> index 16e72ed0d5..a68446a367 100644
> --- a/target/hexagon/macros.h
> +++ b/target/hexagon/macros.h
> @@ -44,8 +44,17 @@
>                     reg_field_info[FIELD].offset)
> 
>  #define SET_USR_FIELD(FIELD, VAL) \
> -    fINSERT_BITS(env->new_value[HEX_REG_USR], reg_field_info[FIELD].width,
> \
> -                 reg_field_info[FIELD].offset, (VAL))
> +    do { \
> +        if (pkt_need_commit) { \
> +            fINSERT_BITS(env->new_value[HEX_REG_USR], \
> +                        reg_field_info[FIELD].width, \
> +                        reg_field_info[FIELD].offset, (VAL)); \
> +        } else { \
> +            fINSERT_BITS(env->gpr[HEX_REG_USR], \
> +                        reg_field_info[FIELD].width, \
> +                        reg_field_info[FIELD].offset, (VAL)); \
> +        } \
> +    } while (0)
>  #endif
> 
>  #ifdef QEMU_GENERATE
> diff --git a/target/hexagon/translate.h b/target/hexagon/translate.h
> index f72228859f..3f6fd3452c 100644
> --- a/target/hexagon/translate.h
> +++ b/target/hexagon/translate.h
> @@ -62,10 +62,12 @@ typedef struct DisasContext {
>      int qreg_log_idx;
>      DECLARE_BITMAP(qregs_read, NUM_QREGS);
>      bool pre_commit;
> +    bool need_commit;
>      TCGCond branch_cond;
>      target_ulong branch_dest;
>      bool is_tight_loop;
>      bool need_pkt_has_store_s1;
> +    bool short_circuit;
>  } DisasContext;
> 
>  static inline void ctx_log_pred_write(DisasContext *ctx, int pnum)
> diff --git a/target/hexagon/arch.c b/target/hexagon/arch.c
> index da79b41c4d..d053d68487 100644
> --- a/target/hexagon/arch.c
> +++ b/target/hexagon/arch.c
> @@ -1,5 +1,5 @@
>  /*
> - *  Copyright(c) 2019-2022 Qualcomm Innovation Center, Inc. All Rights
> Reserved.
> + *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights
> Reserved.
>   *
>   *  This program is free software; you can redistribute it and/or modify
>   *  it under the terms of the GNU General Public License as published by
> @@ -224,6 +224,7 @@ void arch_fpop_start(CPUHexagonState *env)
> 
>  void arch_fpop_end(CPUHexagonState *env)
>  {
> +    const bool pkt_need_commit = true;
>      int flags = get_float_exception_flags(&env->fp_status);
>      if (flags != 0) {
>          SOFTFLOAT_TEST_FLAG(float_flag_inexact, FPINPF, FPINPE);
> diff --git a/target/hexagon/cpu.c b/target/hexagon/cpu.c
> index ab40cfc283..4adf90dcfa 100644
> --- a/target/hexagon/cpu.c
> +++ b/target/hexagon/cpu.c
> @@ -1,5 +1,5 @@
>  /*
> - *  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights
> Reserved.
> + *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights
> Reserved.
>   *
>   *  This program is free software; you can redistribute it and/or modify
>   *  it under the terms of the GNU General Public License as published by
> @@ -52,6 +52,8 @@ static Property hexagon_lldb_compat_property =
>  static Property hexagon_lldb_stack_adjust_property =
>      DEFINE_PROP_UNSIGNED("lldb-stack-adjust", HexagonCPU,
> lldb_stack_adjust,
>                           0, qdev_prop_uint32, target_ulong);
> +static Property hexagon_short_circuit_property =
> +    DEFINE_PROP_BOOL("short-circuit", HexagonCPU, short_circuit, true);
> 
>  const char * const hexagon_regnames[TOTAL_PER_THREAD_REGS] = {
>     "r0", "r1",  "r2",  "r3",  "r4",   "r5",  "r6",  "r7",
> @@ -328,6 +330,7 @@ static void hexagon_cpu_init(Object *obj)
>      cpu_set_cpustate_pointers(cpu);
>      qdev_property_add_static(DEVICE(obj), &hexagon_lldb_compat_property);
>      qdev_property_add_static(DEVICE(obj),
> &hexagon_lldb_stack_adjust_property);
> +    qdev_property_add_static(DEVICE(obj), &hexagon_short_circuit_property);
>  }
> 
>  #include "hw/core/tcg-cpu-ops.h"
> diff --git a/target/hexagon/genptr.c b/target/hexagon/genptr.c
> index aff9ffe37b..5a0f6b5195 100644
> --- a/target/hexagon/genptr.c
> +++ b/target/hexagon/genptr.c
> @@ -45,7 +45,7 @@ TCGv gen_read_preg(TCGv pred, uint8_t num)
> 
>  #define IMMUTABLE (~0)
> 
> -static const target_ulong reg_immut_masks[TOTAL_PER_THREAD_REGS] = {
> +const target_ulong reg_immut_masks[TOTAL_PER_THREAD_REGS] = {
>      [HEX_REG_USR] = 0xc13000c0,
>      [HEX_REG_PC] = IMMUTABLE,
>      [HEX_REG_GP] = 0x3f,
> @@ -70,14 +70,18 @@ static inline void gen_masked_reg_write(TCGv
> new_val, TCGv cur_val,
> 
>  static TCGv get_result_gpr(DisasContext *ctx, int rnum)
>  {
> -    return hex_new_value[rnum];
> +    if (ctx->need_commit) {
> +        return hex_new_value[rnum];
> +    } else {
> +        return hex_gpr[rnum];
> +    }
>  }
> 
>  static TCGv_i64 get_result_gpr_pair(DisasContext *ctx, int rnum)
>  {
>      TCGv_i64 result = tcg_temp_new_i64();
> -    tcg_gen_concat_i32_i64(result, hex_new_value[rnum],
> -                                   hex_new_value[rnum + 1]);
> +    tcg_gen_concat_i32_i64(result, get_result_gpr(ctx, rnum),
> +                                   get_result_gpr(ctx, rnum + 1));
>      return result;
>  }
> 
> @@ -86,7 +90,7 @@ void gen_log_reg_write(DisasContext *ctx, int rnum,
> TCGv val)
>      const target_ulong reg_mask = reg_immut_masks[rnum];
> 
>      gen_masked_reg_write(val, hex_gpr[rnum], reg_mask);
> -    tcg_gen_mov_tl(hex_new_value[rnum], val);
> +    tcg_gen_mov_tl(get_result_gpr(ctx, rnum), val);
>      if (HEX_DEBUG) {
>          /* Do this so HELPER(debug_commit_end) will know */
>          tcg_gen_movi_tl(hex_reg_written[rnum], 1);
> @@ -95,27 +99,15 @@ void gen_log_reg_write(DisasContext *ctx, int rnum,
> TCGv val)
> 
>  static void gen_log_reg_write_pair(DisasContext *ctx, int rnum, TCGv_i64 val)
>  {
> -    const target_ulong reg_mask_low = reg_immut_masks[rnum];
> -    const target_ulong reg_mask_high = reg_immut_masks[rnum + 1];
>      TCGv val32 = tcg_temp_new();
> 
>      /* Low word */
>      tcg_gen_extrl_i64_i32(val32, val);
> -    gen_masked_reg_write(val32, hex_gpr[rnum], reg_mask_low);
> -    tcg_gen_mov_tl(hex_new_value[rnum], val32);
> -    if (HEX_DEBUG) {
> -        /* Do this so HELPER(debug_commit_end) will know */
> -        tcg_gen_movi_tl(hex_reg_written[rnum], 1);
> -    }
> +    gen_log_reg_write(ctx, rnum, val32);
> 
>      /* High word */
>      tcg_gen_extrh_i64_i32(val32, val);
> -    gen_masked_reg_write(val32, hex_gpr[rnum + 1], reg_mask_high);
> -    tcg_gen_mov_tl(hex_new_value[rnum + 1], val32);
> -    if (HEX_DEBUG) {
> -        /* Do this so HELPER(debug_commit_end) will know */
> -        tcg_gen_movi_tl(hex_reg_written[rnum + 1], 1);
> -    }
> +    gen_log_reg_write(ctx, rnum + 1, val32);
>  }
> 
>  void gen_log_pred_write(DisasContext *ctx, int pnum, TCGv val)
> diff --git a/target/hexagon/op_helper.c b/target/hexagon/op_helper.c
> index 46ccc59106..fc5c30a141 100644
> --- a/target/hexagon/op_helper.c
> +++ b/target/hexagon/op_helper.c
> @@ -220,7 +220,7 @@ void HELPER(debug_commit_end)(CPUHexagonState
> *env, int has_st0, int has_st1)
>                  reg_printed = true;
>              }
>              HEX_DEBUG_LOG("\tr%d = " TARGET_FMT_ld " (0x" TARGET_FMT_lx
> ")\n",
> -                          i, env->new_value[i], env->new_value[i]);
> +                          i, env->gpr[i], env->gpr[i]);
>          }
>      }
> 
> @@ -352,7 +352,8 @@ uint64_t HELPER(sfinvsqrta)(CPUHexagonState *env,
> float32 RsV)
>  }
> 
>  int64_t HELPER(vacsh_val)(CPUHexagonState *env,
> -                           int64_t RxxV, int64_t RssV, int64_t RttV)
> +                           int64_t RxxV, int64_t RssV, int64_t RttV,
> +                           uint32_t pkt_need_commit)
>  {
>      for (int i = 0; i < 4; i++) {
>          int xv = sextract64(RxxV, i * 16, 16);
> diff --git a/target/hexagon/translate.c b/target/hexagon/translate.c
> index 023fc9be1e..5bd71bdcaf 100644
> --- a/target/hexagon/translate.c
> +++ b/target/hexagon/translate.c
> @@ -27,6 +27,7 @@
>  #include "insn.h"
>  #include "decode.h"
>  #include "translate.h"
> +#include "genptr.h"
>  #include "printinsn.h"
> 
>  #include "analyze_funcs_generated.c.inc"
> @@ -239,7 +240,7 @@ static int read_packet_words(CPUHexagonState *env,
> DisasContext *ctx,
>      return nwords;
>  }
> 
> -static G_GNUC_UNUSED bool check_for_attrib(Packet *pkt, int attrib)
> +static bool check_for_attrib(Packet *pkt, int attrib)
>  {
>      for (int i = 0; i < pkt->num_insns; i++) {
>          if (GET_ATTRIB(pkt->insn[i].opcode, attrib)) {
> @@ -336,6 +337,58 @@ static void mark_implicit_pred_writes(DisasContext
> *ctx)
>      mark_implicit_pred_write(ctx, A_IMPLICIT_WRITES_P3, 3);
>  }
> 
> +static bool pkt_raises_exception(Packet *pkt)
> +{
> +    if (check_for_attrib(pkt, A_LOAD) ||
> +        check_for_attrib(pkt, A_STORE)) {
> +        return true;
> +    }
> +    return false;
> +}
> +
> +static bool need_commit(DisasContext *ctx)
> +{
> +    Packet *pkt = ctx->pkt;
> +
> +    /*
> +     * If the short-circuit property is set to false, we'll always do the commit
> +     */
> +    if (!ctx->short_circuit) {
> +        return true;
> +    }
> +
> +    if (pkt_raises_exception(pkt)) {
> +        return true;
> +    }
> +
> +    /* Registers with immutability flags require new_value */
> +    for (int i = 0; i < ctx->reg_log_idx; i++) {
> +        int rnum = ctx->reg_log[i];
> +        if (reg_immut_masks[rnum]) {
> +            return true;
> +        }
> +    }
> +
> +    /* Floating point instructions are hard-coded to use new_value */
> +    if (check_for_attrib(pkt, A_FPOP)) {
> +        return true;
> +    }
> +
> +    if (pkt->num_insns == 1) {
> +        return false;
> +    }
> +
> +    /* Check for overlap between register reads and writes */
> +    for (int i = 0; i < ctx->reg_log_idx; i++) {
> +        int rnum = ctx->reg_log[i];
> +        if (test_bit(rnum, ctx->regs_read)) {
> +            return true;
> +        }
> +    }
> +
> +    return false;
> +}
> +
>  static void mark_implicit_pred_read(DisasContext *ctx, int attrib, int pnum)
>  {
>      if (GET_ATTRIB(ctx->insn->opcode, attrib)) {
> @@ -365,6 +418,8 @@ static void analyze_packet(DisasContext *ctx)
>          mark_implicit_pred_writes(ctx);
>          mark_implicit_pred_reads(ctx);
>      }
> +
> +    ctx->need_commit = need_commit(ctx);
>  }
> 
>  static void gen_start_packet(DisasContext *ctx)
> @@ -434,7 +489,8 @@ static void gen_start_packet(DisasContext *ctx)
>      }
> 
>      /* Preload the predicated registers into hex_new_value[i] */
> -    if (!bitmap_empty(ctx->predicated_regs, TOTAL_PER_THREAD_REGS)) {
> +    if (ctx->need_commit &&
> +        !bitmap_empty(ctx->predicated_regs, TOTAL_PER_THREAD_REGS)) {
>          int i = find_first_bit(ctx->predicated_regs, TOTAL_PER_THREAD_REGS);
>          while (i < TOTAL_PER_THREAD_REGS) {
>              tcg_gen_mov_tl(hex_new_value[i], hex_gpr[i]);
> @@ -541,6 +597,11 @@ static void gen_reg_writes(DisasContext *ctx)
>  {
>      int i;
> 
> +    /* Early exit if not needed */
> +    if (!ctx->need_commit) {
> +        return;
> +    }
> +
>      for (i = 0; i < ctx->reg_log_idx; i++) {
>          int reg_num = ctx->reg_log[i];
> 
> @@ -919,6 +980,7 @@ static void
> hexagon_tr_init_disas_context(DisasContextBase *dcbase,
>                                            CPUState *cs)
>  {
>      DisasContext *ctx = container_of(dcbase, DisasContext, base);
> +    HexagonCPU *hex_cpu = env_archcpu(cs->env_ptr);
>      uint32_t hex_flags = dcbase->tb->flags;
> 
>      ctx->mem_idx = MMU_USER_IDX;
> @@ -927,6 +989,7 @@ static void
> hexagon_tr_init_disas_context(DisasContextBase *dcbase,
>      ctx->num_hvx_insns = 0;
>      ctx->branch_cond = TCG_COND_NEVER;
>      ctx->is_tight_loop = FIELD_EX32(hex_flags, TB_FLAGS, IS_TIGHT_LOOP);
> +    ctx->short_circuit = hex_cpu->short_circuit;
>  }
> 
>  static void hexagon_tr_tb_start(DisasContextBase *db, CPUState *cpu)
> diff --git a/target/hexagon/gen_helper_funcs.py
> b/target/hexagon/gen_helper_funcs.py
> index c73d792580..e259ea3d03 100755
> --- a/target/hexagon/gen_helper_funcs.py
> +++ b/target/hexagon/gen_helper_funcs.py
> @@ -287,6 +287,8 @@ def gen_helper_function(f, tag, tagregs, tagimms):
> 
>          if hex_common.need_pkt_has_multi_cof(tag):
>              f.write(", uint32_t pkt_has_multi_cof")
> +        if (hex_common.need_pkt_need_commit(tag)):
> +            f.write(", uint32_t pkt_need_commit")
> 
>          if hex_common.need_PC(tag):
>              if i > 0:
> diff --git a/target/hexagon/gen_helper_protos.py
> b/target/hexagon/gen_helper_protos.py
> index 187cd6e04e..c5ecb85294 100755
> --- a/target/hexagon/gen_helper_protos.py
> +++ b/target/hexagon/gen_helper_protos.py
> @@ -86,6 +86,8 @@ def gen_helper_prototype(f, tag, tagregs, tagimms):
>              def_helper_size = len(regs) + len(imms) + numscalarreadwrite + 1
>              if hex_common.need_pkt_has_multi_cof(tag):
>                  def_helper_size += 1
> +            if hex_common.need_pkt_need_commit(tag):
> +                def_helper_size += 1
>              if hex_common.need_part1(tag):
>                  def_helper_size += 1
>              if hex_common.need_slot(tag):
> @@ -103,6 +105,8 @@ def gen_helper_prototype(f, tag, tagregs, tagimms):
>              def_helper_size = len(regs) + len(imms) + numscalarreadwrite
>              if hex_common.need_pkt_has_multi_cof(tag):
>                  def_helper_size += 1
> +            if hex_common.need_pkt_need_commit(tag):
> +                def_helper_size += 1
>              if hex_common.need_part1(tag):
>                  def_helper_size += 1
>              if hex_common.need_slot(tag):
> @@ -156,10 +160,12 @@ def gen_helper_prototype(f, tag, tagregs, tagimms):
>          for immlett, bits, immshift in imms:
>              f.write(", s32")
> 
> -        ## Add the arguments for the instruction pkt_has_multi_cof, slot and
> -        ## part1 (if needed)
> +        ## Add the arguments for the instruction pkt_has_multi_cof,
> +        ## pkt_needs_commit, PC, next_PC, slot, and part1 (if needed)
>          if hex_common.need_pkt_has_multi_cof(tag):
>              f.write(", i32")
> +        if hex_common.need_pkt_need_commit(tag):
> +            f.write(', i32')
>          if hex_common.need_PC(tag):
>              f.write(", i32")
>          if hex_common.helper_needs_next_PC(tag):
> diff --git a/target/hexagon/gen_idef_parser_funcs.py
> b/target/hexagon/gen_idef_parser_funcs.py
> index afe68bdb6f..b7f2df0f36 100644
> --- a/target/hexagon/gen_idef_parser_funcs.py
> +++ b/target/hexagon/gen_idef_parser_funcs.py
> @@ -109,6 +109,13 @@ def main():
>                  continue
>              if "A_COF" in hex_common.attribdict[tag]:
>                  continue
> +            ## Skip instructions that are incompatible with short-circuit
> +            ## packet register writes
> +            if ( tag == 'S2_insert' or
> +                 tag == 'S2_insert_rp' or
> +                 tag == 'S2_asr_r_svw_trun' or
> +                 tag == 'A2_swiz' ):
> +                continue
> 
>              regs = tagregs[tag]
>              imms = tagimms[tag]
> diff --git a/target/hexagon/gen_tcg_funcs.py
> b/target/hexagon/gen_tcg_funcs.py
> index d9ccbe63f6..0e45d43685 100755
> --- a/target/hexagon/gen_tcg_funcs.py
> +++ b/target/hexagon/gen_tcg_funcs.py
> @@ -550,6 +550,9 @@ def gen_tcg_func(f, tag, regs, imms):
>          if hex_common.need_pkt_has_multi_cof(tag):
>              f.write("    TCGv pkt_has_multi_cof = ")
>              f.write("tcg_constant_tl(ctx->pkt->pkt_has_multi_cof);\n")
> +        if hex_common.need_pkt_need_commit(tag):
> +            f.write("    TCGv pkt_need_commit = ")
> +            f.write("tcg_constant_tl(ctx->need_commit);\n")
>          if hex_common.need_part1(tag):
>              f.write("    TCGv part1 = tcg_constant_tl(insn->part1);\n")
>          if hex_common.need_slot(tag):
> @@ -596,6 +599,8 @@ def gen_tcg_func(f, tag, regs, imms):
> 
>          if hex_common.need_pkt_has_multi_cof(tag):
>              f.write(", pkt_has_multi_cof")
> +        if hex_common.need_pkt_need_commit(tag):
> +            f.write(", pkt_need_commit")
>          if hex_common.need_PC(tag):
>              f.write(", PC")
>          if hex_common.helper_needs_next_PC(tag):
> diff --git a/target/hexagon/hex_common.py b/target/hexagon/hex_common.py
> index 232c6e2c20..29c0508f66 100755
> --- a/target/hexagon/hex_common.py
> +++ b/target/hexagon/hex_common.py
> @@ -276,6 +276,9 @@ def need_pkt_has_multi_cof(tag):
>      return "A_COF" in attribdict[tag]
> 
> 
> +def need_pkt_need_commit(tag):
> +    return 'A_IMPLICIT_WRITES_USR' in attribdict[tag]
> +
>  def need_condexec_reg(tag, regs):
>      if "A_CONDEXEC" in attribdict[tag]:
>          for regtype, regid, toss, numregs in regs:
> --
> 2.25.1

Reviewed-by: Brian Cain <bcain@quicinc.com>
diff mbox series

Patch

diff --git a/target/hexagon/cpu.h b/target/hexagon/cpu.h
index 81b663ecfb..9252055a38 100644
--- a/target/hexagon/cpu.h
+++ b/target/hexagon/cpu.h
@@ -146,6 +146,7 @@  struct ArchCPU {
 
     bool lldb_compat;
     target_ulong lldb_stack_adjust;
+    bool short_circuit;
 };
 
 #include "cpu_bits.h"
diff --git a/target/hexagon/gen_tcg.h b/target/hexagon/gen_tcg.h
index 2b2a6175a5..1f7e535300 100644
--- a/target/hexagon/gen_tcg.h
+++ b/target/hexagon/gen_tcg.h
@@ -592,7 +592,8 @@ 
 #define fGEN_TCG_A5_ACS(SHORTCODE) \
     do { \
         gen_helper_vacsh_pred(PeV, cpu_env, RxxV, RssV, RttV); \
-        gen_helper_vacsh_val(RxxV, cpu_env, RxxV, RssV, RttV); \
+        gen_helper_vacsh_val(RxxV, cpu_env, RxxV, RssV, RttV, \
+                             tcg_constant_tl(ctx->need_commit)); \
     } while (0)
 
 #define fGEN_TCG_S2_cabacdecbin(SHORTCODE) \
diff --git a/target/hexagon/genptr.h b/target/hexagon/genptr.h
index 75d0fc262d..420867f934 100644
--- a/target/hexagon/genptr.h
+++ b/target/hexagon/genptr.h
@@ -58,4 +58,6 @@  void gen_set_half(int N, TCGv result, TCGv src);
 void gen_set_half_i64(int N, TCGv_i64 result, TCGv src);
 void probe_noshuf_load(TCGv va, int s, int mi);
 
+extern const target_ulong reg_immut_masks[TOTAL_PER_THREAD_REGS];
+
 #endif
diff --git a/target/hexagon/helper.h b/target/hexagon/helper.h
index 73849e3d49..4b750d0351 100644
--- a/target/hexagon/helper.h
+++ b/target/hexagon/helper.h
@@ -29,7 +29,7 @@  DEF_HELPER_FLAGS_4(fcircadd, TCG_CALL_NO_RWG_SE, s32, s32, s32, s32, s32)
 DEF_HELPER_FLAGS_1(fbrev, TCG_CALL_NO_RWG_SE, i32, i32)
 DEF_HELPER_3(sfrecipa, i64, env, f32, f32)
 DEF_HELPER_2(sfinvsqrta, i64, env, f32)
-DEF_HELPER_4(vacsh_val, s64, env, s64, s64, s64)
+DEF_HELPER_5(vacsh_val, s64, env, s64, s64, s64, i32)
 DEF_HELPER_FLAGS_4(vacsh_pred, TCG_CALL_NO_RWG_SE, s32, env, s64, s64, s64)
 DEF_HELPER_FLAGS_2(cabacdecbin_val, TCG_CALL_NO_RWG_SE, s64, s64, s64)
 DEF_HELPER_FLAGS_2(cabacdecbin_pred, TCG_CALL_NO_RWG_SE, s32, s64, s64)
diff --git a/target/hexagon/macros.h b/target/hexagon/macros.h
index 16e72ed0d5..a68446a367 100644
--- a/target/hexagon/macros.h
+++ b/target/hexagon/macros.h
@@ -44,8 +44,17 @@ 
                    reg_field_info[FIELD].offset)
 
 #define SET_USR_FIELD(FIELD, VAL) \
-    fINSERT_BITS(env->new_value[HEX_REG_USR], reg_field_info[FIELD].width, \
-                 reg_field_info[FIELD].offset, (VAL))
+    do { \
+        if (pkt_need_commit) { \
+            fINSERT_BITS(env->new_value[HEX_REG_USR], \
+                        reg_field_info[FIELD].width, \
+                        reg_field_info[FIELD].offset, (VAL)); \
+        } else { \
+            fINSERT_BITS(env->gpr[HEX_REG_USR], \
+                        reg_field_info[FIELD].width, \
+                        reg_field_info[FIELD].offset, (VAL)); \
+        } \
+    } while (0)
 #endif
 
 #ifdef QEMU_GENERATE
diff --git a/target/hexagon/translate.h b/target/hexagon/translate.h
index f72228859f..3f6fd3452c 100644
--- a/target/hexagon/translate.h
+++ b/target/hexagon/translate.h
@@ -62,10 +62,12 @@  typedef struct DisasContext {
     int qreg_log_idx;
     DECLARE_BITMAP(qregs_read, NUM_QREGS);
     bool pre_commit;
+    bool need_commit;
     TCGCond branch_cond;
     target_ulong branch_dest;
     bool is_tight_loop;
     bool need_pkt_has_store_s1;
+    bool short_circuit;
 } DisasContext;
 
 static inline void ctx_log_pred_write(DisasContext *ctx, int pnum)
diff --git a/target/hexagon/arch.c b/target/hexagon/arch.c
index da79b41c4d..d053d68487 100644
--- a/target/hexagon/arch.c
+++ b/target/hexagon/arch.c
@@ -1,5 +1,5 @@ 
 /*
- *  Copyright(c) 2019-2022 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -224,6 +224,7 @@  void arch_fpop_start(CPUHexagonState *env)
 
 void arch_fpop_end(CPUHexagonState *env)
 {
+    const bool pkt_need_commit = true;
     int flags = get_float_exception_flags(&env->fp_status);
     if (flags != 0) {
         SOFTFLOAT_TEST_FLAG(float_flag_inexact, FPINPF, FPINPE);
diff --git a/target/hexagon/cpu.c b/target/hexagon/cpu.c
index ab40cfc283..4adf90dcfa 100644
--- a/target/hexagon/cpu.c
+++ b/target/hexagon/cpu.c
@@ -1,5 +1,5 @@ 
 /*
- *  Copyright(c) 2019-2021 Qualcomm Innovation Center, Inc. All Rights Reserved.
+ *  Copyright(c) 2019-2023 Qualcomm Innovation Center, Inc. All Rights Reserved.
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
@@ -52,6 +52,8 @@  static Property hexagon_lldb_compat_property =
 static Property hexagon_lldb_stack_adjust_property =
     DEFINE_PROP_UNSIGNED("lldb-stack-adjust", HexagonCPU, lldb_stack_adjust,
                          0, qdev_prop_uint32, target_ulong);
+static Property hexagon_short_circuit_property =
+    DEFINE_PROP_BOOL("short-circuit", HexagonCPU, short_circuit, true);
 
 const char * const hexagon_regnames[TOTAL_PER_THREAD_REGS] = {
    "r0", "r1",  "r2",  "r3",  "r4",   "r5",  "r6",  "r7",
@@ -328,6 +330,7 @@  static void hexagon_cpu_init(Object *obj)
     cpu_set_cpustate_pointers(cpu);
     qdev_property_add_static(DEVICE(obj), &hexagon_lldb_compat_property);
     qdev_property_add_static(DEVICE(obj), &hexagon_lldb_stack_adjust_property);
+    qdev_property_add_static(DEVICE(obj), &hexagon_short_circuit_property);
 }
 
 #include "hw/core/tcg-cpu-ops.h"
diff --git a/target/hexagon/genptr.c b/target/hexagon/genptr.c
index aff9ffe37b..5a0f6b5195 100644
--- a/target/hexagon/genptr.c
+++ b/target/hexagon/genptr.c
@@ -45,7 +45,7 @@  TCGv gen_read_preg(TCGv pred, uint8_t num)
 
 #define IMMUTABLE (~0)
 
-static const target_ulong reg_immut_masks[TOTAL_PER_THREAD_REGS] = {
+const target_ulong reg_immut_masks[TOTAL_PER_THREAD_REGS] = {
     [HEX_REG_USR] = 0xc13000c0,
     [HEX_REG_PC] = IMMUTABLE,
     [HEX_REG_GP] = 0x3f,
@@ -70,14 +70,18 @@  static inline void gen_masked_reg_write(TCGv new_val, TCGv cur_val,
 
 static TCGv get_result_gpr(DisasContext *ctx, int rnum)
 {
-    return hex_new_value[rnum];
+    if (ctx->need_commit) {
+        return hex_new_value[rnum];
+    } else {
+        return hex_gpr[rnum];
+    }
 }
 
 static TCGv_i64 get_result_gpr_pair(DisasContext *ctx, int rnum)
 {
     TCGv_i64 result = tcg_temp_new_i64();
-    tcg_gen_concat_i32_i64(result, hex_new_value[rnum],
-                                   hex_new_value[rnum + 1]);
+    tcg_gen_concat_i32_i64(result, get_result_gpr(ctx, rnum),
+                                   get_result_gpr(ctx, rnum + 1));
     return result;
 }
 
@@ -86,7 +90,7 @@  void gen_log_reg_write(DisasContext *ctx, int rnum, TCGv val)
     const target_ulong reg_mask = reg_immut_masks[rnum];
 
     gen_masked_reg_write(val, hex_gpr[rnum], reg_mask);
-    tcg_gen_mov_tl(hex_new_value[rnum], val);
+    tcg_gen_mov_tl(get_result_gpr(ctx, rnum), val);
     if (HEX_DEBUG) {
         /* Do this so HELPER(debug_commit_end) will know */
         tcg_gen_movi_tl(hex_reg_written[rnum], 1);
@@ -95,27 +99,15 @@  void gen_log_reg_write(DisasContext *ctx, int rnum, TCGv val)
 
 static void gen_log_reg_write_pair(DisasContext *ctx, int rnum, TCGv_i64 val)
 {
-    const target_ulong reg_mask_low = reg_immut_masks[rnum];
-    const target_ulong reg_mask_high = reg_immut_masks[rnum + 1];
     TCGv val32 = tcg_temp_new();
 
     /* Low word */
     tcg_gen_extrl_i64_i32(val32, val);
-    gen_masked_reg_write(val32, hex_gpr[rnum], reg_mask_low);
-    tcg_gen_mov_tl(hex_new_value[rnum], val32);
-    if (HEX_DEBUG) {
-        /* Do this so HELPER(debug_commit_end) will know */
-        tcg_gen_movi_tl(hex_reg_written[rnum], 1);
-    }
+    gen_log_reg_write(ctx, rnum, val32);
 
     /* High word */
     tcg_gen_extrh_i64_i32(val32, val);
-    gen_masked_reg_write(val32, hex_gpr[rnum + 1], reg_mask_high);
-    tcg_gen_mov_tl(hex_new_value[rnum + 1], val32);
-    if (HEX_DEBUG) {
-        /* Do this so HELPER(debug_commit_end) will know */
-        tcg_gen_movi_tl(hex_reg_written[rnum + 1], 1);
-    }
+    gen_log_reg_write(ctx, rnum + 1, val32);
 }
 
 void gen_log_pred_write(DisasContext *ctx, int pnum, TCGv val)
diff --git a/target/hexagon/op_helper.c b/target/hexagon/op_helper.c
index 46ccc59106..fc5c30a141 100644
--- a/target/hexagon/op_helper.c
+++ b/target/hexagon/op_helper.c
@@ -220,7 +220,7 @@  void HELPER(debug_commit_end)(CPUHexagonState *env, int has_st0, int has_st1)
                 reg_printed = true;
             }
             HEX_DEBUG_LOG("\tr%d = " TARGET_FMT_ld " (0x" TARGET_FMT_lx ")\n",
-                          i, env->new_value[i], env->new_value[i]);
+                          i, env->gpr[i], env->gpr[i]);
         }
     }
 
@@ -352,7 +352,8 @@  uint64_t HELPER(sfinvsqrta)(CPUHexagonState *env, float32 RsV)
 }
 
 int64_t HELPER(vacsh_val)(CPUHexagonState *env,
-                           int64_t RxxV, int64_t RssV, int64_t RttV)
+                           int64_t RxxV, int64_t RssV, int64_t RttV,
+                           uint32_t pkt_need_commit)
 {
     for (int i = 0; i < 4; i++) {
         int xv = sextract64(RxxV, i * 16, 16);
diff --git a/target/hexagon/translate.c b/target/hexagon/translate.c
index 023fc9be1e..5bd71bdcaf 100644
--- a/target/hexagon/translate.c
+++ b/target/hexagon/translate.c
@@ -27,6 +27,7 @@ 
 #include "insn.h"
 #include "decode.h"
 #include "translate.h"
+#include "genptr.h"
 #include "printinsn.h"
 
 #include "analyze_funcs_generated.c.inc"
@@ -239,7 +240,7 @@  static int read_packet_words(CPUHexagonState *env, DisasContext *ctx,
     return nwords;
 }
 
-static G_GNUC_UNUSED bool check_for_attrib(Packet *pkt, int attrib)
+static bool check_for_attrib(Packet *pkt, int attrib)
 {
     for (int i = 0; i < pkt->num_insns; i++) {
         if (GET_ATTRIB(pkt->insn[i].opcode, attrib)) {
@@ -336,6 +337,58 @@  static void mark_implicit_pred_writes(DisasContext *ctx)
     mark_implicit_pred_write(ctx, A_IMPLICIT_WRITES_P3, 3);
 }
 
+static bool pkt_raises_exception(Packet *pkt)
+{
+    if (check_for_attrib(pkt, A_LOAD) ||
+        check_for_attrib(pkt, A_STORE)) {
+        return true;
+    }
+    return false;
+}
+
+static bool need_commit(DisasContext *ctx)
+{
+    Packet *pkt = ctx->pkt;
+
+    /*
+     * If the short-circuit property is set to false, we'll always do the commit
+     */
+    if (!ctx->short_circuit) {
+        return true;
+    }
+
+    if (pkt_raises_exception(pkt)) {
+        return true;
+    }
+
+    /* Registers with immutability flags require new_value */
+    for (int i = 0; i < ctx->reg_log_idx; i++) {
+        int rnum = ctx->reg_log[i];
+        if (reg_immut_masks[rnum]) {
+            return true;
+        }
+    }
+
+    /* Floating point instructions are hard-coded to use new_value */
+    if (check_for_attrib(pkt, A_FPOP)) {
+        return true;
+    }
+
+    if (pkt->num_insns == 1) {
+        return false;
+    }
+
+    /* Check for overlap between register reads and writes */
+    for (int i = 0; i < ctx->reg_log_idx; i++) {
+        int rnum = ctx->reg_log[i];
+        if (test_bit(rnum, ctx->regs_read)) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
 static void mark_implicit_pred_read(DisasContext *ctx, int attrib, int pnum)
 {
     if (GET_ATTRIB(ctx->insn->opcode, attrib)) {
@@ -365,6 +418,8 @@  static void analyze_packet(DisasContext *ctx)
         mark_implicit_pred_writes(ctx);
         mark_implicit_pred_reads(ctx);
     }
+
+    ctx->need_commit = need_commit(ctx);
 }
 
 static void gen_start_packet(DisasContext *ctx)
@@ -434,7 +489,8 @@  static void gen_start_packet(DisasContext *ctx)
     }
 
     /* Preload the predicated registers into hex_new_value[i] */
-    if (!bitmap_empty(ctx->predicated_regs, TOTAL_PER_THREAD_REGS)) {
+    if (ctx->need_commit &&
+        !bitmap_empty(ctx->predicated_regs, TOTAL_PER_THREAD_REGS)) {
         int i = find_first_bit(ctx->predicated_regs, TOTAL_PER_THREAD_REGS);
         while (i < TOTAL_PER_THREAD_REGS) {
             tcg_gen_mov_tl(hex_new_value[i], hex_gpr[i]);
@@ -541,6 +597,11 @@  static void gen_reg_writes(DisasContext *ctx)
 {
     int i;
 
+    /* Early exit if not needed */
+    if (!ctx->need_commit) {
+        return;
+    }
+
     for (i = 0; i < ctx->reg_log_idx; i++) {
         int reg_num = ctx->reg_log[i];
 
@@ -919,6 +980,7 @@  static void hexagon_tr_init_disas_context(DisasContextBase *dcbase,
                                           CPUState *cs)
 {
     DisasContext *ctx = container_of(dcbase, DisasContext, base);
+    HexagonCPU *hex_cpu = env_archcpu(cs->env_ptr);
     uint32_t hex_flags = dcbase->tb->flags;
 
     ctx->mem_idx = MMU_USER_IDX;
@@ -927,6 +989,7 @@  static void hexagon_tr_init_disas_context(DisasContextBase *dcbase,
     ctx->num_hvx_insns = 0;
     ctx->branch_cond = TCG_COND_NEVER;
     ctx->is_tight_loop = FIELD_EX32(hex_flags, TB_FLAGS, IS_TIGHT_LOOP);
+    ctx->short_circuit = hex_cpu->short_circuit;
 }
 
 static void hexagon_tr_tb_start(DisasContextBase *db, CPUState *cpu)
diff --git a/target/hexagon/gen_helper_funcs.py b/target/hexagon/gen_helper_funcs.py
index c73d792580..e259ea3d03 100755
--- a/target/hexagon/gen_helper_funcs.py
+++ b/target/hexagon/gen_helper_funcs.py
@@ -287,6 +287,8 @@  def gen_helper_function(f, tag, tagregs, tagimms):
 
         if hex_common.need_pkt_has_multi_cof(tag):
             f.write(", uint32_t pkt_has_multi_cof")
+        if (hex_common.need_pkt_need_commit(tag)):
+            f.write(", uint32_t pkt_need_commit")
 
         if hex_common.need_PC(tag):
             if i > 0:
diff --git a/target/hexagon/gen_helper_protos.py b/target/hexagon/gen_helper_protos.py
index 187cd6e04e..c5ecb85294 100755
--- a/target/hexagon/gen_helper_protos.py
+++ b/target/hexagon/gen_helper_protos.py
@@ -86,6 +86,8 @@  def gen_helper_prototype(f, tag, tagregs, tagimms):
             def_helper_size = len(regs) + len(imms) + numscalarreadwrite + 1
             if hex_common.need_pkt_has_multi_cof(tag):
                 def_helper_size += 1
+            if hex_common.need_pkt_need_commit(tag):
+                def_helper_size += 1
             if hex_common.need_part1(tag):
                 def_helper_size += 1
             if hex_common.need_slot(tag):
@@ -103,6 +105,8 @@  def gen_helper_prototype(f, tag, tagregs, tagimms):
             def_helper_size = len(regs) + len(imms) + numscalarreadwrite
             if hex_common.need_pkt_has_multi_cof(tag):
                 def_helper_size += 1
+            if hex_common.need_pkt_need_commit(tag):
+                def_helper_size += 1
             if hex_common.need_part1(tag):
                 def_helper_size += 1
             if hex_common.need_slot(tag):
@@ -156,10 +160,12 @@  def gen_helper_prototype(f, tag, tagregs, tagimms):
         for immlett, bits, immshift in imms:
             f.write(", s32")
 
-        ## Add the arguments for the instruction pkt_has_multi_cof, slot and
-        ## part1 (if needed)
+        ## Add the arguments for the instruction pkt_has_multi_cof,
+        ## pkt_needs_commit, PC, next_PC, slot, and part1 (if needed)
         if hex_common.need_pkt_has_multi_cof(tag):
             f.write(", i32")
+        if hex_common.need_pkt_need_commit(tag):
+            f.write(', i32')
         if hex_common.need_PC(tag):
             f.write(", i32")
         if hex_common.helper_needs_next_PC(tag):
diff --git a/target/hexagon/gen_idef_parser_funcs.py b/target/hexagon/gen_idef_parser_funcs.py
index afe68bdb6f..b7f2df0f36 100644
--- a/target/hexagon/gen_idef_parser_funcs.py
+++ b/target/hexagon/gen_idef_parser_funcs.py
@@ -109,6 +109,13 @@  def main():
                 continue
             if "A_COF" in hex_common.attribdict[tag]:
                 continue
+            ## Skip instructions that are incompatible with short-circuit
+            ## packet register writes
+            if ( tag == 'S2_insert' or
+                 tag == 'S2_insert_rp' or
+                 tag == 'S2_asr_r_svw_trun' or
+                 tag == 'A2_swiz' ):
+                continue
 
             regs = tagregs[tag]
             imms = tagimms[tag]
diff --git a/target/hexagon/gen_tcg_funcs.py b/target/hexagon/gen_tcg_funcs.py
index d9ccbe63f6..0e45d43685 100755
--- a/target/hexagon/gen_tcg_funcs.py
+++ b/target/hexagon/gen_tcg_funcs.py
@@ -550,6 +550,9 @@  def gen_tcg_func(f, tag, regs, imms):
         if hex_common.need_pkt_has_multi_cof(tag):
             f.write("    TCGv pkt_has_multi_cof = ")
             f.write("tcg_constant_tl(ctx->pkt->pkt_has_multi_cof);\n")
+        if hex_common.need_pkt_need_commit(tag):
+            f.write("    TCGv pkt_need_commit = ")
+            f.write("tcg_constant_tl(ctx->need_commit);\n")
         if hex_common.need_part1(tag):
             f.write("    TCGv part1 = tcg_constant_tl(insn->part1);\n")
         if hex_common.need_slot(tag):
@@ -596,6 +599,8 @@  def gen_tcg_func(f, tag, regs, imms):
 
         if hex_common.need_pkt_has_multi_cof(tag):
             f.write(", pkt_has_multi_cof")
+        if hex_common.need_pkt_need_commit(tag):
+            f.write(", pkt_need_commit")
         if hex_common.need_PC(tag):
             f.write(", PC")
         if hex_common.helper_needs_next_PC(tag):
diff --git a/target/hexagon/hex_common.py b/target/hexagon/hex_common.py
index 232c6e2c20..29c0508f66 100755
--- a/target/hexagon/hex_common.py
+++ b/target/hexagon/hex_common.py
@@ -276,6 +276,9 @@  def need_pkt_has_multi_cof(tag):
     return "A_COF" in attribdict[tag]
 
 
+def need_pkt_need_commit(tag):
+    return 'A_IMPLICIT_WRITES_USR' in attribdict[tag]
+
 def need_condexec_reg(tag, regs):
     if "A_CONDEXEC" in attribdict[tag]:
         for regtype, regid, toss, numregs in regs: