
[RFC,1/1] target/riscv: use tcg ops generation to emulate whole reg rvv loads/stores.

Message ID 20241218170840.1090473-2-paolo.savini@embecosm.com (mailing list archive)
State New
Series: target/riscv: use tcg ops generation to emulate whole reg rvv loads/stores.

Commit Message

Paolo Savini Dec. 18, 2024, 5:08 p.m. UTC
This patch aims at emulating the whole register loads and stores through
direct generation of tcg operations rather than through the aid of a helper
function.

Signed-off-by: Paolo Savini <paolo.savini@embecosm.com>
---
 target/riscv/insn_trans/trans_rvv.c.inc | 104 +++++++++++++-----------
 1 file changed, 56 insertions(+), 48 deletions(-)

Comments

Richard Henderson Dec. 18, 2024, 7:08 p.m. UTC | #1
On 12/18/24 11:08, Paolo Savini wrote:
> This patch aims at emulating the whole register loads and stores through
> direct generation of tcg operations rather than through the aid of a helper
> function.
> 
> Signed-off-by: Paolo Savini <paolo.savini@embecosm.com>
> ---
>   target/riscv/insn_trans/trans_rvv.c.inc | 104 +++++++++++++-----------
>   1 file changed, 56 insertions(+), 48 deletions(-)
> 
> diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc
> index b9883a5d32..63d8b05bf1 100644
> --- a/target/riscv/insn_trans/trans_rvv.c.inc
> +++ b/target/riscv/insn_trans/trans_rvv.c.inc
> @@ -1100,26 +1100,34 @@ GEN_VEXT_TRANS(vle64ff_v, MO_64, r2nfvm, ldff_op, ld_us_check)
>   typedef void gen_helper_ldst_whole(TCGv_ptr, TCGv, TCGv_env, TCGv_i32);
>   
>   static bool ldst_whole_trans(uint32_t vd, uint32_t rs1, uint32_t nf,
> -                             gen_helper_ldst_whole *fn,
> -                             DisasContext *s)
> +                             uint32_t log2_esz, gen_helper_ldst_whole *fn,
> +                             DisasContext *s, bool is_load)
>   {
> -    TCGv_ptr dest;
> -    TCGv base;
> -    TCGv_i32 desc;
> -
> -    uint32_t data = FIELD_DP32(0, VDATA, NF, nf);
> -    data = FIELD_DP32(data, VDATA, VM, 1);
> -    dest = tcg_temp_new_ptr();
> -    desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
> -                                      s->cfg_ptr->vlenb, data));
> -
> -    base = get_gpr(s, rs1, EXT_NONE);
> -    tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));
> +    TCGv addr;
> +    addr = get_address(s, rs1, 0);
> +    uint32_t size = s->cfg_ptr->vlenb * nf;
>   
> -    mark_vs_dirty(s);	

Did you lose this?  You can't rely on the call from finalize_rvv_inst at the end.  Dirty 
must already be set for the exception path if any of these memory operations fault.
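
A minimal sketch of how the dirty marking could be restored, assuming the existing
mark_vs_dirty() helper is kept (untested):

    uint32_t size = s->cfg_ptr->vlenb * nf;
    TCGv_i128 t16 = tcg_temp_new_i128();

    /* mstatus.VS must already be dirty before the first access that may fault. */
    mark_vs_dirty(s);

    for (int i = 0; i < size; i += 16) {
        TCGv addr = get_address(s, rs1, i);
        /* ... 128-bit loads/stores as in the patch ... */
    }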

> -
> -    fn(dest, base, tcg_env, desc);
> +    TCGv_i128 t16 = tcg_temp_new_i128();
>   
> +    /*
> +     * Load/store minimum vlenb bytes per iteration.
> +     * When possible do this atomically.
> +     * Update vstart with the number of processed elements.
> +     */
> +    for (int i=0; i < size; i+=16) {

Spaces.  scripts/checkpatch.pl should have complained.

> +        addr = get_address(s, rs1, i);
> +        if (is_load) {
> +	    tcg_gen_qemu_ld_i128(t16, addr, s->mem_idx,
> +		    MO_LE | MO_128 | MO_ATOM_WITHIN16);

FWIW, for MO_128, MO_ATOM_WITHIN16 is equivalent to MO_ATOM_IFALIGN, which is the default.

Where you can improve things is by specifying *less* atomicity.  For instance:

   log2_esz == MO_8:  MO_ATOM_NONE
   When operating on bytes, no larger atomicity is required.

   log2_esz <= MO_64: MO_ATOM_IFALIGN_PAIR
   Require atomicity on the two MO_64 halves, but no more than that.

I don't currently support further granularity, because there doesn't seem to be a need, 
and it's already somewhat confusing.
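
A rough sketch of selecting the atomicity from the element size, using the names from
the patch (untested):

    MemOp atom;

    if (log2_esz == MO_8) {
        /* Byte elements: nothing larger than a byte needs to be atomic. */
        atom = MO_ATOM_NONE;
    } else {
        /* Elements up to 64 bits: each 8-byte half must be atomic. */
        atom = MO_ATOM_IFALIGN_PAIR;
    }

    tcg_gen_qemu_ld_i128(t16, addr, s->mem_idx, MO_LE | MO_128 | atom);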

There's no handling for vstart != 0.
You could easily fall back to the helper for !s->vstart_eq_zero.
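
For illustration, such a fallback could reuse the helper call that the patch removes,
e.g. (sketch only):

    if (!s->vstart_eq_zero) {
        /* Let the helper deal with a non-zero vstart. */
        uint32_t data = FIELD_DP32(0, VDATA, NF, nf);
        data = FIELD_DP32(data, VDATA, VM, 1);
        TCGv_ptr dest = tcg_temp_new_ptr();
        TCGv_i32 desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
                                                   s->cfg_ptr->vlenb, data));
        TCGv base = get_gpr(s, rs1, EXT_NONE);

        tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));
        mark_vs_dirty(s);
        fn(dest, base, tcg_env, desc);

        finalize_rvv_inst(s);
        return true;
    }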

> +	    tcg_gen_st_i128(t16, tcg_env, vreg_ofs(s, vd) + i);
> +	    tcg_gen_addi_tl(cpu_vstart, cpu_vstart, 16 >> log2_esz);
> +        } else {
> +            tcg_gen_ld_i128(t16, tcg_env, vreg_ofs(s, vd) + i);
> +            tcg_gen_qemu_st_i128(t16, addr, s->mem_idx,
> +		    MO_LE | MO_128 | MO_ATOM_WITHIN16);
> +	    tcg_gen_addi_tl(cpu_vstart, cpu_vstart, 16 >> log2_esz);

The final iteration must set cpu_vstart to 0.
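
For instance, after the loop, something along the lines of (sketch):

    /* The whole register group has been processed: vstart ends up at 0. */
    tcg_gen_movi_tl(cpu_vstart, 0);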

Otherwise this looks quite plausible.


r~

Patch

diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc
index b9883a5d32..63d8b05bf1 100644
--- a/target/riscv/insn_trans/trans_rvv.c.inc
+++ b/target/riscv/insn_trans/trans_rvv.c.inc
@@ -1100,26 +1100,34 @@  GEN_VEXT_TRANS(vle64ff_v, MO_64, r2nfvm, ldff_op, ld_us_check)
 typedef void gen_helper_ldst_whole(TCGv_ptr, TCGv, TCGv_env, TCGv_i32);
 
 static bool ldst_whole_trans(uint32_t vd, uint32_t rs1, uint32_t nf,
-                             gen_helper_ldst_whole *fn,
-                             DisasContext *s)
+                             uint32_t log2_esz, gen_helper_ldst_whole *fn,
+                             DisasContext *s, bool is_load)
 {
-    TCGv_ptr dest;
-    TCGv base;
-    TCGv_i32 desc;
-
-    uint32_t data = FIELD_DP32(0, VDATA, NF, nf);
-    data = FIELD_DP32(data, VDATA, VM, 1);
-    dest = tcg_temp_new_ptr();
-    desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
-                                      s->cfg_ptr->vlenb, data));
-
-    base = get_gpr(s, rs1, EXT_NONE);
-    tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));
+    TCGv addr;
+    addr = get_address(s, rs1, 0);
+    uint32_t size = s->cfg_ptr->vlenb * nf;
 
-    mark_vs_dirty(s);
-
-    fn(dest, base, tcg_env, desc);
+    TCGv_i128 t16 = tcg_temp_new_i128();
 
+    /*
+     * Load/store minimum vlenb bytes per iteration.
+     * When possible do this atomically.
+     * Update vstart with the number of processed elements.
+     */
+    for (int i=0; i < size; i+=16) {
+        addr = get_address(s, rs1, i);
+        if (is_load) {
+	    tcg_gen_qemu_ld_i128(t16, addr, s->mem_idx,
+		    MO_LE | MO_128 | MO_ATOM_WITHIN16);
+	    tcg_gen_st_i128(t16, tcg_env, vreg_ofs(s, vd) + i);
+	    tcg_gen_addi_tl(cpu_vstart, cpu_vstart, 16 >> log2_esz);
+        } else {
+            tcg_gen_ld_i128(t16, tcg_env, vreg_ofs(s, vd) + i);
+            tcg_gen_qemu_st_i128(t16, addr, s->mem_idx,
+		    MO_LE | MO_128 | MO_ATOM_WITHIN16);
+	    tcg_gen_addi_tl(cpu_vstart, cpu_vstart, 16 >> log2_esz);
+        }
+    }
     finalize_rvv_inst(s);
     return true;
 }
@@ -1128,42 +1136,42 @@  static bool ldst_whole_trans(uint32_t vd, uint32_t rs1, uint32_t nf,
  * load and store whole register instructions ignore vtype and vl setting.
  * Thus, we don't need to check vill bit. (Section 7.9)
  */
-#define GEN_LDST_WHOLE_TRANS(NAME, ARG_NF)                                \
-static bool trans_##NAME(DisasContext *s, arg_##NAME * a)                 \
-{                                                                         \
-    if (require_rvv(s) &&                                                 \
-        QEMU_IS_ALIGNED(a->rd, ARG_NF)) {                                 \
-        return ldst_whole_trans(a->rd, a->rs1, ARG_NF,                    \
-                                gen_helper_##NAME, s);                    \
-    }                                                                     \
-    return false;                                                         \
-}
-
-GEN_LDST_WHOLE_TRANS(vl1re8_v,  1)
-GEN_LDST_WHOLE_TRANS(vl1re16_v, 1)
-GEN_LDST_WHOLE_TRANS(vl1re32_v, 1)
-GEN_LDST_WHOLE_TRANS(vl1re64_v, 1)
-GEN_LDST_WHOLE_TRANS(vl2re8_v,  2)
-GEN_LDST_WHOLE_TRANS(vl2re16_v, 2)
-GEN_LDST_WHOLE_TRANS(vl2re32_v, 2)
-GEN_LDST_WHOLE_TRANS(vl2re64_v, 2)
-GEN_LDST_WHOLE_TRANS(vl4re8_v,  4)
-GEN_LDST_WHOLE_TRANS(vl4re16_v, 4)
-GEN_LDST_WHOLE_TRANS(vl4re32_v, 4)
-GEN_LDST_WHOLE_TRANS(vl4re64_v, 4)
-GEN_LDST_WHOLE_TRANS(vl8re8_v,  8)
-GEN_LDST_WHOLE_TRANS(vl8re16_v, 8)
-GEN_LDST_WHOLE_TRANS(vl8re32_v, 8)
-GEN_LDST_WHOLE_TRANS(vl8re64_v, 8)
+#define GEN_LDST_WHOLE_TRANS(NAME, ETYPE, ARG_NF, IS_LOAD)                  \
+static bool trans_##NAME(DisasContext *s, arg_##NAME * a)                   \
+{                                                                           \
+    if (require_rvv(s) &&                                                   \
+        QEMU_IS_ALIGNED(a->rd, ARG_NF)) {                                   \
+        return ldst_whole_trans(a->rd, a->rs1, ARG_NF, ctzl(sizeof(ETYPE)), \
+                                gen_helper_##NAME, s, IS_LOAD);             \
+    }                                                                       \
+    return false;                                                           \
+}
+
+GEN_LDST_WHOLE_TRANS(vl1re8_v,  int8_t,  1, true)
+GEN_LDST_WHOLE_TRANS(vl1re16_v, int16_t, 1, true)
+GEN_LDST_WHOLE_TRANS(vl1re32_v, int32_t, 1, true)
+GEN_LDST_WHOLE_TRANS(vl1re64_v, int64_t, 1, true)
+GEN_LDST_WHOLE_TRANS(vl2re8_v,  int8_t,  2, true)
+GEN_LDST_WHOLE_TRANS(vl2re16_v, int16_t, 2, true)
+GEN_LDST_WHOLE_TRANS(vl2re32_v, int32_t, 2, true)
+GEN_LDST_WHOLE_TRANS(vl2re64_v, int64_t, 2, true)
+GEN_LDST_WHOLE_TRANS(vl4re8_v,  int8_t,  4, true)
+GEN_LDST_WHOLE_TRANS(vl4re16_v, int16_t, 4, true)
+GEN_LDST_WHOLE_TRANS(vl4re32_v, int32_t, 4, true)
+GEN_LDST_WHOLE_TRANS(vl4re64_v, int64_t, 4, true)
+GEN_LDST_WHOLE_TRANS(vl8re8_v,  int8_t,  8, true)
+GEN_LDST_WHOLE_TRANS(vl8re16_v, int16_t, 8, true)
+GEN_LDST_WHOLE_TRANS(vl8re32_v, int32_t, 8, true)
+GEN_LDST_WHOLE_TRANS(vl8re64_v, int64_t, 8, true)
 
 /*
  * The vector whole register store instructions are encoded similar to
  * unmasked unit-stride store of elements with EEW=8.
  */
-GEN_LDST_WHOLE_TRANS(vs1r_v, 1)
-GEN_LDST_WHOLE_TRANS(vs2r_v, 2)
-GEN_LDST_WHOLE_TRANS(vs4r_v, 4)
-GEN_LDST_WHOLE_TRANS(vs8r_v, 8)
+GEN_LDST_WHOLE_TRANS(vs1r_v, int8_t, 1, false)
+GEN_LDST_WHOLE_TRANS(vs2r_v, int8_t, 2, false)
+GEN_LDST_WHOLE_TRANS(vs4r_v, int8_t, 4, false)
+GEN_LDST_WHOLE_TRANS(vs8r_v, int8_t, 8, false)
 
 /*
  *** Vector Integer Arithmetic Instructions