@@ -1100,26 +1100,34 @@ GEN_VEXT_TRANS(vle64ff_v, MO_64, r2nfvm, ldff_op, ld_us_check)
typedef void gen_helper_ldst_whole(TCGv_ptr, TCGv, TCGv_env, TCGv_i32);
static bool ldst_whole_trans(uint32_t vd, uint32_t rs1, uint32_t nf,
- gen_helper_ldst_whole *fn,
- DisasContext *s)
+ uint32_t log2_esz, gen_helper_ldst_whole *fn,
+ DisasContext *s, bool is_load)
{
- TCGv_ptr dest;
- TCGv base;
- TCGv_i32 desc;
-
- uint32_t data = FIELD_DP32(0, VDATA, NF, nf);
- data = FIELD_DP32(data, VDATA, VM, 1);
- dest = tcg_temp_new_ptr();
- desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
- s->cfg_ptr->vlenb, data));
-
- base = get_gpr(s, rs1, EXT_NONE);
- tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));
+    TCGv addr;
+    uint32_t size = s->cfg_ptr->vlenb * nf;
- mark_vs_dirty(s);
-
- fn(dest, base, tcg_env, desc);
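+    /* t16 stages one 16-byte chunk between memory and the vreg file. */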
+ TCGv_i128 t16 = tcg_temp_new_i128();
+    /*
+     * Load/store 16 bytes (the minimum vlenb) per iteration, so size is
+     * always a multiple of 16. When possible do this atomically.
+     * Update vstart with the number of processed elements.
+     */
+    for (uint32_t i = 0; i < size; i += 16) {
+ addr = get_address(s, rs1, i);
+        if (is_load) {
+            tcg_gen_qemu_ld_i128(t16, addr, s->mem_idx,
+                                 MO_LE | MO_128 | MO_ATOM_WITHIN16);
+            tcg_gen_st_i128(t16, tcg_env, vreg_ofs(s, vd) + i);
+        } else {
+            tcg_gen_ld_i128(t16, tcg_env, vreg_ofs(s, vd) + i);
+            tcg_gen_qemu_st_i128(t16, addr, s->mem_idx,
+                                 MO_LE | MO_128 | MO_ATOM_WITHIN16);
+        }
+        tcg_gen_addi_tl(cpu_vstart, cpu_vstart, 16 >> log2_esz);
+ }
finalize_rvv_inst(s);
return true;
}
@@ -1128,42 +1136,42 @@ static bool ldst_whole_trans(uint32_t vd, uint32_t rs1, uint32_t nf,
* load and store whole register instructions ignore vtype and vl setting.
* Thus, we don't need to check vill bit. (Section 7.9)
*/
-#define GEN_LDST_WHOLE_TRANS(NAME, ARG_NF) \
-static bool trans_##NAME(DisasContext *s, arg_##NAME * a) \
-{ \
- if (require_rvv(s) && \
- QEMU_IS_ALIGNED(a->rd, ARG_NF)) { \
- return ldst_whole_trans(a->rd, a->rs1, ARG_NF, \
- gen_helper_##NAME, s); \
- } \
- return false; \
-}
-
-GEN_LDST_WHOLE_TRANS(vl1re8_v, 1)
-GEN_LDST_WHOLE_TRANS(vl1re16_v, 1)
-GEN_LDST_WHOLE_TRANS(vl1re32_v, 1)
-GEN_LDST_WHOLE_TRANS(vl1re64_v, 1)
-GEN_LDST_WHOLE_TRANS(vl2re8_v, 2)
-GEN_LDST_WHOLE_TRANS(vl2re16_v, 2)
-GEN_LDST_WHOLE_TRANS(vl2re32_v, 2)
-GEN_LDST_WHOLE_TRANS(vl2re64_v, 2)
-GEN_LDST_WHOLE_TRANS(vl4re8_v, 4)
-GEN_LDST_WHOLE_TRANS(vl4re16_v, 4)
-GEN_LDST_WHOLE_TRANS(vl4re32_v, 4)
-GEN_LDST_WHOLE_TRANS(vl4re64_v, 4)
-GEN_LDST_WHOLE_TRANS(vl8re8_v, 8)
-GEN_LDST_WHOLE_TRANS(vl8re16_v, 8)
-GEN_LDST_WHOLE_TRANS(vl8re32_v, 8)
-GEN_LDST_WHOLE_TRANS(vl8re64_v, 8)
+#define GEN_LDST_WHOLE_TRANS(NAME, ETYPE, ARG_NF, IS_LOAD) \
+static bool trans_##NAME(DisasContext *s, arg_##NAME * a) \
+{ \
+ if (require_rvv(s) && \
+ QEMU_IS_ALIGNED(a->rd, ARG_NF)) { \
+ return ldst_whole_trans(a->rd, a->rs1, ARG_NF, ctzl(sizeof(ETYPE)), \
+ gen_helper_##NAME, s, IS_LOAD); \
+ } \
+ return false; \
+}
+
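+/*
+ * ETYPE only determines log2_esz, i.e. how many elements each 16-byte
+ * chunk adds to vstart; the bytes moved are identical for every EEW.
+ */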
+GEN_LDST_WHOLE_TRANS(vl1re8_v, int8_t, 1, true)
+GEN_LDST_WHOLE_TRANS(vl1re16_v, int16_t, 1, true)
+GEN_LDST_WHOLE_TRANS(vl1re32_v, int32_t, 1, true)
+GEN_LDST_WHOLE_TRANS(vl1re64_v, int64_t, 1, true)
+GEN_LDST_WHOLE_TRANS(vl2re8_v, int8_t, 2, true)
+GEN_LDST_WHOLE_TRANS(vl2re16_v, int16_t, 2, true)
+GEN_LDST_WHOLE_TRANS(vl2re32_v, int32_t, 2, true)
+GEN_LDST_WHOLE_TRANS(vl2re64_v, int64_t, 2, true)
+GEN_LDST_WHOLE_TRANS(vl4re8_v, int8_t, 4, true)
+GEN_LDST_WHOLE_TRANS(vl4re16_v, int16_t, 4, true)
+GEN_LDST_WHOLE_TRANS(vl4re32_v, int32_t, 4, true)
+GEN_LDST_WHOLE_TRANS(vl4re64_v, int64_t, 4, true)
+GEN_LDST_WHOLE_TRANS(vl8re8_v, int8_t, 8, true)
+GEN_LDST_WHOLE_TRANS(vl8re16_v, int16_t, 8, true)
+GEN_LDST_WHOLE_TRANS(vl8re32_v, int32_t, 8, true)
+GEN_LDST_WHOLE_TRANS(vl8re64_v, int64_t, 8, true)
/*
* The vector whole register store instructions are encoded similar to
* unmasked unit-stride store of elements with EEW=8.
*/
-GEN_LDST_WHOLE_TRANS(vs1r_v, 1)
-GEN_LDST_WHOLE_TRANS(vs2r_v, 2)
-GEN_LDST_WHOLE_TRANS(vs4r_v, 4)
-GEN_LDST_WHOLE_TRANS(vs8r_v, 8)
+GEN_LDST_WHOLE_TRANS(vs1r_v, int8_t, 1, false)
+GEN_LDST_WHOLE_TRANS(vs2r_v, int8_t, 2, false)
+GEN_LDST_WHOLE_TRANS(vs4r_v, int8_t, 4, false)
+GEN_LDST_WHOLE_TRANS(vs8r_v, int8_t, 8, false)
/*
*** Vector Integer Arithmetic Instructions
This patch aims at emulating the whole register loads and stores through direct generation of tcg operations rather than through the aid of a helper function. Signed-off-by: Paolo Savini <paolo.savini@embecosm.com> --- target/riscv/insn_trans/trans_rvv.c.inc | 104 +++++++++++++----------- 1 file changed, 56 insertions(+), 48 deletions(-)