@@ -1100,25 +1100,56 @@ GEN_VEXT_TRANS(vle64ff_v, MO_64, r2nfvm, ldff_op, ld_us_check)
typedef void gen_helper_ldst_whole(TCGv_ptr, TCGv, TCGv_env, TCGv_i32);
static bool ldst_whole_trans(uint32_t vd, uint32_t rs1, uint32_t nf,
- gen_helper_ldst_whole *fn,
- DisasContext *s)
+ uint32_t log2_esz, gen_helper_ldst_whole *fn,
+ DisasContext *s, bool is_load)
{
- TCGv_ptr dest;
- TCGv base;
- TCGv_i32 desc;
-
- uint32_t data = FIELD_DP32(0, VDATA, NF, nf);
- data = FIELD_DP32(data, VDATA, VM, 1);
- dest = tcg_temp_new_ptr();
- desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
- s->cfg_ptr->vlenb, data));
-
- base = get_gpr(s, rs1, EXT_NONE);
- tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));
-
mark_vs_dirty(s);
- fn(dest, base, tcg_env, desc);
+ /*
+ * Load/store 16 bytes (one 128-bit access) per iteration.
+ * When possible do this atomically.
+ * Update vstart with the number of processed elements.
+ */
+ if (s->vstart_eq_zero) {
+ TCGv addr = tcg_temp_new();
+ uint32_t size = s->cfg_ptr->vlenb * nf;
+ TCGv_i128 t16 = tcg_temp_new_i128();
+ MemOp atomicity = MO_ATOM_NONE;
+ if (log2_esz == 0) {
+ atomicity = MO_ATOM_NONE;
+ } else {
+ atomicity = MO_ATOM_IFALIGN_PAIR;
+ }
+ for (int i = 0; i < size; i += 16) {
+ addr = get_address(s, rs1, i);
+ if (is_load) {
+ tcg_gen_qemu_ld_i128(t16, addr, s->mem_idx,
+ MO_LE | MO_128 | atomicity);
+ tcg_gen_st_i128(t16, tcg_env, vreg_ofs(s, vd) + i);
+ } else {
+ tcg_gen_ld_i128(t16, tcg_env, vreg_ofs(s, vd) + i);
+ tcg_gen_qemu_st_i128(t16, addr, s->mem_idx,
+ MO_LE | MO_128 | atomicity);
+ }
+ if (i == size - 16) {
+ tcg_gen_movi_tl(cpu_vstart, 0);
+ } else {
+ tcg_gen_addi_tl(cpu_vstart, cpu_vstart, 16 >> log2_esz);
+ }
+ }
+ } else {
+ TCGv_ptr dest;
+ TCGv base;
+ TCGv_i32 desc;
+ uint32_t data = FIELD_DP32(0, VDATA, NF, nf);
+ data = FIELD_DP32(data, VDATA, VM, 1);
+ dest = tcg_temp_new_ptr();
+ desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
+ s->cfg_ptr->vlenb, data));
+ base = get_gpr(s, rs1, EXT_NONE);
+ tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));
+ fn(dest, base, tcg_env, desc);
+ }
finalize_rvv_inst(s);
return true;
@@ -1128,42 +1159,42 @@ static bool ldst_whole_trans(uint32_t vd, uint32_t rs1, uint32_t nf,
* load and store whole register instructions ignore vtype and vl setting.
* Thus, we don't need to check vill bit. (Section 7.9)
*/
-#define GEN_LDST_WHOLE_TRANS(NAME, ARG_NF) \
-static bool trans_##NAME(DisasContext *s, arg_##NAME * a) \
-{ \
- if (require_rvv(s) && \
- QEMU_IS_ALIGNED(a->rd, ARG_NF)) { \
- return ldst_whole_trans(a->rd, a->rs1, ARG_NF, \
- gen_helper_##NAME, s); \
- } \
- return false; \
-}
-
-GEN_LDST_WHOLE_TRANS(vl1re8_v, 1)
-GEN_LDST_WHOLE_TRANS(vl1re16_v, 1)
-GEN_LDST_WHOLE_TRANS(vl1re32_v, 1)
-GEN_LDST_WHOLE_TRANS(vl1re64_v, 1)
-GEN_LDST_WHOLE_TRANS(vl2re8_v, 2)
-GEN_LDST_WHOLE_TRANS(vl2re16_v, 2)
-GEN_LDST_WHOLE_TRANS(vl2re32_v, 2)
-GEN_LDST_WHOLE_TRANS(vl2re64_v, 2)
-GEN_LDST_WHOLE_TRANS(vl4re8_v, 4)
-GEN_LDST_WHOLE_TRANS(vl4re16_v, 4)
-GEN_LDST_WHOLE_TRANS(vl4re32_v, 4)
-GEN_LDST_WHOLE_TRANS(vl4re64_v, 4)
-GEN_LDST_WHOLE_TRANS(vl8re8_v, 8)
-GEN_LDST_WHOLE_TRANS(vl8re16_v, 8)
-GEN_LDST_WHOLE_TRANS(vl8re32_v, 8)
-GEN_LDST_WHOLE_TRANS(vl8re64_v, 8)
+#define GEN_LDST_WHOLE_TRANS(NAME, ETYPE, ARG_NF, IS_LOAD) \
+static bool trans_##NAME(DisasContext *s, arg_##NAME * a) \
+{ \
+ if (require_rvv(s) && \
+ QEMU_IS_ALIGNED(a->rd, ARG_NF)) { \
+ return ldst_whole_trans(a->rd, a->rs1, ARG_NF, ctzl(sizeof(ETYPE)), \
+ gen_helper_##NAME, s, IS_LOAD); \
+ } \
+ return false; \
+}
+
+GEN_LDST_WHOLE_TRANS(vl1re8_v, int8_t, 1, true)
+GEN_LDST_WHOLE_TRANS(vl1re16_v, int16_t, 1, true)
+GEN_LDST_WHOLE_TRANS(vl1re32_v, int32_t, 1, true)
+GEN_LDST_WHOLE_TRANS(vl1re64_v, int64_t, 1, true)
+GEN_LDST_WHOLE_TRANS(vl2re8_v, int8_t, 2, true)
+GEN_LDST_WHOLE_TRANS(vl2re16_v, int16_t, 2, true)
+GEN_LDST_WHOLE_TRANS(vl2re32_v, int32_t, 2, true)
+GEN_LDST_WHOLE_TRANS(vl2re64_v, int64_t, 2, true)
+GEN_LDST_WHOLE_TRANS(vl4re8_v, int8_t, 4, true)
+GEN_LDST_WHOLE_TRANS(vl4re16_v, int16_t, 4, true)
+GEN_LDST_WHOLE_TRANS(vl4re32_v, int32_t, 4, true)
+GEN_LDST_WHOLE_TRANS(vl4re64_v, int64_t, 4, true)
+GEN_LDST_WHOLE_TRANS(vl8re8_v, int8_t, 8, true)
+GEN_LDST_WHOLE_TRANS(vl8re16_v, int16_t, 8, true)
+GEN_LDST_WHOLE_TRANS(vl8re32_v, int32_t, 8, true)
+GEN_LDST_WHOLE_TRANS(vl8re64_v, int64_t, 8, true)
/*
* The vector whole register store instructions are encoded similar to
* unmasked unit-stride store of elements with EEW=8.
*/
-GEN_LDST_WHOLE_TRANS(vs1r_v, 1)
-GEN_LDST_WHOLE_TRANS(vs2r_v, 2)
-GEN_LDST_WHOLE_TRANS(vs4r_v, 4)
-GEN_LDST_WHOLE_TRANS(vs8r_v, 8)
+GEN_LDST_WHOLE_TRANS(vs1r_v, int8_t, 1, false)
+GEN_LDST_WHOLE_TRANS(vs2r_v, int8_t, 2, false)
+GEN_LDST_WHOLE_TRANS(vs4r_v, int8_t, 4, false)
+GEN_LDST_WHOLE_TRANS(vs8r_v, int8_t, 8, false)
/*
*** Vector Integer Arithmetic Instructions
This patch replaces the helper function call with direct generation of TCG ops to emulate whole register loads and stores, in order to improve the performance of QEMU. The helper function is still used when vstart is not 0 at the beginning of the emulation of the whole register load or store. Signed-off-by: Paolo Savini <paolo.savini@embecosm.com> --- target/riscv/insn_trans/trans_rvv.c.inc | 125 +++++++++++++++--------- 1 file changed, 78 insertions(+), 47 deletions(-)