diff mbox series

[for,9.0,v15,05/10] target/riscv: always clear vstart for ldst_whole insns

Message ID 20240314175704.478276-6-dbarboza@ventanamicro.com (mailing list archive)
State New, archived
Headers show
Series target/riscv: vector fixes | expand

Commit Message

Daniel Henrique Barboza March 14, 2024, 5:56 p.m. UTC
Commit 8ff8ac6329 added a conditional to guard the vext_ldst_whole()
helper if vstart >= evl. But by skipping the helper we're also not
setting vstart = 0 at the end of the insns, which is incorrect.

We'll move the conditional to vext_ldst_whole(), following in line with
the removal of all brconds vstart >= vl that the next patch will do. The
idea is to make the helpers responsible for their own vstart management.

Fix ldst_whole isns by:

- remove the brcond that skips the helper if vstart is >= evl;

- vext_ldst_whole() now does an early exit with the same check, where
  evl = (vlenb * nf) >> log2_esz, but the early exit will also clear
  vstart.

The 'width' param is now unneeded in ldst_whole_trans() and is also
removed. It was used for the evl calculation for the brcond and has no
other use now.  The 'width' is reflected in vext_ldst_whole() via
log2_esz, which is encoded by GEN_VEXT_LD_WHOLE() as
"ctzl(sizeof(ETYPE))".

Suggested-by: Max Chou <max.chou@sifive.com>
Fixes: 8ff8ac6329 ("target/riscv: rvv: Add missing early exit condition for whole register load/store")
Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
---
 target/riscv/insn_trans/trans_rvv.c.inc | 52 +++++++++++--------------
 target/riscv/vector_helper.c            |  5 +++
 2 files changed, 28 insertions(+), 29 deletions(-)

Comments

Max Chou March 15, 2024, 11:25 a.m. UTC | #1
Reviewed-by: Max Chou <max.chou@sifive.com>

On 2024/3/15 1:56 AM, Daniel Henrique Barboza wrote:
> Commit 8ff8ac6329 added a conditional to guard the vext_ldst_whole()
> helper if vstart >= evl. But by skipping the helper we're also not
> setting vstart = 0 at the end of the insns, which is incorrect.
>
> We'll move the conditional to vext_ldst_whole(), following in line with
> the removal of all brconds vstart >= vl that the next patch will do. The
> idea is to make the helpers responsible for their own vstart management.
>
> Fix ldst_whole isns by:
>
> - remove the brcond that skips the helper if vstart is >= evl;
>
> - vext_ldst_whole() now does an early exit with the same check, where
>    evl = (vlenb * nf) >> log2_esz, but the early exit will also clear
>    vstart.
>
> The 'width' param is now unneeded in ldst_whole_trans() and is also
> removed. It was used for the evl calculation for the brcond and has no
> other use now.  The 'width' is reflected in vext_ldst_whole() via
> log2_esz, which is encoded by GEN_VEXT_LD_WHOLE() as
> "ctzl(sizeof(ETYPE))".
>
> Suggested-by: Max Chou <max.chou@sifive.com>
> Fixes: 8ff8ac6329 ("target/riscv: rvv: Add missing early exit condition for whole register load/store")
> Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>
> ---
>   target/riscv/insn_trans/trans_rvv.c.inc | 52 +++++++++++--------------
>   target/riscv/vector_helper.c            |  5 +++
>   2 files changed, 28 insertions(+), 29 deletions(-)
>
> diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc
> index 52c26a7834..1366445e1f 100644
> --- a/target/riscv/insn_trans/trans_rvv.c.inc
> +++ b/target/riscv/insn_trans/trans_rvv.c.inc
> @@ -1097,13 +1097,9 @@ GEN_VEXT_TRANS(vle64ff_v, MO_64, r2nfvm, ldff_op, ld_us_check)
>   typedef void gen_helper_ldst_whole(TCGv_ptr, TCGv, TCGv_env, TCGv_i32);
>   
>   static bool ldst_whole_trans(uint32_t vd, uint32_t rs1, uint32_t nf,
> -                             uint32_t width, gen_helper_ldst_whole *fn,
> +                             gen_helper_ldst_whole *fn,
>                                DisasContext *s)
>   {
> -    uint32_t evl = s->cfg_ptr->vlenb * nf / width;
> -    TCGLabel *over = gen_new_label();
> -    tcg_gen_brcondi_tl(TCG_COND_GEU, cpu_vstart, evl, over);
> -
>       TCGv_ptr dest;
>       TCGv base;
>       TCGv_i32 desc;
> @@ -1120,8 +1116,6 @@ static bool ldst_whole_trans(uint32_t vd, uint32_t rs1, uint32_t nf,
>   
>       fn(dest, base, tcg_env, desc);
>   
> -    gen_set_label(over);
> -
>       return true;
>   }
>   
> @@ -1129,42 +1123,42 @@ static bool ldst_whole_trans(uint32_t vd, uint32_t rs1, uint32_t nf,
>    * load and store whole register instructions ignore vtype and vl setting.
>    * Thus, we don't need to check vill bit. (Section 7.9)
>    */
> -#define GEN_LDST_WHOLE_TRANS(NAME, ARG_NF, WIDTH)               \
> +#define GEN_LDST_WHOLE_TRANS(NAME, ARG_NF)                                \
>   static bool trans_##NAME(DisasContext *s, arg_##NAME * a)                 \
>   {                                                                         \
>       if (require_rvv(s) &&                                                 \
>           QEMU_IS_ALIGNED(a->rd, ARG_NF)) {                                 \
> -        return ldst_whole_trans(a->rd, a->rs1, ARG_NF, WIDTH,             \
> +        return ldst_whole_trans(a->rd, a->rs1, ARG_NF,                    \
>                                   gen_helper_##NAME, s);                    \
>       }                                                                     \
>       return false;                                                         \
>   }
>   
> -GEN_LDST_WHOLE_TRANS(vl1re8_v,  1, 1)
> -GEN_LDST_WHOLE_TRANS(vl1re16_v, 1, 2)
> -GEN_LDST_WHOLE_TRANS(vl1re32_v, 1, 4)
> -GEN_LDST_WHOLE_TRANS(vl1re64_v, 1, 8)
> -GEN_LDST_WHOLE_TRANS(vl2re8_v,  2, 1)
> -GEN_LDST_WHOLE_TRANS(vl2re16_v, 2, 2)
> -GEN_LDST_WHOLE_TRANS(vl2re32_v, 2, 4)
> -GEN_LDST_WHOLE_TRANS(vl2re64_v, 2, 8)
> -GEN_LDST_WHOLE_TRANS(vl4re8_v,  4, 1)
> -GEN_LDST_WHOLE_TRANS(vl4re16_v, 4, 2)
> -GEN_LDST_WHOLE_TRANS(vl4re32_v, 4, 4)
> -GEN_LDST_WHOLE_TRANS(vl4re64_v, 4, 8)
> -GEN_LDST_WHOLE_TRANS(vl8re8_v,  8, 1)
> -GEN_LDST_WHOLE_TRANS(vl8re16_v, 8, 2)
> -GEN_LDST_WHOLE_TRANS(vl8re32_v, 8, 4)
> -GEN_LDST_WHOLE_TRANS(vl8re64_v, 8, 8)
> +GEN_LDST_WHOLE_TRANS(vl1re8_v,  1)
> +GEN_LDST_WHOLE_TRANS(vl1re16_v, 1)
> +GEN_LDST_WHOLE_TRANS(vl1re32_v, 1)
> +GEN_LDST_WHOLE_TRANS(vl1re64_v, 1)
> +GEN_LDST_WHOLE_TRANS(vl2re8_v,  2)
> +GEN_LDST_WHOLE_TRANS(vl2re16_v, 2)
> +GEN_LDST_WHOLE_TRANS(vl2re32_v, 2)
> +GEN_LDST_WHOLE_TRANS(vl2re64_v, 2)
> +GEN_LDST_WHOLE_TRANS(vl4re8_v,  4)
> +GEN_LDST_WHOLE_TRANS(vl4re16_v, 4)
> +GEN_LDST_WHOLE_TRANS(vl4re32_v, 4)
> +GEN_LDST_WHOLE_TRANS(vl4re64_v, 4)
> +GEN_LDST_WHOLE_TRANS(vl8re8_v,  8)
> +GEN_LDST_WHOLE_TRANS(vl8re16_v, 8)
> +GEN_LDST_WHOLE_TRANS(vl8re32_v, 8)
> +GEN_LDST_WHOLE_TRANS(vl8re64_v, 8)
>   
>   /*
>    * The vector whole register store instructions are encoded similar to
>    * unmasked unit-stride store of elements with EEW=8.
>    */
> -GEN_LDST_WHOLE_TRANS(vs1r_v, 1, 1)
> -GEN_LDST_WHOLE_TRANS(vs2r_v, 2, 1)
> -GEN_LDST_WHOLE_TRANS(vs4r_v, 4, 1)
> -GEN_LDST_WHOLE_TRANS(vs8r_v, 8, 1)
> +GEN_LDST_WHOLE_TRANS(vs1r_v, 1)
> +GEN_LDST_WHOLE_TRANS(vs2r_v, 2)
> +GEN_LDST_WHOLE_TRANS(vs4r_v, 4)
> +GEN_LDST_WHOLE_TRANS(vs8r_v, 8)
>   
>   /*
>    *** Vector Integer Arithmetic Instructions
> diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
> index bcc553c0e2..1f4c276b21 100644
> --- a/target/riscv/vector_helper.c
> +++ b/target/riscv/vector_helper.c
> @@ -572,6 +572,11 @@ vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
>       uint32_t vlenb = riscv_cpu_cfg(env)->vlenb;
>       uint32_t max_elems = vlenb >> log2_esz;
>   
> +    if (env->vstart >= ((vlenb * nf) >> log2_esz)) {
> +        env->vstart = 0;
> +        return;
> +    }
> +
>       k = env->vstart / max_elems;
>       off = env->vstart % max_elems;
>
Alistair Francis March 18, 2024, 9:10 a.m. UTC | #2
On Fri, Mar 15, 2024 at 3:57 AM Daniel Henrique Barboza
<dbarboza@ventanamicro.com> wrote:
>
> Commit 8ff8ac6329 added a conditional to guard the vext_ldst_whole()
> helper if vstart >= evl. But by skipping the helper we're also not
> setting vstart = 0 at the end of the insns, which is incorrect.
>
> We'll move the conditional to vext_ldst_whole(), following in line with
> the removal of all brconds vstart >= vl that the next patch will do. The
> idea is to make the helpers responsible for their own vstart management.
>
> Fix ldst_whole isns by:
>
> - remove the brcond that skips the helper if vstart is >= evl;
>
> - vext_ldst_whole() now does an early exit with the same check, where
>   evl = (vlenb * nf) >> log2_esz, but the early exit will also clear
>   vstart.
>
> The 'width' param is now unneeded in ldst_whole_trans() and is also
> removed. It was used for the evl calculation for the brcond and has no
> other use now.  The 'width' is reflected in vext_ldst_whole() via
> log2_esz, which is encoded by GEN_VEXT_LD_WHOLE() as
> "ctzl(sizeof(ETYPE))".
>
> Suggested-by: Max Chou <max.chou@sifive.com>
> Fixes: 8ff8ac6329 ("target/riscv: rvv: Add missing early exit condition for whole register load/store")
> Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com>

Reviewed-by: Alistair Francis <alistair.francis@wdc.com>

Alistair

> ---
>  target/riscv/insn_trans/trans_rvv.c.inc | 52 +++++++++++--------------
>  target/riscv/vector_helper.c            |  5 +++
>  2 files changed, 28 insertions(+), 29 deletions(-)
>
> diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc
> index 52c26a7834..1366445e1f 100644
> --- a/target/riscv/insn_trans/trans_rvv.c.inc
> +++ b/target/riscv/insn_trans/trans_rvv.c.inc
> @@ -1097,13 +1097,9 @@ GEN_VEXT_TRANS(vle64ff_v, MO_64, r2nfvm, ldff_op, ld_us_check)
>  typedef void gen_helper_ldst_whole(TCGv_ptr, TCGv, TCGv_env, TCGv_i32);
>
>  static bool ldst_whole_trans(uint32_t vd, uint32_t rs1, uint32_t nf,
> -                             uint32_t width, gen_helper_ldst_whole *fn,
> +                             gen_helper_ldst_whole *fn,
>                               DisasContext *s)
>  {
> -    uint32_t evl = s->cfg_ptr->vlenb * nf / width;
> -    TCGLabel *over = gen_new_label();
> -    tcg_gen_brcondi_tl(TCG_COND_GEU, cpu_vstart, evl, over);
> -
>      TCGv_ptr dest;
>      TCGv base;
>      TCGv_i32 desc;
> @@ -1120,8 +1116,6 @@ static bool ldst_whole_trans(uint32_t vd, uint32_t rs1, uint32_t nf,
>
>      fn(dest, base, tcg_env, desc);
>
> -    gen_set_label(over);
> -
>      return true;
>  }
>
> @@ -1129,42 +1123,42 @@ static bool ldst_whole_trans(uint32_t vd, uint32_t rs1, uint32_t nf,
>   * load and store whole register instructions ignore vtype and vl setting.
>   * Thus, we don't need to check vill bit. (Section 7.9)
>   */
> -#define GEN_LDST_WHOLE_TRANS(NAME, ARG_NF, WIDTH)               \
> +#define GEN_LDST_WHOLE_TRANS(NAME, ARG_NF)                                \
>  static bool trans_##NAME(DisasContext *s, arg_##NAME * a)                 \
>  {                                                                         \
>      if (require_rvv(s) &&                                                 \
>          QEMU_IS_ALIGNED(a->rd, ARG_NF)) {                                 \
> -        return ldst_whole_trans(a->rd, a->rs1, ARG_NF, WIDTH,             \
> +        return ldst_whole_trans(a->rd, a->rs1, ARG_NF,                    \
>                                  gen_helper_##NAME, s);                    \
>      }                                                                     \
>      return false;                                                         \
>  }
>
> -GEN_LDST_WHOLE_TRANS(vl1re8_v,  1, 1)
> -GEN_LDST_WHOLE_TRANS(vl1re16_v, 1, 2)
> -GEN_LDST_WHOLE_TRANS(vl1re32_v, 1, 4)
> -GEN_LDST_WHOLE_TRANS(vl1re64_v, 1, 8)
> -GEN_LDST_WHOLE_TRANS(vl2re8_v,  2, 1)
> -GEN_LDST_WHOLE_TRANS(vl2re16_v, 2, 2)
> -GEN_LDST_WHOLE_TRANS(vl2re32_v, 2, 4)
> -GEN_LDST_WHOLE_TRANS(vl2re64_v, 2, 8)
> -GEN_LDST_WHOLE_TRANS(vl4re8_v,  4, 1)
> -GEN_LDST_WHOLE_TRANS(vl4re16_v, 4, 2)
> -GEN_LDST_WHOLE_TRANS(vl4re32_v, 4, 4)
> -GEN_LDST_WHOLE_TRANS(vl4re64_v, 4, 8)
> -GEN_LDST_WHOLE_TRANS(vl8re8_v,  8, 1)
> -GEN_LDST_WHOLE_TRANS(vl8re16_v, 8, 2)
> -GEN_LDST_WHOLE_TRANS(vl8re32_v, 8, 4)
> -GEN_LDST_WHOLE_TRANS(vl8re64_v, 8, 8)
> +GEN_LDST_WHOLE_TRANS(vl1re8_v,  1)
> +GEN_LDST_WHOLE_TRANS(vl1re16_v, 1)
> +GEN_LDST_WHOLE_TRANS(vl1re32_v, 1)
> +GEN_LDST_WHOLE_TRANS(vl1re64_v, 1)
> +GEN_LDST_WHOLE_TRANS(vl2re8_v,  2)
> +GEN_LDST_WHOLE_TRANS(vl2re16_v, 2)
> +GEN_LDST_WHOLE_TRANS(vl2re32_v, 2)
> +GEN_LDST_WHOLE_TRANS(vl2re64_v, 2)
> +GEN_LDST_WHOLE_TRANS(vl4re8_v,  4)
> +GEN_LDST_WHOLE_TRANS(vl4re16_v, 4)
> +GEN_LDST_WHOLE_TRANS(vl4re32_v, 4)
> +GEN_LDST_WHOLE_TRANS(vl4re64_v, 4)
> +GEN_LDST_WHOLE_TRANS(vl8re8_v,  8)
> +GEN_LDST_WHOLE_TRANS(vl8re16_v, 8)
> +GEN_LDST_WHOLE_TRANS(vl8re32_v, 8)
> +GEN_LDST_WHOLE_TRANS(vl8re64_v, 8)
>
>  /*
>   * The vector whole register store instructions are encoded similar to
>   * unmasked unit-stride store of elements with EEW=8.
>   */
> -GEN_LDST_WHOLE_TRANS(vs1r_v, 1, 1)
> -GEN_LDST_WHOLE_TRANS(vs2r_v, 2, 1)
> -GEN_LDST_WHOLE_TRANS(vs4r_v, 4, 1)
> -GEN_LDST_WHOLE_TRANS(vs8r_v, 8, 1)
> +GEN_LDST_WHOLE_TRANS(vs1r_v, 1)
> +GEN_LDST_WHOLE_TRANS(vs2r_v, 2)
> +GEN_LDST_WHOLE_TRANS(vs4r_v, 4)
> +GEN_LDST_WHOLE_TRANS(vs8r_v, 8)
>
>  /*
>   *** Vector Integer Arithmetic Instructions
> diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
> index bcc553c0e2..1f4c276b21 100644
> --- a/target/riscv/vector_helper.c
> +++ b/target/riscv/vector_helper.c
> @@ -572,6 +572,11 @@ vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
>      uint32_t vlenb = riscv_cpu_cfg(env)->vlenb;
>      uint32_t max_elems = vlenb >> log2_esz;
>
> +    if (env->vstart >= ((vlenb * nf) >> log2_esz)) {
> +        env->vstart = 0;
> +        return;
> +    }
> +
>      k = env->vstart / max_elems;
>      off = env->vstart % max_elems;
>
> --
> 2.44.0
>
>
diff mbox series

Patch

diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc
index 52c26a7834..1366445e1f 100644
--- a/target/riscv/insn_trans/trans_rvv.c.inc
+++ b/target/riscv/insn_trans/trans_rvv.c.inc
@@ -1097,13 +1097,9 @@  GEN_VEXT_TRANS(vle64ff_v, MO_64, r2nfvm, ldff_op, ld_us_check)
 typedef void gen_helper_ldst_whole(TCGv_ptr, TCGv, TCGv_env, TCGv_i32);
 
 static bool ldst_whole_trans(uint32_t vd, uint32_t rs1, uint32_t nf,
-                             uint32_t width, gen_helper_ldst_whole *fn,
+                             gen_helper_ldst_whole *fn,
                              DisasContext *s)
 {
-    uint32_t evl = s->cfg_ptr->vlenb * nf / width;
-    TCGLabel *over = gen_new_label();
-    tcg_gen_brcondi_tl(TCG_COND_GEU, cpu_vstart, evl, over);
-
     TCGv_ptr dest;
     TCGv base;
     TCGv_i32 desc;
@@ -1120,8 +1116,6 @@  static bool ldst_whole_trans(uint32_t vd, uint32_t rs1, uint32_t nf,
 
     fn(dest, base, tcg_env, desc);
 
-    gen_set_label(over);
-
     return true;
 }
 
@@ -1129,42 +1123,42 @@  static bool ldst_whole_trans(uint32_t vd, uint32_t rs1, uint32_t nf,
  * load and store whole register instructions ignore vtype and vl setting.
  * Thus, we don't need to check vill bit. (Section 7.9)
  */
-#define GEN_LDST_WHOLE_TRANS(NAME, ARG_NF, WIDTH)               \
+#define GEN_LDST_WHOLE_TRANS(NAME, ARG_NF)                                \
 static bool trans_##NAME(DisasContext *s, arg_##NAME * a)                 \
 {                                                                         \
     if (require_rvv(s) &&                                                 \
         QEMU_IS_ALIGNED(a->rd, ARG_NF)) {                                 \
-        return ldst_whole_trans(a->rd, a->rs1, ARG_NF, WIDTH,             \
+        return ldst_whole_trans(a->rd, a->rs1, ARG_NF,                    \
                                 gen_helper_##NAME, s);                    \
     }                                                                     \
     return false;                                                         \
 }
 
-GEN_LDST_WHOLE_TRANS(vl1re8_v,  1, 1)
-GEN_LDST_WHOLE_TRANS(vl1re16_v, 1, 2)
-GEN_LDST_WHOLE_TRANS(vl1re32_v, 1, 4)
-GEN_LDST_WHOLE_TRANS(vl1re64_v, 1, 8)
-GEN_LDST_WHOLE_TRANS(vl2re8_v,  2, 1)
-GEN_LDST_WHOLE_TRANS(vl2re16_v, 2, 2)
-GEN_LDST_WHOLE_TRANS(vl2re32_v, 2, 4)
-GEN_LDST_WHOLE_TRANS(vl2re64_v, 2, 8)
-GEN_LDST_WHOLE_TRANS(vl4re8_v,  4, 1)
-GEN_LDST_WHOLE_TRANS(vl4re16_v, 4, 2)
-GEN_LDST_WHOLE_TRANS(vl4re32_v, 4, 4)
-GEN_LDST_WHOLE_TRANS(vl4re64_v, 4, 8)
-GEN_LDST_WHOLE_TRANS(vl8re8_v,  8, 1)
-GEN_LDST_WHOLE_TRANS(vl8re16_v, 8, 2)
-GEN_LDST_WHOLE_TRANS(vl8re32_v, 8, 4)
-GEN_LDST_WHOLE_TRANS(vl8re64_v, 8, 8)
+GEN_LDST_WHOLE_TRANS(vl1re8_v,  1)
+GEN_LDST_WHOLE_TRANS(vl1re16_v, 1)
+GEN_LDST_WHOLE_TRANS(vl1re32_v, 1)
+GEN_LDST_WHOLE_TRANS(vl1re64_v, 1)
+GEN_LDST_WHOLE_TRANS(vl2re8_v,  2)
+GEN_LDST_WHOLE_TRANS(vl2re16_v, 2)
+GEN_LDST_WHOLE_TRANS(vl2re32_v, 2)
+GEN_LDST_WHOLE_TRANS(vl2re64_v, 2)
+GEN_LDST_WHOLE_TRANS(vl4re8_v,  4)
+GEN_LDST_WHOLE_TRANS(vl4re16_v, 4)
+GEN_LDST_WHOLE_TRANS(vl4re32_v, 4)
+GEN_LDST_WHOLE_TRANS(vl4re64_v, 4)
+GEN_LDST_WHOLE_TRANS(vl8re8_v,  8)
+GEN_LDST_WHOLE_TRANS(vl8re16_v, 8)
+GEN_LDST_WHOLE_TRANS(vl8re32_v, 8)
+GEN_LDST_WHOLE_TRANS(vl8re64_v, 8)
 
 /*
  * The vector whole register store instructions are encoded similar to
  * unmasked unit-stride store of elements with EEW=8.
  */
-GEN_LDST_WHOLE_TRANS(vs1r_v, 1, 1)
-GEN_LDST_WHOLE_TRANS(vs2r_v, 2, 1)
-GEN_LDST_WHOLE_TRANS(vs4r_v, 4, 1)
-GEN_LDST_WHOLE_TRANS(vs8r_v, 8, 1)
+GEN_LDST_WHOLE_TRANS(vs1r_v, 1)
+GEN_LDST_WHOLE_TRANS(vs2r_v, 2)
+GEN_LDST_WHOLE_TRANS(vs4r_v, 4)
+GEN_LDST_WHOLE_TRANS(vs8r_v, 8)
 
 /*
  *** Vector Integer Arithmetic Instructions
diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index bcc553c0e2..1f4c276b21 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -572,6 +572,11 @@  vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
     uint32_t vlenb = riscv_cpu_cfg(env)->vlenb;
     uint32_t max_elems = vlenb >> log2_esz;
 
+    if (env->vstart >= ((vlenb * nf) >> log2_esz)) {
+        env->vstart = 0;
+        return;
+    }
+
     k = env->vstart / max_elems;
     off = env->vstart % max_elems;