
[v5,24/60] target/riscv: vector single-width averaging add and subtract

Message ID 20200312145900.2054-25-zhiwei_liu@c-sky.com (mailing list archive)
State New, archived
Series: target/riscv: support vector extension v0.7.1

Commit Message

LIU Zhiwei March 12, 2020, 2:58 p.m. UTC
Signed-off-by: LIU Zhiwei <zhiwei_liu@c-sky.com>
---
 target/riscv/helper.h                   |  17 ++++
 target/riscv/insn32.decode              |   5 +
 target/riscv/insn_trans/trans_rvv.inc.c |   7 ++
 target/riscv/vector_helper.c            | 129 ++++++++++++++++++++++++
 4 files changed, 158 insertions(+)

Comments

Richard Henderson March 14, 2020, 8:14 a.m. UTC | #1
On 3/12/20 7:58 AM, LIU Zhiwei wrote:
> +/* Vector Single-Width Averaging Add and Subtract */
> +static inline uint8_t get_round(CPURISCVState *env, uint64_t v, uint8_t shift)
> +{
> +    uint8_t d = extract64(v, shift, 1);
> +    uint8_t d1;
> +    uint64_t D1, D2;
> +    int mod = env->vxrm;
> +
> +    if (shift == 0 || shift > 64) {
> +        return 0;
> +    }
> +
> +    d1 = extract64(v, shift - 1, 1);
> +    D1 = extract64(v, 0, shift);
> +    if (mod == 0) { /* round-to-nearest-up (add +0.5 LSB) */
> +        return d1;
> +    } else if (mod == 1) { /* round-to-nearest-even */
> +        if (shift > 1) {
> +            D2 = extract64(v, 0, shift - 1);
> +            return d1 & ((D2 != 0) | d);
> +        } else {
> +            return d1 & d;
> +        }
> +    } else if (mod == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
> +        return !d & (D1 != 0);
> +    }
> +    return 0; /* round-down (truncate) */
> +}
> +
> +static inline int8_t aadd8(CPURISCVState *env, int8_t a, int8_t b)
> +{
> +    int16_t res = (int16_t)a + (int16_t)b;
> +    uint8_t round = get_round(env, res, 1);
> +    res   = (res >> 1) + round;
> +    return res;
> +}

I think this is a suboptimal way to arrange things.  It leaves the vxrm lookup
inside of the main loop, while it is obviously loop invariant.
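
Concretely (a simplified sketch, not the literal macro expansion), the
per-element path in v5 looks like:

    for (i = 0; i < vl; i++) {
        ...
        /* aadd8() -> get_round() -> env->vxrm: the rounding mode is
           re-read from env for every element, even though it cannot
           change during the loop. */
        *((int8_t *)vd + H1(i)) = aadd8(env, s2, s1);
    }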

I think you should have 4 versions of aadd8, for each of the rounding modes,

> +RVVCALL(OPIVV2_ENV, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd8)

then use this, or something like it, to define 4 functions containing main
loops, which will get the helper above inlined.

Then use a final outermost wrapper to select one of the 4 functions based on
env->vxrm.


r~
Richard Henderson March 14, 2020, 8:25 a.m. UTC | #2
On 3/14/20 1:14 AM, Richard Henderson wrote:
> I think you should have 4 versions of aadd8, for each of the rounding modes,
> 
>> +RVVCALL(OPIVV2_ENV, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd8)
> 
> then use this, or something like it, to define 4 functions containing main
> loops, which will get the helper above inlined.

Alternately, a set of inlines, where a (constant) vxrm is passed down from above.

> Then use a final outermost wrapper to select one of the 4 functions based on
> env->vxrm.

The outermost wrapper could look like

    switch (env->vxrm) {
    case 0:  somefunc(some, args, 0); break;
    case 1:  somefunc(some, args, 1); break;
    case 2:  somefunc(some, args, 2); break;
    default: somefunc(some, args, 3); break;
    }

so that somefunc (and its subroutines) are expanded with a constant, and we
switch on that constant at the outermost level.


r~
LIU Zhiwei March 14, 2020, 11:12 p.m. UTC | #3
On 2020/3/14 16:25, Richard Henderson wrote:
> On 3/14/20 1:14 AM, Richard Henderson wrote:
>> I think you should have 4 versions of aadd8, for each of the rounding modes,
>>
>>> +RVVCALL(OPIVV2_ENV, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd8)
>> then use this, or something like it, to define 4 functions containing main
>> loops, which will get the helper above inlined.
> Alternately, a set of inlines, where a (constant) vxrm is passed down from above.

I am not sure whether I get it. In my opinion, the code should be modified like this:

static inline int8_t aadd8_rnu(CPURISCVState *env, int8_t a, int8_t b)
{
     int16_t res = (int16_t)a + (int16_t)b;
     uint8_t round = res & 0x1;
     res   = (res >> 1) + round;
     return res;
}

static inline int8_t aadd8_rne(CPURISCVState *env, int8_t a, int8_t b)
{
     int16_t res = (int16_t)a + (int16_t)b;
     uint8_t round = ((res & 0x3) == 0x3);
     res   = (res >> 1) + round;
     return res;
}

static inline int8_t aadd8_rdn(CPURISCVState *env, int8_t a, int8_t b)
{
     int16_t res = (int16_t)a + (int16_t)b;
     res   = (res >> 1);
     return res;
}

static inline int8_t aadd8_rod(CPURISCVState *env, int8_t a, int8_t b)
{
     int16_t res = (int16_t)a + (int16_t)b;
     uint8_t round = ((res & 0x3) == 0x1);
    res   = (res >> 1) + round;
     return res;
}

RVVCALL(OPIVV2_ENV, vaadd_vv_b_rnu, OP_SSS_B, H1, H1, H1, aadd8_rnu)
RVVCALL(OPIVV2_ENV, vaadd_vv_b_rne, OP_SSS_B, H1, H1, H1, aadd8_rne)
RVVCALL(OPIVV2_ENV, vaadd_vv_b_rdn, OP_SSS_B, H1, H1, H1, aadd8_rdn)
RVVCALL(OPIVV2_ENV, vaadd_vv_b_rod, OP_SSS_B, H1, H1, H1, aadd8_rod)

void do_vext_vv_env(void *vd, void *v0, void *vs1,
                     void *vs2, CPURISCVState *env, uint32_t desc,
                     uint32_t esz, uint32_t dsz,
                     opivv2_fn *fn, clear_fn *clearfn)
{
     uint32_t vlmax = vext_maxsz(desc) / esz;
     uint32_t mlen = vext_mlen(desc);
     uint32_t vm = vext_vm(desc);
     uint32_t vl = env->vl;
     uint32_t i;
     for (i = 0; i < vl; i++) {
         if (!vm && !vext_elem_mask(v0, mlen, i)) {
             continue;
         }
         fn(vd, vs1, vs2, i, env);
     }
     if (i != 0) {
         clearfn(vd, vl, vl * dsz,  vlmax * dsz);
     }
}

#define GEN_VEXT_VV_ENV(NAME, ESZ, DSZ, CLEAR_FN)         \
void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
                   void *vs2, CPURISCVState *env,          \
                   uint32_t desc)                          \
{                                                         \
     static opivv2_fn *fns[4] = {                          \
         NAME##_rnu, NAME##_rne,                           \
         NAME##_rdn, NAME##_rod                            \
     };                                                    \
     do_vext_vv_env(vd, v0, vs1, vs2, env, desc,           \
                    ESZ, DSZ, fns[env->vxrm],              \
                    CLEAR_FN);                             \
}

Is it true?

Zhiwei

>> Then use a final outermost wrapper to select one of the 4 functions based on
>> env->vxrm.
> The outermost wrapper could look like
>
>      switch (env->vxrm) {
>      case 0:  somefunc(some, args, 0); break;
>      case 1:  somefunc(some, args, 1); break;
>      case 2:  somefunc(some, args, 2); break;
>      default: somefunc(some, args, 3); break;
>      }
>
> so that somefunc (and its subroutines) are expanded with a constant, and we
> switch on that constant at the outermost level.
>
>
> r~
Richard Henderson March 15, 2020, 1 a.m. UTC | #4
On 3/14/20 4:12 PM, LIU Zhiwei wrote:
> I am not sure whether I get it. In my opinion, the code should be modified like this:
> 
> static inline int8_t aadd8_rnu(CPURISCVState *env, int8_t a, int8_t b)
> {
>     int16_t res = (int16_t)a + (int16_t)b;
>     uint8_t round = res & 0x1;
>     res   = (res >> 1) + round;
>     return res;
> }
> 
> static inline int8_t aadd8_rne(CPURISCVState *env, int8_t a, int8_t b)
> {
>     int16_t res = (int16_t)a + (int16_t)b;
>     uint8_t round = ((res & 0x3) == 0x3);
>     res   = (res >> 1) + round;
>     return res;
> }
> 
> static inline int8_t aadd8_rdn(CPURISCVState *env, int8_t a, int8_t b)
> {
>     int16_t res = (int16_t)a + (int16_t)b;
>     res   = (res >> 1);
>     return res;
> }
> 
> static inline int8_t aadd8_rod(CPURISCVState *env, int8_t a, int8_t b)
> {
>     int16_t res = (int16_t)a + (int16_t)b;
>     uint8_t round = ((res & 0x3) == 0x1);
>    res   = (res >> 1) + round;
>     return res;
> }
> 
> RVVCALL(OPIVV2_ENV, vaadd_vv_b_rnu, OP_SSS_B, H1, H1, H1, aadd8_rnu)
> RVVCALL(OPIVV2_ENV, vaadd_vv_b_rne, OP_SSS_B, H1, H1, H1, aadd8_rne)
> RVVCALL(OPIVV2_ENV, vaadd_vv_b_rdn, OP_SSS_B, H1, H1, H1, aadd8_rdn)
> RVVCALL(OPIVV2_ENV, vaadd_vv_b_rod, OP_SSS_B, H1, H1, H1, aadd8_rod)
> 
> void do_vext_vv_env(void *vd, void *v0, void *vs1,
>                     void *vs2, CPURISCVState *env, uint32_t desc,
>                     uint32_t esz, uint32_t dsz,
>                     opivv2_fn *fn, clear_fn *clearfn)
> {
>     uint32_t vlmax = vext_maxsz(desc) / esz;
>     uint32_t mlen = vext_mlen(desc);
>     uint32_t vm = vext_vm(desc);
>     uint32_t vl = env->vl;
>     uint32_t i;
>     for (i = 0; i < vl; i++) {
>         if (!vm && !vext_elem_mask(v0, mlen, i)) {
>             continue;
>         }
>         fn(vd, vs1, vs2, i, env);
>     }
>     if (i != 0) {
>     clearfn(vd, vl, vl * dsz,  vlmax * dsz);
>     }
> }
> 
> #define GEN_VEXT_VV_ENV(NAME, ESZ, DSZ, CLEAR_FN)         \
> void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
>                   void *vs2, CPURISCVState *env,          \
>                   uint32_t desc)                          \
> {                                                         \
>     static opivv2_fn *fns[4] = {                          \
>         NAME##_rnu, NAME##_rne,                           \
>         NAME##_rdn, NAME##_rod                            \
>     };                                                    \
>     do_vext_vv_env(vd, v0, vs1, vs2, env, desc,           \
>                    ESZ, DSZ, fns[env->vxrm],              \
>                    CLEAR_FN);                             \
> }
> 
> Is it true?

While that does look good for this case, there are many other uses of
get_round(), and it may not be quite as simple there.

My suggestion was

static inline int32_t aadd32(int vxrm, int32_t a, int32_t b)
{
    int64_t res = (int64_t)a + b;
    uint8_t round = get_round(vxrm, res, 1);

    return (res >> 1) + round;
}

static inline int64_t aadd64(int vxrm, int64_t a, int64_t b)
{
    int64_t res = a + b;
    uint8_t round = get_round(vxrm, res, 1);
    int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;

    /* With signed overflow, bit 64 is inverse of bit 63. */
    return ((res >> 1) ^ over) + round;
}
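
/*
 * Sanity check of the overflow handling above with concrete values
 * (illustrative only, not part of the patch):
 *   a = b = INT64_MAX, true average = INT64_MAX
 *   res               = a + b = -2                        (wrapped)
 *   over              = (res ^ a) & (res ^ b) & INT64_MIN = INT64_MIN
 *   res >> 1          = -1
 *   (res >> 1) ^ over = INT64_MAX, round (rnu) = bit 0 of res = 0
 * so the helper returns INT64_MAX as expected.
 */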

RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)

static inline void
vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
             uint32_t vl, uint32_t vm, uint32_t mlen, int vxrm,
             opivv2_rm_fn *fn)
{
    for (uint32_t i = 0; i < vl; i++) {
        if (!vm && !vext_elem_mask(v0, mlen, i)) {
            continue;
        }
        fn(vd, vs1, vs2, i, vxrm);
    }
}

static inline void
vext_vv_rm_2(void *vd, void *v0, void *vs1,
             void *vs2, CPURISCVState *env, uint32_t desc,
             uint32_t esz, uint32_t dsz,
             opivv2_rm_fn *fn, clear_fn *clearfn)
{
    uint32_t vlmax = vext_maxsz(desc) / esz;
    uint32_t mlen = vext_mlen(desc);
    uint32_t vm = vext_vm(desc);
    uint32_t vl = env->vl;

    if (vl == 0) {
        return;
    }

    switch (env->vxrm) {
    case 0: /* rnu */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     vl, vm, mlen, 0, fn);
        break;
    case 1: /* rne */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     vl, vm, mlen, 1, fn);
        break;
    case 2: /* rdn */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     vl, vm, mlen, 2, fn);
        break;
    default: /* rod */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     vl, vm, mlen, 3, fn);
        break;
    }

    clearfn(vd, vl, vl * dsz,  vlmax * dsz);
}

From vext_vv_rm_2, a constant is passed down all of the inline functions, so
that a constant arrives in get_round() at the bottom of the call chain.  At
which point all of the expressions get folded by the compiler and we *should*
get very similar generated code as to what you have above.
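
For instance, at shift == 1 the constant-folded get_round() should match
the hand-written helpers from your earlier mail:

    vxrm == 0 (rnu): round = res & 0x1
    vxrm == 1 (rne): round = ((res & 0x3) == 0x3)
    vxrm == 2 (rdn): round = 0
    vxrm == 3 (rod): round = ((res & 0x3) == 0x1)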


r~
LIU Zhiwei March 15, 2020, 11:23 p.m. UTC | #5
On 2020/3/15 9:00, Richard Henderson wrote:
> On 3/14/20 4:12 PM, LIU Zhiwei wrote:
>> I am not sure whether I get it. In my opinion, the code should be modified like this:
>>
>> static inline int8_t aadd8_rnu(CPURISCVState *env, int8_t a, int8_t b)
>> {
>>      int16_t res = (int16_t)a + (int16_t)b;
>>      uint8_t round = res & 0x1;
>>      res   = (res >> 1) + round;
>>      return res;
>> }
>>
>> static inline int8_t aadd8_rne(CPURISCVState *env, int8_t a, int8_t b)
>> {
>>      int16_t res = (int16_t)a + (int16_t)b;
>>      uint8_t round = ((res & 0x3) == 0x3);
>>      res   = (res >> 1) + round;
>>      return res;
>> }
>>
>> static inline int8_t aadd8_rdn(CPURISCVState *env, int8_t a, int8_t b)
>> {
>>      int16_t res = (int16_t)a + (int16_t)b;
>>      res   = (res >> 1);
>>      return res;
>> }
>>
>> static inline int8_t aadd8_rod(CPURISCVState *env, int8_t a, int8_t b)
>> {
>>      int16_t res = (int16_t)a + (int16_t)b;
>>      uint8_t round = ((res & 0x3) == 0x1);
>>     res   = (res >> 1) + round;
>>      return res;
>> }
>>
>> RVVCALL(OPIVV2_ENV, vaadd_vv_b_rnu, OP_SSS_B, H1, H1, H1, aadd8_rnu)
>> RVVCALL(OPIVV2_ENV, vaadd_vv_b_rne, OP_SSS_B, H1, H1, H1, aadd8_rne)
>> RVVCALL(OPIVV2_ENV, vaadd_vv_b_rdn, OP_SSS_B, H1, H1, H1, aadd8_rdn)
>> RVVCALL(OPIVV2_ENV, vaadd_vv_b_rod, OP_SSS_B, H1, H1, H1, aadd8_rod)
>>
>> void do_vext_vv_env(void *vd, void *v0, void *vs1,
>>                      void *vs2, CPURISCVState *env, uint32_t desc,
>>                      uint32_t esz, uint32_t dsz,
>>                      opivv2_fn *fn, clear_fn *clearfn)
>> {
>>      uint32_t vlmax = vext_maxsz(desc) / esz;
>>      uint32_t mlen = vext_mlen(desc);
>>      uint32_t vm = vext_vm(desc);
>>      uint32_t vl = env->vl;
>>      uint32_t i;
>>      for (i = 0; i < vl; i++) {
>>          if (!vm && !vext_elem_mask(v0, mlen, i)) {
>>              continue;
>>          }
>>          fn(vd, vs1, vs2, i, env);
>>      }
>>      if (i != 0) {
>>          clearfn(vd, vl, vl * dsz,  vlmax * dsz);
>>      }
>> }
>>
>> #define GEN_VEXT_VV_ENV(NAME, ESZ, DSZ, CLEAR_FN)         \
>> void HELPER(NAME)(void *vd, void *v0, void *vs1,          \
>>                    void *vs2, CPURISCVState *env,          \
>>                    uint32_t desc)                          \
>> {                                                         \
>>      static opivv2_fn *fns[4] = {                          \
>>          NAME##_rnu, NAME##_rne,                           \
>>          NAME##_rdn, NAME##_rod                            \
>>      };                                                    \
>>      do_vext_vv_env(vd, v0, vs1, vs2, env, desc,           \
>>                     ESZ, DSZ, fns[env->vxrm],              \
>>                     CLEAR_FN);                             \
>> }
>>
>> Is it true?
> While that does look good for this case, there are many other uses of
> get_round(), and it may not be quite as simple there.
>
> My suggestion was
>
> static inline int32_t aadd32(int vxrm, int32_t a, int32_t b)
> {
>      int64_t res = (int64_t)a + b;
>      uint8_t round = get_round(vxrm, res, 1);
>
>      return (res >> 1) + round;
> }
>
> static inline int64_t aadd64(int vxrm, int64_t a, int64_t b)
> {
>      int64_t res = a + b;
>      uint8_t round = get_round(vxrm, res, 1);
>      int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
>
>      /* With signed overflow, bit 64 is inverse of bit 63. */
>      return ((res >> 1) ^ over) + round;
> }
>
> RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
> RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
> RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
> RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
>
> static inline void
> vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
>               uint32_t vl, uint32_t vm, uint32_t mlen, int vxrm,
>               opivv2_rm_fn *fn)
> {
>      for (uint32_t i = 0; i < vl; i++) {
>          if (!vm && !vext_elem_mask(v0, mlen, i)) {
>              continue;
>          }
>          fn(vd, vs1, vs2, i, vxrm);
>      }
> }
>
> static inline void
> vext_vv_rm_2(void *vd, void *v0, void *vs1,
>               void *vs2, CPURISCVState *env, uint32_t desc,
>               uint32_t esz, uint32_t dsz,
>               opivv2_rm_fn *fn, clear_fn *clearfn)
> {
>      uint32_t vlmax = vext_maxsz(desc) / esz;
>      uint32_t mlen = vext_mlen(desc);
>      uint32_t vm = vext_vm(desc);
>      uint32_t vl = env->vl;
>
>      if (vl == 0) {
>          return;
>      }
>
>      switch (env->vxrm) {
>      case 0: /* rnu */
>          vext_vv_rm_1(vd, v0, vs1, vs2,
>                       vl, vm, mlen, 0, fn);
>          break;
>      case 1: /* rne */
>          vext_vv_rm_1(vd, v0, vs1, vs2,
>                       vl, vm, mlen, 1, fn);
>          break;
>      case 2: /* rdn */
>          vext_vv_rm_1(vd, v0, vs1, vs2,
>                       vl, vm, mlen, 2, fn);
>          break;
>      default: /* rod */
>          vext_vv_rm_1(vd, v0, vs1, vs2,
>                       vl, vm, mlen, 3, fn);
>          break;
>      }
>
>      clearfn(vd, vl, vl * dsz,  vlmax * dsz);
> }
>
> From vext_vv_rm_2, a constant is passed down all of the inline functions, so
> that a constant arrives in get_round() at the bottom of the call chain.  At
> which point all of the expressions get folded by the compiler and we *should*
> get very similar generated code as to what you have above.
Yes, it will be much better.

I still have one question here.

Many other fixed-point instructions also need vxsat besides vxrm.

In those cases, can I just define OPIVV2_RM like this:

#define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
static inline void                                                  \
do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
           CPURISCVState *env, int vxrm)                             \
{                                                                   \
     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
}

static inline int32_t aadd32(__attribute__((unused)) CPURISCVState *env,
                             int vxrm, int32_t a, int32_t b)
{
     int64_t res = (int64_t)a + b;
     uint8_t round = get_round(vxrm, res, 1);

     return (res >> 1) + round;
}


In this way, I can write just one OPIVV2_RM instead of (OPIVV2_RM, 
OPIVV2_RM_ENV, OPIVV2_ENV).
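
For example (just a sketch, not code from this series; the name and body
are illustrative), a helper that also needs to set vxsat could then share
the same interface:

static inline int8_t sadd8_sketch(CPURISCVState *env, int vxrm,
                                  int8_t a, int8_t b)
{
    /* illustrative saturating add: vxrm is unused, vxsat comes from env */
    int16_t res = (int16_t)a + b;

    if (res > INT8_MAX) {
        res = INT8_MAX;
        env->vxsat = 1;
    } else if (res < INT8_MIN) {
        res = INT8_MIN;
        env->vxsat = 1;
    }
    return res;
}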

Zhiwei

>
> r~
Richard Henderson March 15, 2020, 11:27 p.m. UTC | #6
On 3/15/20 4:23 PM, LIU Zhiwei wrote:
> Many other fixed-point instructions also need vxsat besides vxrm.

Ah yes.

> In those cases, can I just define OPIVV2_RM like this:
> 
> #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP)     \
> static inline void                                                  \
> do_##NAME(void *vd, void *vs1, void *vs2, int i,                    \
>           CPURISCVState *env, int vxrm)                             \
> {                                                                   \
>     TX1 s1 = *((T1 *)vs1 + HS1(i));                                 \
>     TX2 s2 = *((T2 *)vs2 + HS2(i));                                 \
>     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                    \
> }
> 
> static inline int32_t aadd32(__attribute__((unused)) CPURISCVState *env,
>                              int vxrm, int32_t a, int32_t b)

You can drop the unused.  We don't turn on warnings for unused arguments, as we
have a *lot* of them for exactly this reason -- keeping a common functional
interface.


> {
>     int64_t res = (int64_t)a + b;
>     uint8_t round = get_round(vxrm, res, 1);
> 
>     return (res >> 1) + round;
> }
> 
> 
> In this way, I can write just one OPIVV2_RM instead of (OPIVV2_RM,
> OPIVV2_RM_ENV, OPIVV2_ENV).

Yes, that's fine.


r~

Patch

diff --git a/target/riscv/helper.h b/target/riscv/helper.h
index 95da00d365..d3837d2ca4 100644
--- a/target/riscv/helper.h
+++ b/target/riscv/helper.h
@@ -707,3 +707,20 @@  DEF_HELPER_6(vssub_vx_b, void, ptr, ptr, tl, ptr, env, i32)
 DEF_HELPER_6(vssub_vx_h, void, ptr, ptr, tl, ptr, env, i32)
 DEF_HELPER_6(vssub_vx_w, void, ptr, ptr, tl, ptr, env, i32)
 DEF_HELPER_6(vssub_vx_d, void, ptr, ptr, tl, ptr, env, i32)
+
+DEF_HELPER_6(vaadd_vv_b, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vaadd_vv_h, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vaadd_vv_w, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vaadd_vv_d, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vasub_vv_b, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vasub_vv_h, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vasub_vv_w, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vasub_vv_d, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vaadd_vx_b, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vaadd_vx_h, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vaadd_vx_w, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vaadd_vx_d, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vasub_vx_b, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vasub_vx_h, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vasub_vx_w, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vasub_vx_d, void, ptr, ptr, tl, ptr, env, i32)
diff --git a/target/riscv/insn32.decode b/target/riscv/insn32.decode
index 44baadf582..0227a16b16 100644
--- a/target/riscv/insn32.decode
+++ b/target/riscv/insn32.decode
@@ -412,6 +412,11 @@  vssubu_vv       100010 . ..... ..... 000 ..... 1010111 @r_vm
 vssubu_vx       100010 . ..... ..... 100 ..... 1010111 @r_vm
 vssub_vv        100011 . ..... ..... 000 ..... 1010111 @r_vm
 vssub_vx        100011 . ..... ..... 100 ..... 1010111 @r_vm
+vaadd_vv        100100 . ..... ..... 000 ..... 1010111 @r_vm
+vaadd_vx        100100 . ..... ..... 100 ..... 1010111 @r_vm
+vaadd_vi        100100 . ..... ..... 011 ..... 1010111 @r_vm
+vasub_vv        100110 . ..... ..... 000 ..... 1010111 @r_vm
+vasub_vx        100110 . ..... ..... 100 ..... 1010111 @r_vm
 
 vsetvli         0 ........... ..... 111 ..... 1010111  @r2_zimm
 vsetvl          1000000 ..... ..... 111 ..... 1010111  @r
diff --git a/target/riscv/insn_trans/trans_rvv.inc.c b/target/riscv/insn_trans/trans_rvv.inc.c
index ad55766b98..9988fad2fe 100644
--- a/target/riscv/insn_trans/trans_rvv.inc.c
+++ b/target/riscv/insn_trans/trans_rvv.inc.c
@@ -1521,3 +1521,10 @@  GEN_OPIVX_TRANS(vssubu_vx,  opivx_check)
 GEN_OPIVX_TRANS(vssub_vx,  opivx_check)
 GEN_OPIVI_TRANS(vsaddu_vi, 1, vsaddu_vx, opivx_check)
 GEN_OPIVI_TRANS(vsadd_vi, 0, vsadd_vx, opivx_check)
+
+/* Vector Single-Width Averaging Add and Subtract */
+GEN_OPIVV_TRANS(vaadd_vv, opivv_check)
+GEN_OPIVV_TRANS(vasub_vv, opivv_check)
+GEN_OPIVX_TRANS(vaadd_vx,  opivx_check)
+GEN_OPIVX_TRANS(vasub_vx,  opivx_check)
+GEN_OPIVI_TRANS(vaadd_vi, 0, vaadd_vx, opivx_check)
diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index c7b8c1bff4..b0a7a3b6e4 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -2291,3 +2291,132 @@  GEN_VEXT_VX_ENV(vssub_vx_b, 1, 1, clearb)
 GEN_VEXT_VX_ENV(vssub_vx_h, 2, 2, clearh)
 GEN_VEXT_VX_ENV(vssub_vx_w, 4, 4, clearl)
 GEN_VEXT_VX_ENV(vssub_vx_d, 8, 8, clearq)
+
+/* Vector Single-Width Averaging Add and Subtract */
+static inline uint8_t get_round(CPURISCVState *env, uint64_t v, uint8_t shift)
+{
+    uint8_t d = extract64(v, shift, 1);
+    uint8_t d1;
+    uint64_t D1, D2;
+    int mod = env->vxrm;
+
+    if (shift == 0 || shift > 64) {
+        return 0;
+    }
+
+    d1 = extract64(v, shift - 1, 1);
+    D1 = extract64(v, 0, shift);
+    if (mod == 0) { /* round-to-nearest-up (add +0.5 LSB) */
+        return d1;
+    } else if (mod == 1) { /* round-to-nearest-even */
+        if (shift > 1) {
+            D2 = extract64(v, 0, shift - 1);
+            return d1 & ((D2 != 0) | d);
+        } else {
+            return d1 & d;
+        }
+    } else if (mod == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
+        return !d & (D1 != 0);
+    }
+    return 0; /* round-down (truncate) */
+}
+
+static inline int8_t aadd8(CPURISCVState *env, int8_t a, int8_t b)
+{
+    int16_t res = (int16_t)a + (int16_t)b;
+    uint8_t round = get_round(env, res, 1);
+    res   = (res >> 1) + round;
+    return res;
+}
+static inline int16_t aadd16(CPURISCVState *env, int16_t a, int16_t b)
+{
+    int32_t res = (int32_t)a + (int32_t)b;
+    uint8_t round = get_round(env, res, 1);
+    res   = (res >> 1) + round;
+    return res;
+}
+static inline int32_t aadd32(CPURISCVState *env, int32_t a, int32_t b)
+{
+    int64_t res = (int64_t)a + (int64_t)b;
+    uint8_t round = get_round(env, res, 1);
+    res   = (res >> 1) + round;
+    return res;
+}
+static inline int64_t aadd64(CPURISCVState *env, int64_t a, int64_t b)
+{
+    int64_t res = (int64_t)a + (int64_t)b;
+    uint8_t round = get_round(env, res, 1); /* get_round only needs v[shift:0] */
+    if (((res ^ a) & (res ^ b)) >> 63 == -1LL) { /* overflow */
+        res = ((res >> 1) ^ INT64_MIN) + round;
+    } else {
+        res   = (res >> 1) + round;
+    }
+    return res;
+}
+RVVCALL(OPIVV2_ENV, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd8)
+RVVCALL(OPIVV2_ENV, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd16)
+RVVCALL(OPIVV2_ENV, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
+RVVCALL(OPIVV2_ENV, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
+GEN_VEXT_VV_ENV(vaadd_vv_b, 1, 1, clearb)
+GEN_VEXT_VV_ENV(vaadd_vv_h, 2, 2, clearh)
+GEN_VEXT_VV_ENV(vaadd_vv_w, 4, 4, clearl)
+GEN_VEXT_VV_ENV(vaadd_vv_d, 8, 8, clearq)
+
+RVVCALL(OPIVX2_ENV, vaadd_vx_b, OP_SSS_B, H1, H1, aadd8)
+RVVCALL(OPIVX2_ENV, vaadd_vx_h, OP_SSS_H, H2, H2, aadd16)
+RVVCALL(OPIVX2_ENV, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
+RVVCALL(OPIVX2_ENV, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
+GEN_VEXT_VX_ENV(vaadd_vx_b, 1, 1, clearb)
+GEN_VEXT_VX_ENV(vaadd_vx_h, 2, 2, clearh)
+GEN_VEXT_VX_ENV(vaadd_vx_w, 4, 4, clearl)
+GEN_VEXT_VX_ENV(vaadd_vx_d, 8, 8, clearq)
+
+static inline int8_t asub8(CPURISCVState *env, int8_t a, int8_t b)
+{
+    int16_t res = (int16_t)a - (int16_t)b;
+    uint8_t round = get_round(env, res, 1);
+    res   = (res >> 1) + round;
+    return res;
+}
+static inline int16_t asub16(CPURISCVState *env, int16_t a, int16_t b)
+{
+    int32_t res = (int32_t)a - (int32_t)b;
+    uint8_t round = get_round(env, res, 1);
+    res   = (res >> 1) + round;
+    return res;
+}
+static inline int32_t asub32(CPURISCVState *env, int32_t a, int32_t b)
+{
+    int64_t res = (int64_t)a - (int64_t)b;
+    uint8_t round = get_round(env, res, 1);
+    res   = (res >> 1) + round;
+    return res;
+}
+static inline int64_t asub64(CPURISCVState *env, int64_t a, int64_t b)
+{
+    int64_t res = (int64_t)a - (int64_t)b;
+    uint8_t round = get_round(env, res, 1); /* get_round only needs v[shift:0] */
+    if (((res ^ a) & (a ^ b)) >> 63 == -1LL) { /* overflow */
+        res = ((res >> 1) ^ INT64_MIN) + round;
+    } else {
+        res   = (res >> 1) + round;
+    }
+    return res;
+}
+RVVCALL(OPIVV2_ENV, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub8)
+RVVCALL(OPIVV2_ENV, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub16)
+RVVCALL(OPIVV2_ENV, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
+RVVCALL(OPIVV2_ENV, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
+GEN_VEXT_VV_ENV(vasub_vv_b, 1, 1, clearb)
+GEN_VEXT_VV_ENV(vasub_vv_h, 2, 2, clearh)
+GEN_VEXT_VV_ENV(vasub_vv_w, 4, 4, clearl)
+GEN_VEXT_VV_ENV(vasub_vv_d, 8, 8, clearq)
+
+RVVCALL(OPIVX2_ENV, vasub_vx_b, OP_SSS_B, H1, H1, asub8)
+RVVCALL(OPIVX2_ENV, vasub_vx_h, OP_SSS_H, H2, H2, asub16)
+RVVCALL(OPIVX2_ENV, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
+RVVCALL(OPIVX2_ENV, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
+GEN_VEXT_VX_ENV(vasub_vx_b, 1, 1, clearb)
+GEN_VEXT_VX_ENV(vasub_vx_h, 2, 2, clearh)
+GEN_VEXT_VX_ENV(vasub_vx_w, 4, 4, clearl)
+GEN_VEXT_VX_ENV(vasub_vx_d, 8, 8, clearq)