| Message ID | 20200312145900.2054-25-zhiwei_liu@c-sky.com |
|---|---|
| State | New, archived |
| Series | target/riscv: support vector extension v0.7.1 |
On 3/12/20 7:58 AM, LIU Zhiwei wrote:
> +/* Vector Single-Width Averaging Add and Subtract */
> +static inline uint8_t get_round(CPURISCVState *env, uint64_t v, uint8_t shift)
> +{
> +    uint8_t d = extract64(v, shift, 1);
> +    uint8_t d1;
> +    uint64_t D1, D2;
> +    int mod = env->vxrm;
> +
> +    if (shift == 0 || shift > 64) {
> +        return 0;
> +    }
> +
> +    d1 = extract64(v, shift - 1, 1);
> +    D1 = extract64(v, 0, shift);
> +    if (mod == 0) { /* round-to-nearest-up (add +0.5 LSB) */
> +        return d1;
> +    } else if (mod == 1) { /* round-to-nearest-even */
> +        if (shift > 1) {
> +            D2 = extract64(v, 0, shift - 1);
> +            return d1 & ((D2 != 0) | d);
> +        } else {
> +            return d1 & d;
> +        }
> +    } else if (mod == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
> +        return !d & (D1 != 0);
> +    }
> +    return 0; /* round-down (truncate) */
> +}
> +
> +static inline int8_t aadd8(CPURISCVState *env, int8_t a, int8_t b)
> +{
> +    int16_t res = (int16_t)a + (int16_t)b;
> +    uint8_t round = get_round(env, res, 1);
> +    res = (res >> 1) + round;
> +    return res;
> +}

I think this is a suboptimal way to arrange things.  It leaves the vxrm lookup
inside of the main loop, while it is obviously loop invariant.

I think you should have 4 versions of aadd8, for each of the rounding modes,

> +RVVCALL(OPIVV2_ENV, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd8)

then use this, or something like it, to define 4 functions containing main
loops, which will get the helper above inlined.

Then use a final outermost wrapper to select one of the 4 functions based on
env->vxrm.

r~
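(For reference, a self-contained sketch of the four vxrm rounding increments
under discussion, reduced to the 1-bit-shift case used by the averaging
instructions.  get_round_1 and the test values are illustrative only; they are
not part of the patch.)

#include <stdint.h>
#include <stdio.h>

/* Rounding increment for a 1-bit right shift, one case per vxrm value. */
static uint8_t get_round_1(int vxrm, int64_t v)
{
    uint8_t d  = (v >> 1) & 1;   /* future LSB of the shifted result */
    uint8_t d1 = v & 1;          /* bit that will be shifted out */

    switch (vxrm) {
    case 0:  return d1;          /* rnu: round-to-nearest-up   */
    case 1:  return d1 & d;      /* rne: round-to-nearest-even */
    case 2:  return 0;           /* rdn: truncate              */
    default: return !d & d1;     /* rod: round-to-odd          */
    }
}

int main(void)
{
    for (int vxrm = 0; vxrm < 4; vxrm++) {
        int8_t a = 3, b = 4;                 /* sum = 7, exact average = 3.5 */
        int16_t res = (int16_t)a + b;
        printf("vxrm=%d -> avg=%d\n", vxrm,
               (res >> 1) + get_round_1(vxrm, res));
        /* prints 4, 4, 3, 3 for vxrm = 0, 1, 2, 3 respectively */
    }
    return 0;
}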
On 3/14/20 1:14 AM, Richard Henderson wrote:
> I think you should have 4 versions of aadd8, for each of the rounding modes,
>
>> +RVVCALL(OPIVV2_ENV, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd8)
>
> then use this, or something like it, to define 4 functions containing main
> loops, which will get the helper above inlined.

Alternately, a set of inlines, where a (constant) vxrm is passed down from above.

> Then use a final outermost wrapper to select one of the 4 functions based on
> env->vxrm.

The outermost wrapper could look like

    switch (env->vxrm) {
    case 0:  somefunc(some, args, 0); break;
    case 1:  somefunc(some, args, 1); break;
    case 2:  somefunc(some, args, 2); break;
    default: somefunc(some, args, 3); break;
    }

so that somefunc (and its subroutines) are expanded with a constant, and we
switch on that constant at the outermost level.

r~
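(For reference, a compile-ready sketch of the dispatch pattern described above,
written in plain C outside of QEMU.  avg_one, avg_loop and avg_dispatch are
placeholder names, not from the patch.)

#include <stdint.h>

/* Per-element helper; with a constant vxrm the chain of comparisons folds
 * away at compile time. */
static inline int8_t avg_one(int vxrm, int8_t a, int8_t b)
{
    int16_t res = (int16_t)a + b;
    uint8_t round = (vxrm == 0) ? (res & 1)          /* rnu */
                  : (vxrm == 1) ? ((res & 3) == 3)   /* rne */
                  : (vxrm == 3) ? ((res & 3) == 1)   /* rod */
                  : 0;                               /* rdn */
    return (res >> 1) + round;
}

/* Main loop, marked inline so each call site below is specialized. */
static inline void avg_loop(int8_t *d, const int8_t *a, const int8_t *b,
                            int n, int vxrm)
{
    for (int i = 0; i < n; i++) {
        d[i] = avg_one(vxrm, a[i], b[i]);
    }
}

/* Outermost wrapper: switch once, then run a loop built with a constant. */
void avg_dispatch(int8_t *d, const int8_t *a, const int8_t *b,
                  int n, int vxrm)
{
    switch (vxrm) {
    case 0:  avg_loop(d, a, b, n, 0); break;
    case 1:  avg_loop(d, a, b, n, 1); break;
    case 2:  avg_loop(d, a, b, n, 2); break;
    default: avg_loop(d, a, b, n, 3); break;
    }
}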
On 2020/3/14 16:25, Richard Henderson wrote:
> On 3/14/20 1:14 AM, Richard Henderson wrote:
>> I think you should have 4 versions of aadd8, for each of the rounding modes,
>>
>>> +RVVCALL(OPIVV2_ENV, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd8)
>> then use this, or something like it, to define 4 functions containing main
>> loops, which will get the helper above inlined.
> Alternately, a set of inlines, where a (constant) vxrm is passed down from above.

I am not sure whether I get it. In my opinion, the code should be modified like

static inline int8_t aadd8_rnu(CPURISCVState *env, int8_t a, int8_t b)
{
    int16_t res = (int16_t)a + (int16_t)b;
    uint8_t round = res & 0x1;
    res = (res >> 1) + round;
    return res;
}

static inline int8_t aadd8_rne(CPURISCVState *env, int8_t a, int8_t b)
{
    int16_t res = (int16_t)a + (int16_t)b;
    uint8_t round = ((res & 0x3) == 0x3);
    res = (res >> 1) + round;
    return res;
}

static inline int8_t aadd8_rdn(CPURISCVState *env, int8_t a, int8_t b)
{
    int16_t res = (int16_t)a + (int16_t)b;
    res = (res >> 1);
    return res;
}

static inline int8_t aadd8_rod(CPURISCVState *env, int8_t a, int8_t b)
{
    int16_t res = (int16_t)a + (int16_t)b;
    uint8_t round = ((res & 0x3) == 0x1);
    res = (res >> 1) + round;
    return res;
}

RVVCALL(OPIVV2_ENV, vaadd_vv_b_rnu, OP_SSS_B, H1, H1, H1, aadd8_rnu)
RVVCALL(OPIVV2_ENV, vaadd_vv_b_rne, OP_SSS_B, H1, H1, H1, aadd8_rne)
RVVCALL(OPIVV2_ENV, vaadd_vv_b_rdn, OP_SSS_B, H1, H1, H1, aadd8_rdn)
RVVCALL(OPIVV2_ENV, vaadd_vv_b_rod, OP_SSS_B, H1, H1, H1, aadd8_rod)

void do_vext_vv_env(void *vd, void *v0, void *vs1,
                    void *vs2, CPURISCVState *env, uint32_t desc,
                    uint32_t esz, uint32_t dsz,
                    opivv2_fn *fn, clear_fn *clearfn)
{
    uint32_t vlmax = vext_maxsz(desc) / esz;
    uint32_t mlen = vext_mlen(desc);
    uint32_t vm = vext_vm(desc);
    uint32_t vl = env->vl;
    uint32_t i;

    for (i = 0; i < vl; i++) {
        if (!vm && !vext_elem_mask(v0, mlen, i)) {
            continue;
        }
        fn(vd, vs1, vs2, i, env);
    }
    if (i != 0) {
        clear_fn(vd, vl, vl * dsz, vlmax * dsz);
    }
}

#define GEN_VEXT_VV_ENV(NAME, ESZ, DSZ, CLEAR_FN)              \
void HELPER(NAME)(void *vd, void *v0, void *vs1,               \
                  void *vs2, CPURISCVState *env,               \
                  uint32_t desc)                               \
{                                                              \
    static opivv2_fn *fns[4] = {                               \
        NAME##_rnu, NAME##_rne,                                \
        NAME##_rdn, NAME##_rod                                 \
    }                                                          \
    return do_vext_vv_env(vd, v0, vs1, vs2, env, desc,         \
                          ESZ, DSZ, fns[env->vxrm],            \
                          CLEAR_FN);                           \
}

Is it true?

Zhiwei

>> Then use a final outermost wrapper to select one of the 4 functions based on
>> env->vxrm.
> The outermost wrapper could look like
>
>     switch (env->vxrm) {
>     case 0:  somefunc(some, args, 0); break;
>     case 1:  somefunc(some, args, 1); break;
>     case 2:  somefunc(some, args, 2); break;
>     default: somefunc(some, args, 3); break;
>     }
>
> so that somefunc (and its subroutines) are expanded with a constant, and we
> switch on that constant at the outermost level.
>
>
> r~
On 3/14/20 4:12 PM, LIU Zhiwei wrote:
> I am not sure whether I get it. In my opinion, the code should be modified like
> [...]
> Is it true?

While that does look good for this case, there are many other uses of
get_round(), and it may not be quite as simple there.

My suggestion was

static inline int32_t aadd32(int vxrm, int32_t a, int32_t b)
{
    int64_t res = (int64_t)a + b;
    uint8_t round = get_round(vxrm, res, 1);

    return (res >> 1) + round;
}

static inline int64_t aadd64(int vxrm, int64_t a, int64_t b)
{
    int64_t res = a + b;
    uint8_t round = get_round(vxrm, res, 1);
    int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;

    /* With signed overflow, bit 64 is inverse of bit 63. */
    return ((res >> 1) ^ over) + round;
}

RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)

static inline void
vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2,
             uint32_t vl, uint32_t vm, uint32_t mlen, int vxrm,
             opivv2_rm_fn *fn)
{
    for (uint32_t i = 0; i < vl; i++) {
        if (!vm && !vext_elem_mask(v0, mlen, i)) {
            continue;
        }
        fn(vd, vs1, vs2, i, vxrm);
    }
}

static inline void
vext_vv_rm_2(void *vd, void *v0, void *vs1,
             void *vs2, CPURISCVState *env, uint32_t desc,
             uint32_t esz, uint32_t dsz,
             opivv2_rm_fn *fn, clear_fn *clearfn)
{
    uint32_t vlmax = vext_maxsz(desc) / esz;
    uint32_t mlen = vext_mlen(desc);
    uint32_t vm = vext_vm(desc);
    uint32_t vl = env->vl;

    if (vl == 0) {
        return;
    }

    switch (env->vxrm) {
    case 0: /* rnu */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     vl, vm, mlen, 0, fn);
        break;
    case 1: /* rne */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     vl, vm, mlen, 1, fn);
        break;
    case 2: /* rdn */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     vl, vm, mlen, 2, fn);
        break;
    default: /* rod */
        vext_vv_rm_1(vd, v0, vs1, vs2,
                     vl, vm, mlen, 3, fn);
        break;
    }

    clear_fn(vd, vl, vl * dsz, vlmax * dsz);
}

From vext_vv_rm_2, a constant is passed down all of the inline functions, so
that a constant arrives in get_round() at the bottom of the call chain.  At
which point all of the expressions get folded by the compiler and we *should*
get very similar generated code as to what you have above.

r~
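(For reference, a small self-contained check of the signed-overflow trick in
aadd64 above.  aadd64_check and the test values are illustrative; the rounding
increment is omitted, so only the truncating/rdn case is shown.)

#include <assert.h>
#include <stdint.h>

/* With two's-complement wrap-around, the true bit 64 of the sum is the
 * inverse of bit 63 of the wrapped result exactly when signed overflow
 * occurred, so flipping bit 63 of (res >> 1) reconstructs the average. */
static int64_t aadd64_check(int64_t a, int64_t b)
{
    /* add in unsigned arithmetic so the wrap-around is well defined */
    int64_t res = (int64_t)((uint64_t)a + (uint64_t)b);
    int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;  /* overflow in bit 63 */
    return (res >> 1) ^ over;                          /* truncating average */
}

int main(void)
{
    /* INT64_MAX + INT64_MAX wraps, but the average is still INT64_MAX. */
    assert(aadd64_check(INT64_MAX, INT64_MAX) == INT64_MAX);
    /* INT64_MIN + INT64_MIN also wraps; the average is INT64_MIN. */
    assert(aadd64_check(INT64_MIN, INT64_MIN) == INT64_MIN);
    /* Without overflow it behaves like a plain (a + b) >> 1. */
    assert(aadd64_check(10, 4) == 7);
    return 0;
}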
On 2020/3/15 9:00, Richard Henderson wrote:
> On 3/14/20 4:12 PM, LIU Zhiwei wrote:
>> I am not sure whether I get it. In my opinion, the code should be modified like
>> [...]
>> Is it true?
> While that does look good for this case, there are many other uses of
> get_round(), and it may not be quite as simple there.
>
> My suggestion was
> [...]
> From vext_vv_rm_2, a constant is passed down all of the inline functions, so
> that a constant arrives in get_round() at the bottom of the call chain.  At
> which point all of the expressions get folded by the compiler and we *should*
> get very similar generated code as to what you have above.

Yes, it will be much better.

I still have one question here.

Many other fixed point instructions also need vxsat besides vxsrm.

In that cases, can I just define OPIVV2_RM like this:

#define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP) \
static inline void                                              \
do_##NAME(void *vd, void *vs1, void *vs2, int i,                \
          CPURISCVState *env, int vxrm)                         \
{                                                               \
    TX1 s1 = *((T1 *)vs1 + HS1(i));                             \
    TX2 s2 = *((T2 *)vs2 + HS2(i));                             \
    *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                \
}

static inline int32_t aadd32(__attribute__((unused)) CPURISCVState *env,
                             int vxrm, int32_t a, int32_t b)
{
    int64_t res = (int64_t)a + b;
    uint8_t round = get_round(vxrm, res, 1);

    return (res >> 1) + round;
}

In this way, I can write just one OPIVV2_RM instead of (OPIVV2_RM,
OPIVV2_RM_ENV, OPIVV2_ENV).

Zhiwei

> r~
On 3/15/20 4:23 PM, LIU Zhiwei wrote:
> Many other fixed point instructions also need vxsat besides vxsrm.

Ah yes.

> In that cases, can I just define OPIVV2_RM like this:
>
> #define OPIVV2_RM(NAME, TD, T1, T2, TX1, TX2, HD, HS1, HS2, OP) \
> static inline void                                              \
> do_##NAME(void *vd, void *vs1, void *vs2, int i,                \
>           CPURISCVState *env, int vxrm)                         \
> {                                                               \
>     TX1 s1 = *((T1 *)vs1 + HS1(i));                             \
>     TX2 s2 = *((T2 *)vs2 + HS2(i));                             \
>     *((TD *)vd + HD(i)) = OP(env, vxrm, s2, s1);                \
> }
>
> static inline int32_t aadd32(__attribute__((unused)) CPURISCVState *env,
>                              int vxrm, int32_t a, int32_t b)

You can drop the unused.  We don't turn on warnings for unused arguments, as we
have a *lot* of them for exactly this reason -- keeping a common functional
interface.

> {
>     int64_t res = (int64_t)a + b;
>     uint8_t round = get_round(vxrm, res, 1);
>
>     return (res >> 1) + round;
> }
>
> In this way, I can write just one OPIVV2_RM instead of (OPIVV2_RM,
> OPIVV2_RM_ENV, OPIVV2_ENV).

Yes, that's fine.

r~
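(For reference, a sketch of how a saturating helper can share the same
(env, vxrm, operands) interface, so the single OPIVV2_RM macro above can also
generate the vxsat-setting instructions.  saddu8 is illustrative, not from this
patch, and it assumes the vxsat field this series adds to CPURISCVState.)

/* Unsigned saturating add, written against the common interface; the rounding
 * mode is simply unused here, and env is used only to record saturation. */
static inline uint8_t saddu8(CPURISCVState *env, int vxrm, uint8_t a, uint8_t b)
{
    uint8_t res = a + b;
    if (res < a) {              /* unsigned overflow: clamp and set vxsat */
        res = UINT8_MAX;
        env->vxsat = 0x1;
    }
    return res;
}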
diff --git a/target/riscv/helper.h b/target/riscv/helper.h
index 95da00d365..d3837d2ca4 100644
--- a/target/riscv/helper.h
+++ b/target/riscv/helper.h
@@ -707,3 +707,20 @@ DEF_HELPER_6(vssub_vx_b, void, ptr, ptr, tl, ptr, env, i32)
 DEF_HELPER_6(vssub_vx_h, void, ptr, ptr, tl, ptr, env, i32)
 DEF_HELPER_6(vssub_vx_w, void, ptr, ptr, tl, ptr, env, i32)
 DEF_HELPER_6(vssub_vx_d, void, ptr, ptr, tl, ptr, env, i32)
+
+DEF_HELPER_6(vaadd_vv_b, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vaadd_vv_h, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vaadd_vv_w, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vaadd_vv_d, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vasub_vv_b, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vasub_vv_h, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vasub_vv_w, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vasub_vv_d, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vaadd_vx_b, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vaadd_vx_h, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vaadd_vx_w, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vaadd_vx_d, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vasub_vx_b, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vasub_vx_h, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vasub_vx_w, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vasub_vx_d, void, ptr, ptr, tl, ptr, env, i32)
diff --git a/target/riscv/insn32.decode b/target/riscv/insn32.decode
index 44baadf582..0227a16b16 100644
--- a/target/riscv/insn32.decode
+++ b/target/riscv/insn32.decode
@@ -412,6 +412,11 @@
 vssubu_vv       100010 . ..... ..... 000 ..... 1010111 @r_vm
 vssubu_vx       100010 . ..... ..... 100 ..... 1010111 @r_vm
 vssub_vv        100011 . ..... ..... 000 ..... 1010111 @r_vm
 vssub_vx        100011 . ..... ..... 100 ..... 1010111 @r_vm
+vaadd_vv        100100 . ..... ..... 000 ..... 1010111 @r_vm
+vaadd_vx        100100 . ..... ..... 100 ..... 1010111 @r_vm
+vaadd_vi        100100 . ..... ..... 011 ..... 1010111 @r_vm
+vasub_vv        100110 . ..... ..... 000 ..... 1010111 @r_vm
+vasub_vx        100110 . ..... ..... 100 ..... 1010111 @r_vm
 vsetvli         0 ........... ..... 111 ..... 1010111  @r2_zimm
 vsetvl          1000000 ..... ..... 111 ..... 1010111  @r
diff --git a/target/riscv/insn_trans/trans_rvv.inc.c b/target/riscv/insn_trans/trans_rvv.inc.c
index ad55766b98..9988fad2fe 100644
--- a/target/riscv/insn_trans/trans_rvv.inc.c
+++ b/target/riscv/insn_trans/trans_rvv.inc.c
@@ -1521,3 +1521,10 @@ GEN_OPIVX_TRANS(vssubu_vx, opivx_check)
 GEN_OPIVX_TRANS(vssub_vx,  opivx_check)
 GEN_OPIVI_TRANS(vsaddu_vi, 1, vsaddu_vx, opivx_check)
 GEN_OPIVI_TRANS(vsadd_vi, 0, vsadd_vx, opivx_check)
+
+/* Vector Single-Width Averaging Add and Subtract */
+GEN_OPIVV_TRANS(vaadd_vv, opivv_check)
+GEN_OPIVV_TRANS(vasub_vv, opivv_check)
+GEN_OPIVX_TRANS(vaadd_vx, opivx_check)
+GEN_OPIVX_TRANS(vasub_vx, opivx_check)
+GEN_OPIVI_TRANS(vaadd_vi, 0, vaadd_vx, opivx_check)
diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index c7b8c1bff4..b0a7a3b6e4 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -2291,3 +2291,132 @@ GEN_VEXT_VX_ENV(vssub_vx_b, 1, 1, clearb)
 GEN_VEXT_VX_ENV(vssub_vx_h, 2, 2, clearh)
 GEN_VEXT_VX_ENV(vssub_vx_w, 4, 4, clearl)
 GEN_VEXT_VX_ENV(vssub_vx_d, 8, 8, clearq)
+
+/* Vector Single-Width Averaging Add and Subtract */
+static inline uint8_t get_round(CPURISCVState *env, uint64_t v, uint8_t shift)
+{
+    uint8_t d = extract64(v, shift, 1);
+    uint8_t d1;
+    uint64_t D1, D2;
+    int mod = env->vxrm;
+
+    if (shift == 0 || shift > 64) {
+        return 0;
+    }
+
+    d1 = extract64(v, shift - 1, 1);
+    D1 = extract64(v, 0, shift);
+    if (mod == 0) { /* round-to-nearest-up (add +0.5 LSB) */
+        return d1;
+    } else if (mod == 1) { /* round-to-nearest-even */
+        if (shift > 1) {
+            D2 = extract64(v, 0, shift - 1);
+            return d1 & ((D2 != 0) | d);
+        } else {
+            return d1 & d;
+        }
+    } else if (mod == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
+        return !d & (D1 != 0);
+    }
+    return 0; /* round-down (truncate) */
+}
+
+static inline int8_t aadd8(CPURISCVState *env, int8_t a, int8_t b)
+{
+    int16_t res = (int16_t)a + (int16_t)b;
+    uint8_t round = get_round(env, res, 1);
+    res = (res >> 1) + round;
+    return res;
+}
+static inline int16_t aadd16(CPURISCVState *env, int16_t a, int16_t b)
+{
+    int32_t res = (int32_t)a + (int32_t)b;
+    uint8_t round = get_round(env, res, 1);
+    res = (res >> 1) + round;
+    return res;
+}
+static inline int32_t aadd32(CPURISCVState *env, int32_t a, int32_t b)
+{
+    int64_t res = (int64_t)a + (int64_t)b;
+    uint8_t round = get_round(env, res, 1);
+    res = (res >> 1) + round;
+    return res;
+}
+static inline int64_t aadd64(CPURISCVState *env, int64_t a, int64_t b)
+{
+    int64_t res = (int64_t)a + (int64_t)b;
+    uint8_t round = get_round(env, res, 1); /* get_round only need v[d : 0] */
+    if (((res ^ a) & (res ^ b)) >> 63 == -1LL) { /* overflow */
+        res = ((res >> 1) ^ INT64_MIN) + round;
+    } else {
+        res = (res >> 1) + round;
+    }
+    return res;
+}
+RVVCALL(OPIVV2_ENV, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd8)
+RVVCALL(OPIVV2_ENV, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd16)
+RVVCALL(OPIVV2_ENV, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
+RVVCALL(OPIVV2_ENV, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
+GEN_VEXT_VV_ENV(vaadd_vv_b, 1, 1, clearb)
+GEN_VEXT_VV_ENV(vaadd_vv_h, 2, 2, clearh)
+GEN_VEXT_VV_ENV(vaadd_vv_w, 4, 4, clearl)
+GEN_VEXT_VV_ENV(vaadd_vv_d, 8, 8, clearq)
+
+RVVCALL(OPIVX2_ENV, vaadd_vx_b, OP_SSS_B, H1, H1, aadd8)
+RVVCALL(OPIVX2_ENV, vaadd_vx_h, OP_SSS_H, H2, H2, aadd16)
+RVVCALL(OPIVX2_ENV, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
+RVVCALL(OPIVX2_ENV, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
+GEN_VEXT_VX_ENV(vaadd_vx_b, 1, 1, clearb)
+GEN_VEXT_VX_ENV(vaadd_vx_h, 2, 2, clearh)
+GEN_VEXT_VX_ENV(vaadd_vx_w, 4, 4, clearl)
+GEN_VEXT_VX_ENV(vaadd_vx_d, 8, 8, clearq)
+
+static inline int8_t asub8(CPURISCVState *env, int8_t a, int8_t b)
+{
+    int16_t res = (int16_t)a - (int16_t)b;
+    uint8_t round = get_round(env, res, 1);
+    res = (res >> 1) + round;
+    return res;
+}
+static inline int16_t asub16(CPURISCVState *env, int16_t a, int16_t b)
+{
+    int32_t res = (int32_t)a - (int32_t)b;
+    uint8_t round = get_round(env, res, 1);
+    res = (res >> 1) + round;
+    return res;
+}
+static inline int32_t asub32(CPURISCVState *env, int32_t a, int32_t b)
+{
+    int64_t res = (int64_t)a - (int64_t)b;
+    uint8_t round = get_round(env, res, 1);
+    res = (res >> 1) + round;
+    return res;
+}
+static inline int64_t asub64(CPURISCVState *env, int64_t a, int64_t b)
+{
+    int64_t res = (int64_t)a - (int64_t)b;
+    uint8_t round = get_round(env, res, 1); /* get_round only need v[d : 0] */
+    if (((res ^ a) & (a ^ b)) >> 63 == -1LL) { /* overflow */
+        res = ((res >> 1) ^ INT64_MIN) + round;
+    } else {
+        res = (res >> 1) + round;
+    }
+    return res;
+}
+RVVCALL(OPIVV2_ENV, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub8)
+RVVCALL(OPIVV2_ENV, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub16)
+RVVCALL(OPIVV2_ENV, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
+RVVCALL(OPIVV2_ENV, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
+GEN_VEXT_VV_ENV(vasub_vv_b, 1, 1, clearb)
+GEN_VEXT_VV_ENV(vasub_vv_h, 2, 2, clearh)
+GEN_VEXT_VV_ENV(vasub_vv_w, 4, 4, clearl)
+GEN_VEXT_VV_ENV(vasub_vv_d, 8, 8, clearq)
+
+RVVCALL(OPIVX2_ENV, vasub_vx_b, OP_SSS_B, H1, H1, asub8)
+RVVCALL(OPIVX2_ENV, vasub_vx_h, OP_SSS_H, H2, H2, asub16)
+RVVCALL(OPIVX2_ENV, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
+RVVCALL(OPIVX2_ENV, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
+GEN_VEXT_VX_ENV(vasub_vx_b, 1, 1, clearb)
+GEN_VEXT_VX_ENV(vasub_vx_h, 2, 2, clearh)
+GEN_VEXT_VX_ENV(vasub_vx_w, 4, 4, clearl)
+GEN_VEXT_VX_ENV(vasub_vx_d, 8, 8, clearq)
Signed-off-by: LIU Zhiwei <zhiwei_liu@c-sky.com>
---
 target/riscv/helper.h                   |  17 ++++
 target/riscv/insn32.decode              |   5 +
 target/riscv/insn_trans/trans_rvv.inc.c |   7 ++
 target/riscv/vector_helper.c            | 129 ++++++++++++++++++++++++
 4 files changed, 158 insertions(+)