Message ID: 20200317150653.9008-26-zhiwei_liu@c-sky.com (mailing list archive)
State:      New, archived
Series:     target/riscv: support vector extension v0.7.1
On 2020/3/17 23:06, LIU Zhiwei wrote:
> Signed-off-by: LIU Zhiwei <zhiwei_liu@c-sky.com>
> ---
>  target/riscv/helper.h                   |  17 ++++
>  target/riscv/insn32.decode              |   5 ++
>  target/riscv/insn_trans/trans_rvv.inc.c |   7 ++
>  target/riscv/vector_helper.c            | 100 ++++++++++++++++++++++++
>  4 files changed, 129 insertions(+)
>
> [...]
>
> +static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
> +{
> +    int64_t res = (int64_t)a - b;
> +    uint8_t round = get_round(vxrm, res, 1);
> +
> +    return (res >> 1) + round;
> +}

I find a corner case here. As the spec says in Section 13.2,
"There can be no overflow in the result".

If a is 0x7fffffff, b is 0x80000000, and the rounding mode is
round-to-nearest-up (rnu), then the result is
(0x7fffffff - 0x80000000 + 1) >> 1, which equals 0x80000000, according to
v0.7.1:

    # Averaging subtract
    # result = (src1 - src2 + round) >> 1;

The result also overflows according to v0.8, where it is calculated as

    roundoff_signed(vs2[i] - vs1[i], 1)
    roundoff_signed(v, d) = (signed(v) >> d) + r

Should I do a saturation check here?

Zhiwei

> +static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
> +{
> +    int64_t res = (int64_t)a - b;
> +    uint8_t round = get_round(vxrm, res, 1);
> +    int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
> +
> +    /* With signed overflow, bit 64 is inverse of bit 63. */
> +    return ((res >> 1) ^ over) + round;
> +}
>
> [...]
On 3/18/20 8:46 PM, LIU Zhiwei wrote:
> +static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
> +{
> +    int64_t res = (int64_t)a - b;
> +    uint8_t round = get_round(vxrm, res, 1);
> +
> +    return (res >> 1) + round;
> +}
>
> I find a corner case here. As the spec says in Section 13.2,
> "There can be no overflow in the result".
>
> If a is 0x7fffffff, b is 0x80000000, and the rounding mode is
> round-to-nearest-up (rnu), then the result is
> (0x7fffffff - 0x80000000 + 1) >> 1, which equals 0x80000000, according to
> v0.7.1.

That's why we used int64_t as the intermediate type:

      0x000000007fffffff - 0xffffffff80000000 + 1
    = 0x000000007fffffff + 0x0000000080000000 + 1
    = 0x00000000ffffffff + 1
    = 0x0000000100000000

Shift that right by 1 and you do indeed get 0x80000000.
There's no saturation involved.

For int64_t we computed signed overflow to do the same thing.

r~
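As a quick check of the arithmetic above, a minimal standalone sketch (not
part of the patch; it hard-codes the rnu rounding increment instead of
calling get_round(), and the main() harness is hypothetical):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        int32_t a = INT32_MAX;        /* 0x7fffffff */
        int32_t b = INT32_MIN;        /* 0x80000000 */
        int64_t res = (int64_t)a - b; /* 0x00000000ffffffff: no wrap in 64 bits */
        int64_t round = res & 1;      /* rnu: the bit shifted out is 1 */

        printf("res = 0x%016" PRIx64 "\n", res);       /* 0x00000000ffffffff */
        printf("avg = 0x%08" PRIx32 "\n",
               (uint32_t)((res >> 1) + round));        /* 0x80000000 */
        return 0;
    }

The 64-bit intermediate result is 0x0000000080000000 after the shift and
round; truncating it back to 32 bits gives 0x80000000 without any saturation.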
On 2020/3/28 8:32, Richard Henderson wrote:
> On 3/18/20 8:46 PM, LIU Zhiwei wrote:
>> If a is 0x7fffffff, b is 0x80000000, and the rounding mode is
>> round-to-nearest-up (rnu), then the result is
>> (0x7fffffff - 0x80000000 + 1) >> 1, which equals 0x80000000, according to
>> v0.7.1.
>
> That's why we used int64_t as the intermediate type:
>
>       0x000000007fffffff - 0xffffffff80000000 + 1
>     = 0x000000007fffffff + 0x0000000080000000 + 1
>     = 0x00000000ffffffff + 1
>     = 0x0000000100000000
>
> Shift that right by 1 and you do indeed get 0x80000000.
> There's no saturation involved.

The minuend 0x7fffffff is INT32_MAX, and the subtrahend 0x80000000 is INT32_MIN.

The difference between the minuend and the subtrahend should be a positive
number, but the result here is 0x80000000. So it overflows. However, according
to the spec, it should not overflow.

I think special handling for (INT*_MAX - INT*_MIN) is needed.

Zhiwei

> For int64_t we computed signed overflow to do the same thing.
>
> r~
On 3/27/20 6:07 PM, LIU Zhiwei wrote:
> On 2020/3/28 8:32, Richard Henderson wrote:
>> Shift that right by 1 and you do indeed get 0x80000000.
>> There's no saturation involved.
>
> The minuend 0x7fffffff is INT32_MAX, and the subtrahend 0x80000000 is INT32_MIN.
>
> The difference between the minuend and the subtrahend should be a positive
> number, but the result here is 0x80000000. So it overflows. However,
> according to the spec, it should not overflow.

Unless I'm missing something, the spec is wrong about "there can be no
overflow", the above being a counter-example.

Do you have hardware to compare against? Perhaps it is in fact "overflow is
ignored", as the 0.8 spec says for vasubu?

I wouldn't add saturation, because the spec says nothing about saturation, and
does mention truncation, at least for vasubu.

r~
On 2020/3/28 9:22, Richard Henderson wrote:
> Unless I'm missing something, the spec is wrong about "there can be no
> overflow", the above being a counter-example.
>
> Do you have hardware to compare against? Perhaps it is in fact "overflow is
> ignored", as the 0.8 spec says for vasubu?

Agreed: the overflow is just ignored, and the code in the patch is OK as it
stands.

I discussed it with a hardware coworker and a software coworker. The hardware
coworker points out that it is an error in the spec. The software coworker
thinks saturating would make this instruction somewhat awkward, since the
shift and round are expected to protect the result from overflow. As with
vasubu, ignoring overflow is much better for vasub in this case.

Zhiwei

> I wouldn't add saturation, because the spec says nothing about saturation,
> and does mention truncation, at least for vasubu.
>
> r~
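To see the "overflow is ignored" behavior at the 64-bit boundary, here is a
standalone sketch (not part of the patch): asub64_rnu() is a hypothetical
rnu-only copy of the patch's asub64(). The wraparound is written with unsigned
arithmetic to stay well-defined; QEMU itself builds with -fwrapv, so the patch
can subtract the signed values directly. An arithmetic right shift of negative
values is assumed, as the patch does.

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    static int64_t asub64_rnu(int64_t a, int64_t b)
    {
        /* two's-complement wrap, written without signed overflow */
        int64_t res = (int64_t)((uint64_t)a - (uint64_t)b);
        int64_t round = res & 1;  /* rnu: the bit shifted out */
        int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;

        /* With signed overflow, bit 64 is the inverse of bit 63. */
        return ((res >> 1) ^ over) + round;
    }

    int main(void)
    {
        /* (INT64_MAX - INT64_MIN + 1) >> 1 is 2^63; truncated to 64 bits
         * and read as signed, that is INT64_MIN: the overflow is ignored. */
        printf("0x%016" PRIx64 "\n",
               (uint64_t)asub64_rnu(INT64_MAX, INT64_MIN));
        /* prints 0x8000000000000000 */
        return 0;
    }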
diff --git a/target/riscv/helper.h b/target/riscv/helper.h
index fd1c184852..311ce1322c 100644
--- a/target/riscv/helper.h
+++ b/target/riscv/helper.h
@@ -715,3 +715,20 @@ DEF_HELPER_6(vssub_vx_b, void, ptr, ptr, tl, ptr, env, i32)
 DEF_HELPER_6(vssub_vx_h, void, ptr, ptr, tl, ptr, env, i32)
 DEF_HELPER_6(vssub_vx_w, void, ptr, ptr, tl, ptr, env, i32)
 DEF_HELPER_6(vssub_vx_d, void, ptr, ptr, tl, ptr, env, i32)
+
+DEF_HELPER_6(vaadd_vv_b, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vaadd_vv_h, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vaadd_vv_w, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vaadd_vv_d, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vasub_vv_b, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vasub_vv_h, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vasub_vv_w, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vasub_vv_d, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vaadd_vx_b, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vaadd_vx_h, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vaadd_vx_w, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vaadd_vx_d, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vasub_vx_b, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vasub_vx_h, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vasub_vx_w, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vasub_vx_d, void, ptr, ptr, tl, ptr, env, i32)
diff --git a/target/riscv/insn32.decode b/target/riscv/insn32.decode
index c9a4050adc..e617d7bd60 100644
--- a/target/riscv/insn32.decode
+++ b/target/riscv/insn32.decode
@@ -417,6 +417,11 @@ vssubu_vv       100010 . ..... ..... 000 ..... 1010111 @r_vm
 vssubu_vx       100010 . ..... ..... 100 ..... 1010111 @r_vm
 vssub_vv        100011 . ..... ..... 000 ..... 1010111 @r_vm
 vssub_vx        100011 . ..... ..... 100 ..... 1010111 @r_vm
+vaadd_vv        100100 . ..... ..... 000 ..... 1010111 @r_vm
+vaadd_vx        100100 . ..... ..... 100 ..... 1010111 @r_vm
+vaadd_vi        100100 . ..... ..... 011 ..... 1010111 @r_vm
+vasub_vv        100110 . ..... ..... 000 ..... 1010111 @r_vm
+vasub_vx        100110 . ..... ..... 100 ..... 1010111 @r_vm
 
 vsetvli         0 ........... ..... 111 ..... 1010111 @r2_zimm
 vsetvl          1000000 ..... ..... 111 ..... 1010111 @r
diff --git a/target/riscv/insn_trans/trans_rvv.inc.c b/target/riscv/insn_trans/trans_rvv.inc.c
index dd1a508a51..ba2e7d56f4 100644
--- a/target/riscv/insn_trans/trans_rvv.inc.c
+++ b/target/riscv/insn_trans/trans_rvv.inc.c
@@ -1636,3 +1636,10 @@ GEN_OPIVX_TRANS(vssubu_vx, opivx_check)
 GEN_OPIVX_TRANS(vssub_vx, opivx_check)
 GEN_OPIVI_TRANS(vsaddu_vi, 1, vsaddu_vx, opivx_check)
 GEN_OPIVI_TRANS(vsadd_vi, 0, vsadd_vx, opivx_check)
+
+/* Vector Single-Width Averaging Add and Subtract */
+GEN_OPIVV_TRANS(vaadd_vv, opivv_check)
+GEN_OPIVV_TRANS(vasub_vv, opivv_check)
+GEN_OPIVX_TRANS(vaadd_vx, opivx_check)
+GEN_OPIVX_TRANS(vasub_vx, opivx_check)
+GEN_OPIVI_TRANS(vaadd_vi, 0, vaadd_vx, opivx_check)
diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index b17cac7fd4..984a8e260f 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -2488,3 +2488,103 @@ GEN_VEXT_VX_RM(vssub_vx_b, 1, 1, clearb)
 GEN_VEXT_VX_RM(vssub_vx_h, 2, 2, clearh)
 GEN_VEXT_VX_RM(vssub_vx_w, 4, 4, clearl)
 GEN_VEXT_VX_RM(vssub_vx_d, 8, 8, clearq)
+
+/* Vector Single-Width Averaging Add and Subtract */
+static inline uint8_t get_round(int vxrm, uint64_t v, uint8_t shift)
+{
+    uint8_t d = extract64(v, shift, 1);
+    uint8_t d1;
+    uint64_t D1, D2;
+
+    if (shift == 0 || shift > 64) {
+        return 0;
+    }
+
+    d1 = extract64(v, shift - 1, 1);
+    D1 = extract64(v, 0, shift);
+    if (vxrm == 0) { /* round-to-nearest-up (add +0.5 LSB) */
+        return d1;
+    } else if (vxrm == 1) { /* round-to-nearest-even */
+        if (shift > 1) {
+            D2 = extract64(v, 0, shift - 1);
+            return d1 & ((D2 != 0) | d);
+        } else {
+            return d1 & d;
+        }
+    } else if (vxrm == 3) { /* round-to-odd (OR bits into LSB, aka "jam") */
+        return !d & (D1 != 0);
+    }
+    return 0; /* round-down (truncate) */
+}
+
+static inline int32_t aadd32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
+{
+    int64_t res = (int64_t)a + b;
+    uint8_t round = get_round(vxrm, res, 1);
+
+    return (res >> 1) + round;
+}
+
+static inline int64_t aadd64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
+{
+    int64_t res = a + b;
+    uint8_t round = get_round(vxrm, res, 1);
+    int64_t over = (res ^ a) & (res ^ b) & INT64_MIN;
+
+    /* With signed overflow, bit 64 is inverse of bit 63. */
+    return ((res >> 1) ^ over) + round;
+}
+
+RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32)
+RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32)
+RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32)
+RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64)
+GEN_VEXT_VV_RM(vaadd_vv_b, 1, 1, clearb)
+GEN_VEXT_VV_RM(vaadd_vv_h, 2, 2, clearh)
+GEN_VEXT_VV_RM(vaadd_vv_w, 4, 4, clearl)
+GEN_VEXT_VV_RM(vaadd_vv_d, 8, 8, clearq)
+
+RVVCALL(OPIVX2_RM, vaadd_vx_b, OP_SSS_B, H1, H1, aadd32)
+RVVCALL(OPIVX2_RM, vaadd_vx_h, OP_SSS_H, H2, H2, aadd32)
+RVVCALL(OPIVX2_RM, vaadd_vx_w, OP_SSS_W, H4, H4, aadd32)
+RVVCALL(OPIVX2_RM, vaadd_vx_d, OP_SSS_D, H8, H8, aadd64)
+GEN_VEXT_VX_RM(vaadd_vx_b, 1, 1, clearb)
+GEN_VEXT_VX_RM(vaadd_vx_h, 2, 2, clearh)
+GEN_VEXT_VX_RM(vaadd_vx_w, 4, 4, clearl)
+GEN_VEXT_VX_RM(vaadd_vx_d, 8, 8, clearq)
+
+static inline int32_t asub32(CPURISCVState *env, int vxrm, int32_t a, int32_t b)
+{
+    int64_t res = (int64_t)a - b;
+    uint8_t round = get_round(vxrm, res, 1);
+
+    return (res >> 1) + round;
+}
+
+static inline int64_t asub64(CPURISCVState *env, int vxrm, int64_t a, int64_t b)
+{
+    int64_t res = (int64_t)a - b;
+    uint8_t round = get_round(vxrm, res, 1);
+    int64_t over = (res ^ a) & (a ^ b) & INT64_MIN;
+
+    /* With signed overflow, bit 64 is inverse of bit 63. */
+    return ((res >> 1) ^ over) + round;
+}
+
+RVVCALL(OPIVV2_RM, vasub_vv_b, OP_SSS_B, H1, H1, H1, asub32)
+RVVCALL(OPIVV2_RM, vasub_vv_h, OP_SSS_H, H2, H2, H2, asub32)
+RVVCALL(OPIVV2_RM, vasub_vv_w, OP_SSS_W, H4, H4, H4, asub32)
+RVVCALL(OPIVV2_RM, vasub_vv_d, OP_SSS_D, H8, H8, H8, asub64)
+GEN_VEXT_VV_RM(vasub_vv_b, 1, 1, clearb)
+GEN_VEXT_VV_RM(vasub_vv_h, 2, 2, clearh)
+GEN_VEXT_VV_RM(vasub_vv_w, 4, 4, clearl)
+GEN_VEXT_VV_RM(vasub_vv_d, 8, 8, clearq)
+
+RVVCALL(OPIVX2_RM, vasub_vx_b, OP_SSS_B, H1, H1, asub32)
+RVVCALL(OPIVX2_RM, vasub_vx_h, OP_SSS_H, H2, H2, asub32)
+RVVCALL(OPIVX2_RM, vasub_vx_w, OP_SSS_W, H4, H4, asub32)
+RVVCALL(OPIVX2_RM, vasub_vx_d, OP_SSS_D, H8, H8, asub64)
+GEN_VEXT_VX_RM(vasub_vx_b, 1, 1, clearb)
+GEN_VEXT_VX_RM(vasub_vx_h, 2, 2, clearh)
+GEN_VEXT_VX_RM(vasub_vx_w, 4, 4, clearl)
+GEN_VEXT_VX_RM(vasub_vx_d, 8, 8, clearq)
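As a cross-check of the rounding helper above, a standalone sketch (not part
of the patch): get_round1() is a hypothetical copy of get_round() specialized
to shift == 1, the only shift the averaging instructions use, with the four
vxrm encodings spelled out.

    #include <assert.h>
    #include <stdint.h>

    static uint8_t get_round1(int vxrm, uint64_t v)
    {
        uint8_t d  = (v >> 1) & 1; /* what the result's LSB will be */
        uint8_t d1 = v & 1;        /* the bit being shifted out */

        switch (vxrm) {
        case 0:  return d1;        /* rnu: round-to-nearest-up */
        case 1:  return d1 & d;    /* rne: ties go to the even result */
        case 3:  return !d & d1;   /* rod: jam a discarded 1 into an even LSB */
        default: return 0;         /* rdn: round down (truncate) */
        }
    }

    int main(void)
    {
        assert(get_round1(0, 1) == 1); /* rnu: 0.5 rounds up to 1 */
        assert(get_round1(1, 1) == 0); /* rne: 0.5 ties to even (0) */
        assert(get_round1(1, 3) == 1); /* rne: 1.5 ties to even (2) */
        assert(get_round1(3, 1) == 1); /* rod: 0.5 becomes the odd 1 */
        assert(get_round1(3, 3) == 0); /* rod: LSB already odd, unchanged */
        assert(get_round1(2, 1) == 0); /* rdn: 0.5 truncates to 0 */
        return 0;
    }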
Signed-off-by: LIU Zhiwei <zhiwei_liu@c-sky.com>
---
 target/riscv/helper.h                   |  17 ++++
 target/riscv/insn32.decode              |   5 ++
 target/riscv/insn_trans/trans_rvv.inc.c |   7 ++
 target/riscv/vector_helper.c            | 100 ++++++++++++++++++++++++
 4 files changed, 129 insertions(+)