Message ID | 20240813113436.831-6-zhiwei_liu@linux.alibaba.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | tcg/riscv: Add support for vector | expand |
On 8/13/24 21:34, LIU Zhiwei wrote: > From: TANG Tiancheng <tangtiancheng.ttc@alibaba-inc.com> > > In RISC-V, vector operations require initial configuration using > the vset{i}vl{i} instruction. > > This instruction: > 1. Sets the vector length (vl) in bytes > 2. Configures the vtype register, which includes: > SEW (Single Element Width) > LMUL (vector register group multiplier) > Other vector operation parameters > > This configuration is crucial for defining subsequent vector > operation behavior. To optimize performance, the configuration > process is managed dynamically: > 1. Reconfiguration using vset{i}vl{i} is necessary when SEW > or vector register group width changes. > 2. The vset instruction can be omitted when configuration > remains unchanged. > > This optimization is only effective within a single TB. > Each TB requires reconfiguration at its start, as the current > state cannot be obtained from hardware. > > Signed-off-by: TANG Tiancheng <tangtiancheng.ttc@alibaba-inc.com> > Signed-off-by: Weiwei Li <liwei1518@gmail.com> > Reviewed-by: Liu Zhiwei <zhiwei_liu@linux.alibaba.com> > --- > tcg/riscv/tcg-target.c.inc | 121 +++++++++++++++++++++++++++++++++++++ > 1 file changed, 121 insertions(+) > > diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc > index ca9bafcb3c..d17f523187 100644 > --- a/tcg/riscv/tcg-target.c.inc > +++ b/tcg/riscv/tcg-target.c.inc > @@ -167,6 +167,18 @@ static bool tcg_target_const_match(int64_t val, int ct, > * RISC-V Base ISA opcodes (IM) > */ > > +#define V_OPIVV (0x0 << 12) > +#define V_OPFVV (0x1 << 12) > +#define V_OPMVV (0x2 << 12) > +#define V_OPIVI (0x3 << 12) > +#define V_OPIVX (0x4 << 12) > +#define V_OPFVF (0x5 << 12) > +#define V_OPMVX (0x6 << 12) > +#define V_OPCFG (0x7 << 12) > + > +#define V_SUMOP (0x0 << 20) > +#define V_LUMOP (0x0 << 20) > + > typedef enum { > OPC_ADD = 0x33, > OPC_ADDI = 0x13, > @@ -262,6 +274,11 @@ typedef enum { > /* Zicond: integer conditional operations */ > OPC_CZERO_EQZ = 
0x0e005033, > OPC_CZERO_NEZ = 0x0e007033, > + > + /* V: Vector extension 1.0 */ > + OPC_VSETVLI = 0x57 | V_OPCFG, > + OPC_VSETIVLI = 0xc0000057 | V_OPCFG, > + OPC_VSETVL = 0x80000057 | V_OPCFG, > } RISCVInsn; > > /* > @@ -354,6 +371,42 @@ static int32_t encode_uj(RISCVInsn opc, TCGReg rd, uint32_t imm) > return opc | (rd & 0x1f) << 7 | encode_ujimm20(imm); > } > > +typedef enum { > + VTA_TU = 0, > + VTA_TA, > +} RISCVVta; > + > +typedef enum { > + VMA_MU = 0, > + VMA_MA, > +} RISCVVma; > + > +typedef enum { > + VSEW_E8 = 0, /* EW=8b */ > + VSEW_E16, /* EW=16b */ > + VSEW_E32, /* EW=32b */ > + VSEW_E64, /* EW=64b */ > +} RISCVVsew; This exactly aligns with MemOp and vece. Do we really need an enum for this? > + > +typedef enum { > + VLMUL_M1 = 0, /* LMUL=1 */ > + VLMUL_M2, /* LMUL=2 */ > + VLMUL_M4, /* LMUL=4 */ > + VLMUL_M8, /* LMUL=8 */ > + VLMUL_RESERVED, > + VLMUL_MF8, /* LMUL=1/8 */ > + VLMUL_MF4, /* LMUL=1/4 */ > + VLMUL_MF2, /* LMUL=1/2 */ > +} RISCVVlmul; > +#define LMUL_MAX 8 > + > +static int32_t encode_vtype(RISCVVta vta, RISCVVma vma, > + RISCVVsew vsew, RISCVVlmul vlmul) > +{ > + return (vma & 0x1) << 7 | (vta & 0x1) << 6 | (vsew & 0x7) << 3 | > + (vlmul & 0x7); > +} > +static void tcg_out_vsetvl(TCGContext *s, uint32_t avl, RISCVVta vta, > + RISCVVma vma, RISCVVsew vsew, > + RISCVVlmul vlmul) > +{ > + int32_t vtypei = encode_vtype(vta, vma, vsew, vlmul); > + > + if (avl < 32) { > + tcg_out_opc_vconfig(s, OPC_VSETIVLI, TCG_REG_ZERO, avl, vtypei); > + } else { > + tcg_out_opc_imm(s, OPC_ADDI, TCG_REG_TMP0, TCG_REG_ZERO, avl); > + tcg_out_opc_vconfig(s, OPC_VSETVLI, TCG_REG_ZERO, TCG_REG_TMP0, vtypei); > + } > +} > + > +/* > + * TODO: If the vtype value is not supported by the implementation, > + * then the vill bit is set in vtype, the remaining bits in > + * vtype are set to zero, and the vl register is also set to zero > + */ > + > +static __thread unsigned prev_size; > +static __thread unsigned prev_vece = MO_8; > +static __thread bool vec_vtpye_init 
= true; Typo in vtpye. That said, init should be redundant. I think you only need one variable here: static __thread int prev_vtype; Since any vtype < 0 is vill, the "uninitialized" value is easily -1. > +static inline void tcg_target_set_vec_config(TCGContext *s, TCGType type, > + unsigned vece) > +{ > + unsigned oprsz = get_vec_type_bytes(type); > + > + if (!vec_vtpye_init && (prev_size == oprsz && prev_vece == vece)) { > + return ; > + } int vtype = encode_vtype(...); if (vtype != prev_vtype) { prev_vtype = vtype; tcg_out_vsetvl(s, vtype); } > @@ -1914,6 +2029,11 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, > const TCGArg args[TCG_MAX_OP_ARGS], > const int const_args[TCG_MAX_OP_ARGS]) > { > + TCGType type = vecl + TCG_TYPE_V64; > + > + if (vec_vtpye_init) { > + tcg_target_set_vec_config(s, type, vece); > + } Here is perhaps too early... see patch 8 re logicals. r~
On 2024/8/14 16:24, Richard Henderson wrote: > On 8/13/24 21:34, LIU Zhiwei wrote: >> From: TANG Tiancheng <tangtiancheng.ttc@alibaba-inc.com> >> >> In RISC-V, vector operations require initial configuration using >> the vset{i}vl{i} instruction. >> >> This instruction: >> 1. Sets the vector length (vl) in bytes >> 2. Configures the vtype register, which includes: >> SEW (Single Element Width) >> LMUL (vector register group multiplier) >> Other vector operation parameters >> >> This configuration is crucial for defining subsequent vector >> operation behavior. To optimize performance, the configuration >> process is managed dynamically: >> 1. Reconfiguration using vset{i}vl{i} is necessary when SEW >> or vector register group width changes. >> 2. The vset instruction can be omitted when configuration >> remains unchanged. >> >> This optimization is only effective within a single TB. >> Each TB requires reconfiguration at its start, as the current >> state cannot be obtained from hardware. 
>> >> Signed-off-by: TANG Tiancheng <tangtiancheng.ttc@alibaba-inc.com> >> Signed-off-by: Weiwei Li <liwei1518@gmail.com> >> Reviewed-by: Liu Zhiwei <zhiwei_liu@linux.alibaba.com> >> --- >> tcg/riscv/tcg-target.c.inc | 121 +++++++++++++++++++++++++++++++++++++ >> 1 file changed, 121 insertions(+) >> >> diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc >> index ca9bafcb3c..d17f523187 100644 >> --- a/tcg/riscv/tcg-target.c.inc >> +++ b/tcg/riscv/tcg-target.c.inc >> @@ -167,6 +167,18 @@ static bool tcg_target_const_match(int64_t val, >> int ct, >> * RISC-V Base ISA opcodes (IM) >> */ >> +#define V_OPIVV (0x0 << 12) >> +#define V_OPFVV (0x1 << 12) >> +#define V_OPMVV (0x2 << 12) >> +#define V_OPIVI (0x3 << 12) >> +#define V_OPIVX (0x4 << 12) >> +#define V_OPFVF (0x5 << 12) >> +#define V_OPMVX (0x6 << 12) >> +#define V_OPCFG (0x7 << 12) >> + >> +#define V_SUMOP (0x0 << 20) >> +#define V_LUMOP (0x0 << 20) >> + >> typedef enum { >> OPC_ADD = 0x33, >> OPC_ADDI = 0x13, >> @@ -262,6 +274,11 @@ typedef enum { >> /* Zicond: integer conditional operations */ >> OPC_CZERO_EQZ = 0x0e005033, >> OPC_CZERO_NEZ = 0x0e007033, >> + >> + /* V: Vector extension 1.0 */ >> + OPC_VSETVLI = 0x57 | V_OPCFG, >> + OPC_VSETIVLI = 0xc0000057 | V_OPCFG, >> + OPC_VSETVL = 0x80000057 | V_OPCFG, >> } RISCVInsn; >> /* >> @@ -354,6 +371,42 @@ static int32_t encode_uj(RISCVInsn opc, TCGReg >> rd, uint32_t imm) >> return opc | (rd & 0x1f) << 7 | encode_ujimm20(imm); >> } >> +typedef enum { >> + VTA_TU = 0, >> + VTA_TA, >> +} RISCVVta; >> + >> +typedef enum { >> + VMA_MU = 0, >> + VMA_MA, >> +} RISCVVma; >> + >> +typedef enum { >> + VSEW_E8 = 0, /* EW=8b */ >> + VSEW_E16, /* EW=16b */ >> + VSEW_E32, /* EW=32b */ >> + VSEW_E64, /* EW=64b */ >> +} RISCVVsew; > > This exactly aligns with MemOp and vece. Do we really need an enum > for this? OK. We will use MemOp enum next version. 
> >> + >> +typedef enum { >> + VLMUL_M1 = 0, /* LMUL=1 */ >> + VLMUL_M2, /* LMUL=2 */ >> + VLMUL_M4, /* LMUL=4 */ >> + VLMUL_M8, /* LMUL=8 */ >> + VLMUL_RESERVED, >> + VLMUL_MF8, /* LMUL=1/8 */ >> + VLMUL_MF4, /* LMUL=1/4 */ >> + VLMUL_MF2, /* LMUL=1/2 */ >> +} RISCVVlmul; >> +#define LMUL_MAX 8 >> + >> +static int32_t encode_vtype(RISCVVta vta, RISCVVma vma, >> + RISCVVsew vsew, RISCVVlmul vlmul) >> +{ >> + return (vma & 0x1) << 7 | (vta & 0x1) << 6 | (vsew & 0x7) << 3 | >> + (vlmul & 0x7); >> +} > >> +static void tcg_out_vsetvl(TCGContext *s, uint32_t avl, RISCVVta vta, >> + RISCVVma vma, RISCVVsew vsew, >> + RISCVVlmul vlmul) >> +{ >> + int32_t vtypei = encode_vtype(vta, vma, vsew, vlmul); >> + >> + if (avl < 32) { >> + tcg_out_opc_vconfig(s, OPC_VSETIVLI, TCG_REG_ZERO, avl, >> vtypei); >> + } else { >> + tcg_out_opc_imm(s, OPC_ADDI, TCG_REG_TMP0, TCG_REG_ZERO, avl); >> + tcg_out_opc_vconfig(s, OPC_VSETVLI, TCG_REG_ZERO, >> TCG_REG_TMP0, vtypei); >> + } >> +} >> + >> +/* >> + * TODO: If the vtype value is not supported by the implementation, >> + * then the vill bit is set in vtype, the remaining bits in >> + * vtype are set to zero, and the vl register is also set to zero >> + */ >> + >> +static __thread unsigned prev_size; >> +static __thread unsigned prev_vece = MO_8; >> +static __thread bool vec_vtpye_init = true; > > Typo in vtpye. OK. > > That said, init should be redundant. I think you only need one > variable here: > > static __thread int prev_vtype; Agree. > > Since any vtype < 0 is vill, the "uninitialized" value is easily -1. OK. We will set it to -1 in tcg_out_tb_start. 
> >> +static inline void tcg_target_set_vec_config(TCGContext *s, TCGType >> type, >> + unsigned vece) >> +{ >> + unsigned oprsz = get_vec_type_bytes(type); >> + >> + if (!vec_vtpye_init && (prev_size == oprsz && prev_vece == vece)) { >> + return ; >> + } > > int vtype = encode_vtype(...); > if (vtype != prev_vtype) { > prev_vtype = vtype; > tcg_out_vsetvl(s, vtype); > } > >> @@ -1914,6 +2029,11 @@ static void tcg_out_vec_op(TCGContext *s, >> TCGOpcode opc, >> const TCGArg args[TCG_MAX_OP_ARGS], >> const int const_args[TCG_MAX_OP_ARGS]) >> { >> + TCGType type = vecl + TCG_TYPE_V64; >> + >> + if (vec_vtpye_init) { >> + tcg_target_set_vec_config(s, type, vece); >> + } > > Here is perhaps too early... see patch 8 re logicals. I guess you mean we haven't implemented any vector op yet, so there is no need to set vsetvl in this patch. We will postpone it to the ops that really need it. Thanks, Zhiwei > > > r~
On 8/19/24 11:34, LIU Zhiwei wrote: >>> @@ -1914,6 +2029,11 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, >>> const TCGArg args[TCG_MAX_OP_ARGS], >>> const int const_args[TCG_MAX_OP_ARGS]) >>> { >>> + TCGType type = vecl + TCG_TYPE_V64; >>> + >>> + if (vec_vtpye_init) { >>> + tcg_target_set_vec_config(s, type, vece); >>> + } >> >> Here is perhaps too early... see patch 8 re logicals. > > I guess you mean we don't have implemented any vector op, so there is no need to set > vsetvl in this patch. We will postpone it do really ops need it. What I meant is "too early in the function", i.e. before the switch. Per my comment in patch 8, there are some vector ops that are agnostic to type and only care about length. Thus perhaps switch (op) { case xxx: set_vec_config_len(s, type); something; case yyy: set_vec_config_len_elt(s, type, vece); something_else; ... } Or some other structure that makes sense. r~
On 2024/8/19 10:35, Richard Henderson wrote: > On 8/19/24 11:34, LIU Zhiwei wrote: >>>> @@ -1914,6 +2029,11 @@ static void tcg_out_vec_op(TCGContext *s, >>>> TCGOpcode opc, >>>> const TCGArg args[TCG_MAX_OP_ARGS], >>>> const int const_args[TCG_MAX_OP_ARGS]) >>>> { >>>> + TCGType type = vecl + TCG_TYPE_V64; >>>> + >>>> + if (vec_vtpye_init) { >>>> + tcg_target_set_vec_config(s, type, vece); >>>> + } >>> >>> Here is perhaps too early... see patch 8 re logicals. >> >> I guess you mean we don't have implemented any vector op, so there is >> no need to set vsetvl in this patch. We will postpone it do really >> ops need it. > > What I meant is "too early in the function", i.e. before the switch. > > Per my comment in patch 8, there are some vector ops that are agnostic > to type and only care about length. Thus perhaps > > switch (op) { > case xxx: > set_vec_config_len(s, type); > something; > > case yyy: > set_vec_config_len_elt(s, type, vece); > something_else; > > ... > } > > Or some other structure that makes sense. Thanks for clarifying once again. It's much better to explicitly have two API types. Thanks, Zhiwei > > > r~
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc index ca9bafcb3c..d17f523187 100644 --- a/tcg/riscv/tcg-target.c.inc +++ b/tcg/riscv/tcg-target.c.inc @@ -167,6 +167,18 @@ static bool tcg_target_const_match(int64_t val, int ct, * RISC-V Base ISA opcodes (IM) */ +#define V_OPIVV (0x0 << 12) +#define V_OPFVV (0x1 << 12) +#define V_OPMVV (0x2 << 12) +#define V_OPIVI (0x3 << 12) +#define V_OPIVX (0x4 << 12) +#define V_OPFVF (0x5 << 12) +#define V_OPMVX (0x6 << 12) +#define V_OPCFG (0x7 << 12) + +#define V_SUMOP (0x0 << 20) +#define V_LUMOP (0x0 << 20) + typedef enum { OPC_ADD = 0x33, OPC_ADDI = 0x13, @@ -262,6 +274,11 @@ typedef enum { /* Zicond: integer conditional operations */ OPC_CZERO_EQZ = 0x0e005033, OPC_CZERO_NEZ = 0x0e007033, + + /* V: Vector extension 1.0 */ + OPC_VSETVLI = 0x57 | V_OPCFG, + OPC_VSETIVLI = 0xc0000057 | V_OPCFG, + OPC_VSETVL = 0x80000057 | V_OPCFG, } RISCVInsn; /* @@ -354,6 +371,42 @@ static int32_t encode_uj(RISCVInsn opc, TCGReg rd, uint32_t imm) return opc | (rd & 0x1f) << 7 | encode_ujimm20(imm); } +typedef enum { + VTA_TU = 0, + VTA_TA, +} RISCVVta; + +typedef enum { + VMA_MU = 0, + VMA_MA, +} RISCVVma; + +typedef enum { + VSEW_E8 = 0, /* EW=8b */ + VSEW_E16, /* EW=16b */ + VSEW_E32, /* EW=32b */ + VSEW_E64, /* EW=64b */ +} RISCVVsew; + +typedef enum { + VLMUL_M1 = 0, /* LMUL=1 */ + VLMUL_M2, /* LMUL=2 */ + VLMUL_M4, /* LMUL=4 */ + VLMUL_M8, /* LMUL=8 */ + VLMUL_RESERVED, + VLMUL_MF8, /* LMUL=1/8 */ + VLMUL_MF4, /* LMUL=1/4 */ + VLMUL_MF2, /* LMUL=1/2 */ +} RISCVVlmul; +#define LMUL_MAX 8 + +static int32_t encode_vtype(RISCVVta vta, RISCVVma vma, + RISCVVsew vsew, RISCVVlmul vlmul) +{ + return (vma & 0x1) << 7 | (vta & 0x1) << 6 | (vsew & 0x7) << 3 | + (vlmul & 0x7); +} + /* * RISC-V instruction emitters */ @@ -483,6 +536,12 @@ static void tcg_out_opc_reg_vec_i(TCGContext *s, RISCVInsn opc, tcg_out32(s, encode_r(opc, rd, (imm & 0x1f), vs2) | (vm << 25)); } +static void tcg_out_opc_vec_config(TCGContext *s, RISCVInsn 
opc, + TCGReg rd, uint32_t avl, int32_t vtypei) +{ + tcg_out32(s, encode_i(opc, rd, avl, vtypei)); +} + /* vm=0 (vm = false) means vector masking ENABLED. */ #define tcg_out_opc_vv(s, opc, vd, vs2, vs1, vm) \ tcg_out_opc_reg_vec(s, opc, vd, vs1, vs2, vm); @@ -497,12 +556,68 @@ static void tcg_out_opc_reg_vec_i(TCGContext *s, RISCVInsn opc, #define tcg_out_opc_vi(s, opc, vd, vs2, imm, vm) \ tcg_out_opc_reg_vec_i(s, opc, vd, imm, vs2, vm); +#define tcg_out_opc_vconfig(s, opc, rd, avl, vtypei) \ + tcg_out_opc_vec_config(s, opc, rd, avl, vtypei); + /* * Only unit-stride addressing implemented; may extend in future. */ #define tcg_out_opc_ldst_vec(s, opc, vs3_vd, rs1, vm) \ tcg_out_opc_reg_vec(s, opc, vs3_vd, rs1, 0, vm); +static void tcg_out_vsetvl(TCGContext *s, uint32_t avl, RISCVVta vta, + RISCVVma vma, RISCVVsew vsew, + RISCVVlmul vlmul) +{ + int32_t vtypei = encode_vtype(vta, vma, vsew, vlmul); + + if (avl < 32) { + tcg_out_opc_vconfig(s, OPC_VSETIVLI, TCG_REG_ZERO, avl, vtypei); + } else { + tcg_out_opc_imm(s, OPC_ADDI, TCG_REG_TMP0, TCG_REG_ZERO, avl); + tcg_out_opc_vconfig(s, OPC_VSETVLI, TCG_REG_ZERO, TCG_REG_TMP0, vtypei); + } +} + +/* + * TODO: If the vtype value is not supported by the implementation, + * then the vill bit is set in vtype, the remaining bits in + * vtype are set to zero, and the vl register is also set to zero + */ + +static __thread unsigned prev_size; +static __thread unsigned prev_vece = MO_8; +static __thread bool vec_vtpye_init = true; + +#define get_vlmax(vsew) (riscv_vlen / (8 << vsew) * (LMUL_MAX)) +#define get_vec_type_bytes(type) (type >= TCG_TYPE_V64 ? 
\ + (8 << (type - TCG_TYPE_V64)) : 0) +#define encode_lmul(oprsz, vlenb) (ctzl(oprsz / vlenb)) + +static inline void tcg_target_set_vec_config(TCGContext *s, TCGType type, + unsigned vece) +{ + unsigned oprsz = get_vec_type_bytes(type); + + if (!vec_vtpye_init && (prev_size == oprsz && prev_vece == vece)) { + return ; + } + + RISCVVsew vsew = vece - MO_8 + VSEW_E8; + unsigned avl = oprsz / (1 << vece); + unsigned vlenb = riscv_vlen / 8; + RISCVVlmul lmul = oprsz > vlenb ? + encode_lmul(oprsz, vlenb) : VLMUL_M1; + tcg_debug_assert(avl <= get_vlmax(vsew)); + tcg_debug_assert(lmul <= VLMUL_RESERVED); + + prev_size = oprsz; + prev_vece = vece; + vec_vtpye_init = false; + tcg_out_vsetvl(s, avl, VTA_TA, VMA_MA, vsew, lmul); + return ; +} + /* * TCG intrinsics */ @@ -1914,6 +2029,11 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, const TCGArg args[TCG_MAX_OP_ARGS], const int const_args[TCG_MAX_OP_ARGS]) { + TCGType type = vecl + TCG_TYPE_V64; + + if (vec_vtpye_init) { + tcg_target_set_vec_config(s, type, vece); + } switch (opc) { case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. */ case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. */ @@ -2151,6 +2271,7 @@ static void tcg_target_qemu_prologue(TCGContext *s) static void tcg_out_tb_start(TCGContext *s) { + vec_vtpye_init = true; /* nothing to do */ }