Message ID | 20230901093258.942357-4-c@jia.je (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Lower TCG vector ops to LSX | expand |
On 9/1/23 02:30, Jiajie Chen wrote: > Signed-off-by: Jiajie Chen <c@jia.je> > --- > tcg/loongarch64/tcg-target-con-set.h | 1 + > tcg/loongarch64/tcg-target.c.inc | 60 ++++++++++++++++++++++++++++ > 2 files changed, 61 insertions(+) Reviewed-by: Richard Henderson <richard.henderson@linaro.org> > > diff --git a/tcg/loongarch64/tcg-target-con-set.h b/tcg/loongarch64/tcg-target-con-set.h > index 37b3f80bf9..d04916db25 100644 > --- a/tcg/loongarch64/tcg-target-con-set.h > +++ b/tcg/loongarch64/tcg-target-con-set.h > @@ -31,4 +31,5 @@ C_O1_I2(r, 0, rZ) > C_O1_I2(r, rZ, ri) > C_O1_I2(r, rZ, rJ) > C_O1_I2(r, rZ, rZ) > +C_O1_I2(w, w, wJ) Notes for improvement: 'J' is a signed 32-bit immediate. > + if (const_args[2]) { > + /* > + * cmp_vec dest, src, value > + * Try vseqi/vslei/vslti > + */ > + int64_t value = sextract64(a2, 0, 8 << vece); > + if ((cond == TCG_COND_EQ || cond == TCG_COND_LE || \ > + cond == TCG_COND_LT) && (-0x10 <= value && value <= 0x0f)) { > + tcg_out32(s, encode_vdvjsk5_insn(cmp_vec_imm_insn[cond][vece], \ > + a0, a1, value)); > + break; > + } else if ((cond == TCG_COND_LEU || cond == TCG_COND_LTU) && > + (0x00 <= value && value <= 0x1f)) { > + tcg_out32(s, encode_vdvjuk5_insn(cmp_vec_imm_insn[cond][vece], \ > + a0, a1, value)); Better would be a new constraint that only matches -0x10 <= x <= 0x1f If the sign is wrong for the comparison, it can *always* be loaded with just vldi. Whereas at present, using J, > + tcg_out_dupi_vec(s, type, vece, temp_vec, a2); > + a2 = temp_vec; this may require 3 instructions (lu12i.w + ori + vreplgr2vr). By constraining the constants allowed, you allow the register allocator to see that a register is required, which may be reused for another instruction. r~
On 2023/9/2 01:24, Richard Henderson wrote: > On 9/1/23 02:30, Jiajie Chen wrote: >> Signed-off-by: Jiajie Chen <c@jia.je> >> --- >> tcg/loongarch64/tcg-target-con-set.h | 1 + >> tcg/loongarch64/tcg-target.c.inc | 60 ++++++++++++++++++++++++++++ >> 2 files changed, 61 insertions(+) > > Reviewed-by: Richard Henderson <richard.henderson@linaro.org> > > >> >> diff --git a/tcg/loongarch64/tcg-target-con-set.h >> b/tcg/loongarch64/tcg-target-con-set.h >> index 37b3f80bf9..d04916db25 100644 >> --- a/tcg/loongarch64/tcg-target-con-set.h >> +++ b/tcg/loongarch64/tcg-target-con-set.h >> @@ -31,4 +31,5 @@ C_O1_I2(r, 0, rZ) >> C_O1_I2(r, rZ, ri) >> C_O1_I2(r, rZ, rJ) >> C_O1_I2(r, rZ, rZ) >> +C_O1_I2(w, w, wJ) > > Notes for improvement: 'J' is a signed 32-bit immediate. I was wondering about the behavior of 'J' on i128 types: in tcg_target_const_match(), the argument type is int, so will the higher bits be truncated? Besides, tcg_target_const_match() does not know the vector element width. > >> + if (const_args[2]) { >> + /* >> + * cmp_vec dest, src, value >> + * Try vseqi/vslei/vslti >> + */ >> + int64_t value = sextract64(a2, 0, 8 << vece); >> + if ((cond == TCG_COND_EQ || cond == TCG_COND_LE || \ >> + cond == TCG_COND_LT) && (-0x10 <= value && value <= >> 0x0f)) { >> + tcg_out32(s, >> encode_vdvjsk5_insn(cmp_vec_imm_insn[cond][vece], \ >> + a0, a1, value)); >> + break; >> + } else if ((cond == TCG_COND_LEU || cond == >> TCG_COND_LTU) && >> + (0x00 <= value && value <= 0x1f)) { >> + tcg_out32(s, >> encode_vdvjuk5_insn(cmp_vec_imm_insn[cond][vece], \ >> + a0, a1, value)); > > Better would be a new constraint that only matches > > -0x10 <= x <= 0x1f > > If the sign is wrong for the comparison, it can *always* be loaded > with just vldi. > > Whereas at present, using J, > >> + tcg_out_dupi_vec(s, type, vece, temp_vec, a2); >> + a2 = temp_vec; > > this may require 3 instructions (lu12i.w + ori + vreplgr2vr). > > By constraining the constants allowed, you allow the register > allocator to see that a register is required, which may be reused for > another instruction. > > > r~
On 9/1/23 10:28, Jiajie Chen wrote: > > On 2023/9/2 01:24, Richard Henderson wrote: >> On 9/1/23 02:30, Jiajie Chen wrote: >>> Signed-off-by: Jiajie Chen <c@jia.je> >>> --- >>> tcg/loongarch64/tcg-target-con-set.h | 1 + >>> tcg/loongarch64/tcg-target.c.inc | 60 ++++++++++++++++++++++++++++ >>> 2 files changed, 61 insertions(+) >> >> Reviewed-by: Richard Henderson <richard.henderson@linaro.org> >> >> >>> >>> diff --git a/tcg/loongarch64/tcg-target-con-set.h b/tcg/loongarch64/tcg-target-con-set.h >>> index 37b3f80bf9..d04916db25 100644 >>> --- a/tcg/loongarch64/tcg-target-con-set.h >>> +++ b/tcg/loongarch64/tcg-target-con-set.h >>> @@ -31,4 +31,5 @@ C_O1_I2(r, 0, rZ) >>> C_O1_I2(r, rZ, ri) >>> C_O1_I2(r, rZ, rJ) >>> C_O1_I2(r, rZ, rZ) >>> +C_O1_I2(w, w, wJ) >> >> Notes for improvement: 'J' is a signed 32-bit immediate. > > > I was wondering about the behavior of 'J' on i128 types: in tcg_target_const_match(), the > argument type is int, so will the higher bits be truncated? The argument is int64_t val. The only constants that we allow for vectors are dupi, so all higher parts are the same as the lower part. > Besides, tcg_target_const_match() does not know the vector element width. No, it hadn't been required so far -- there are very few vector instructions that allow immediates. r~
On 2023/9/2 01:48, Richard Henderson wrote: > On 9/1/23 10:28, Jiajie Chen wrote: >> >> On 2023/9/2 01:24, Richard Henderson wrote: >>> On 9/1/23 02:30, Jiajie Chen wrote: >>>> Signed-off-by: Jiajie Chen <c@jia.je> >>>> --- >>>> tcg/loongarch64/tcg-target-con-set.h | 1 + >>>> tcg/loongarch64/tcg-target.c.inc | 60 >>>> ++++++++++++++++++++++++++++ >>>> 2 files changed, 61 insertions(+) >>> >>> Reviewed-by: Richard Henderson <richard.henderson@linaro.org> >>> >>> >>>> >>>> diff --git a/tcg/loongarch64/tcg-target-con-set.h >>>> b/tcg/loongarch64/tcg-target-con-set.h >>>> index 37b3f80bf9..d04916db25 100644 >>>> --- a/tcg/loongarch64/tcg-target-con-set.h >>>> +++ b/tcg/loongarch64/tcg-target-con-set.h >>>> @@ -31,4 +31,5 @@ C_O1_I2(r, 0, rZ) >>>> C_O1_I2(r, rZ, ri) >>>> C_O1_I2(r, rZ, rJ) >>>> C_O1_I2(r, rZ, rZ) >>>> +C_O1_I2(w, w, wJ) >>> >>> Notes for improvement: 'J' is a signed 32-bit immediate. >> >> >> I was wondering about the behavior of 'J' on i128 types: in >> tcg_target_const_match(), the argument type is int, so will the >> higher bits be truncated? > > The argument is int64_t val. > > The only constants that we allow for vectors are dupi, so all higher > parts are the same as the lower part. Consider the following scenario: cmp_vec v128,e32,tmp4,tmp3,v128$0xffffffffffffffff cmp_vec v128,e32,tmp4,tmp3,v128$0xfffffffefffffffe cmp_vec v128,e8,tmp4,tmp3,v128$0xfefefefefefefefe When matching constant constraint, the vector element width is unknown, so it cannot decide whether 0xfefefefefefefefe means e8 0xfe or e16 0xfefe. > >> Besides, tcg_target_const_match() does not know the vector element >> width. > > No, it hadn't been required so far -- there are very few vector > instructions that allow immediates. > > > r~
diff --git a/tcg/loongarch64/tcg-target-con-set.h b/tcg/loongarch64/tcg-target-con-set.h index 37b3f80bf9..d04916db25 100644 --- a/tcg/loongarch64/tcg-target-con-set.h +++ b/tcg/loongarch64/tcg-target-con-set.h @@ -31,4 +31,5 @@ C_O1_I2(r, 0, rZ) C_O1_I2(r, rZ, ri) C_O1_I2(r, rZ, rJ) C_O1_I2(r, rZ, rZ) +C_O1_I2(w, w, wJ) C_O1_I4(r, rZ, rJ, rZ, rZ) diff --git a/tcg/loongarch64/tcg-target.c.inc b/tcg/loongarch64/tcg-target.c.inc index 150278e112..18fe5fc148 100644 --- a/tcg/loongarch64/tcg-target.c.inc +++ b/tcg/loongarch64/tcg-target.c.inc @@ -1624,6 +1624,23 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, TCGType type = vecl + TCG_TYPE_V64; TCGArg a0, a1, a2; TCGReg temp = TCG_REG_TMP0; + TCGReg temp_vec = TCG_VEC_TMP0; + + static const LoongArchInsn cmp_vec_insn[16][4] = { + [TCG_COND_EQ] = {OPC_VSEQ_B, OPC_VSEQ_H, OPC_VSEQ_W, OPC_VSEQ_D}, + [TCG_COND_LE] = {OPC_VSLE_B, OPC_VSLE_H, OPC_VSLE_W, OPC_VSLE_D}, + [TCG_COND_LEU] = {OPC_VSLE_BU, OPC_VSLE_HU, OPC_VSLE_WU, OPC_VSLE_DU}, + [TCG_COND_LT] = {OPC_VSLT_B, OPC_VSLT_H, OPC_VSLT_W, OPC_VSLT_D}, + [TCG_COND_LTU] = {OPC_VSLT_BU, OPC_VSLT_HU, OPC_VSLT_WU, OPC_VSLT_DU}, + }; + static const LoongArchInsn cmp_vec_imm_insn[16][4] = { + [TCG_COND_EQ] = {OPC_VSEQI_B, OPC_VSEQI_H, OPC_VSEQI_W, OPC_VSEQI_D}, + [TCG_COND_LE] = {OPC_VSLEI_B, OPC_VSLEI_H, OPC_VSLEI_W, OPC_VSLEI_D}, + [TCG_COND_LEU] = {OPC_VSLEI_BU, OPC_VSLEI_HU, OPC_VSLEI_WU, OPC_VSLEI_DU}, + [TCG_COND_LT] = {OPC_VSLTI_B, OPC_VSLTI_H, OPC_VSLTI_W, OPC_VSLTI_D}, + [TCG_COND_LTU] = {OPC_VSLTI_BU, OPC_VSLTI_HU, OPC_VSLTI_WU, OPC_VSLTI_DU}, + }; + LoongArchInsn insn; a0 = args[0]; a1 = args[1]; @@ -1651,6 +1668,45 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, tcg_out_opc_vldx(s, a0, a1, temp); } break; + case INDEX_op_cmp_vec: + TCGCond cond = args[3]; + if (const_args[2]) { + /* + * cmp_vec dest, src, value + * Try vseqi/vslei/vslti + */ + int64_t value = sextract64(a2, 0, 8 << vece); + if ((cond == TCG_COND_EQ || cond == TCG_COND_LE || \ + cond == TCG_COND_LT) && (-0x10 <= value && value <= 0x0f)) { + tcg_out32(s, encode_vdvjsk5_insn(cmp_vec_imm_insn[cond][vece], \ + a0, a1, value)); + break; + } else if ((cond == TCG_COND_LEU || cond == TCG_COND_LTU) && + (0x00 <= value && value <= 0x1f)) { + tcg_out32(s, encode_vdvjuk5_insn(cmp_vec_imm_insn[cond][vece], \ + a0, a1, value)); + break; + } + + /* + * Fallback to: + * dupi_vec temp, a2 + * cmp_vec a0, a1, temp, cond + */ + tcg_out_dupi_vec(s, type, vece, temp_vec, a2); + a2 = temp_vec; + } + + insn = cmp_vec_insn[cond][vece]; + if (insn == 0) { + TCGArg t; + t = a1, a1 = a2, a2 = t; + cond = tcg_swap_cond(cond); + insn = cmp_vec_insn[cond][vece]; + tcg_debug_assert(insn != 0); + } + tcg_out32(s, encode_vdvjvk_insn(insn, a0, a1, a2)); + break; case INDEX_op_dupm_vec: tcg_out_dupm_vec(s, type, vece, a0, a1, a2); break; @@ -1666,6 +1722,7 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece) case INDEX_op_st_vec: case INDEX_op_dup_vec: case INDEX_op_dupm_vec: + case INDEX_op_cmp_vec: return 1; default: return 0; @@ -1827,6 +1884,9 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_st_vec: return C_O0_I2(w, r); + case INDEX_op_cmp_vec: + return C_O1_I2(w, w, wJ); + default: g_assert_not_reached(); }
Signed-off-by: Jiajie Chen <c@jia.je> --- tcg/loongarch64/tcg-target-con-set.h | 1 + tcg/loongarch64/tcg-target.c.inc | 60 ++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+)