Message ID | 20240904142739.854-7-zhiwei_liu@linux.alibaba.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Add support for vector | expand |
On 9/4/24 07:27, LIU Zhiwei wrote: > @@ -698,6 +704,21 @@ static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg) > case TCG_TYPE_I64: > tcg_out_opc_imm(s, OPC_ADDI, ret, arg, 0); > break; > + case TCG_TYPE_V64: > + case TCG_TYPE_V128: > + case TCG_TYPE_V256: > + { > + int nf = get_vec_type_bytes(type) / riscv_vlenb; > + > + if (nf != 0) { > + tcg_debug_assert(is_power_of_2(nf) && nf <= 8); > + tcg_out_opc_vi(s, OPC_VMVNR_V, ret, arg, nf - 1, true); > + } else { > + riscv_set_vec_config_vl(s, type); > + tcg_out_opc_vv(s, OPC_VMV_V_V, ret, TCG_REG_V0, arg, true); > + } > + } > + break; Perhaps int lmul = type - riscv_lg2_vlenb; int nf = 1 << MAX(lmul, 0); tcg_out_opc_vi(s, OPC_VMVNR_V, ret, arg, nf - 1); Is there a reason to prefer vmv.v.v over vmvnr.v? Seems like we can always move one vector reg... > +static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, > + TCGReg dst, int64_t arg) > +{ > + if (arg < 16 && arg >= -16) { > + riscv_set_vec_config_vl_vece(s, type, vece); > + tcg_out_opc_vi(s, OPC_VMV_V_I, dst, TCG_REG_V0, arg, true); > + return; > + } > + tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP0, arg); > + tcg_out_dup_vec(s, type, vece, dst, TCG_REG_TMP0); > +} I'll note that 0 and -1 do not require SEW change. I don't know how often that will come up, since in my testing with aarch64, we usually needed to swap to TCG_TYPE_V256 anyway. r~
On 2024/9/5 14:56, Richard Henderson wrote: > On 9/4/24 07:27, LIU Zhiwei wrote: >> @@ -698,6 +704,21 @@ static bool tcg_out_mov(TCGContext *s, TCGType >> type, TCGReg ret, TCGReg arg) >> case TCG_TYPE_I64: >> tcg_out_opc_imm(s, OPC_ADDI, ret, arg, 0); >> break; >> + case TCG_TYPE_V64: >> + case TCG_TYPE_V128: >> + case TCG_TYPE_V256: >> + { >> + int nf = get_vec_type_bytes(type) / riscv_vlenb; >> + >> + if (nf != 0) { >> + tcg_debug_assert(is_power_of_2(nf) && nf <= 8); >> + tcg_out_opc_vi(s, OPC_VMVNR_V, ret, arg, nf - 1, true); >> + } else { >> + riscv_set_vec_config_vl(s, type); >> + tcg_out_opc_vv(s, OPC_VMV_V_V, ret, TCG_REG_V0, arg, >> true); >> + } >> + } >> + break; > > Perhaps > > int lmul = type - riscv_lg2_vlenb; > int nf = 1 << MAX(lmul, 0); > tcg_out_opc_vi(s, OPC_VMVNR_V, ret, arg, nf - 1); > > Is there a reason to prefer vmv.v.v over vmvnr.v? I think it's a trade-off. For some CPUs, instructions will be split internally. Thus the smaller the fractional LMUL is, the fewer micro-ops for execution. That's the benefit of using vmv.v.v. But here we also need a vsetivli. On some CPUs, it can be fused with the next instruction. > Seems like we can always move one vector reg... OK. I will do it this way. > >> +static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned >> vece, >> + TCGReg dst, int64_t arg) >> +{ >> + if (arg < 16 && arg >= -16) { >> + riscv_set_vec_config_vl_vece(s, type, vece); >> + tcg_out_opc_vi(s, OPC_VMV_V_I, dst, TCG_REG_V0, arg, true); >> + return; >> + } >> + tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP0, arg); >> + tcg_out_dup_vec(s, type, vece, dst, TCG_REG_TMP0); >> +} > > I'll note that 0 and -1 do not require SEW change. I don't know how often that will come up In our tests on OpenCV, we get a rate of 99.7%. Thus we will optimize this in the next version. Thanks, Zhiwei > , since in my testing with aarch64, we usually needed to swap to > TCG_TYPE_V256 anyway. > > > r~
diff --git a/tcg/riscv/tcg-target.c.inc b/tcg/riscv/tcg-target.c.inc index 4b1079fc6f..ddb0c8190c 100644 --- a/tcg/riscv/tcg-target.c.inc +++ b/tcg/riscv/tcg-target.c.inc @@ -309,6 +309,12 @@ typedef enum { OPC_VS2R_V = 0x2000027 | V_UNIT_STRIDE_WHOLE_REG | V_NF(1), OPC_VS4R_V = 0x2000027 | V_UNIT_STRIDE_WHOLE_REG | V_NF(3), OPC_VS8R_V = 0x2000027 | V_UNIT_STRIDE_WHOLE_REG | V_NF(7), + + OPC_VMV_V_V = 0x5e000057 | V_OPIVV, + OPC_VMV_V_I = 0x5e000057 | V_OPIVI, + OPC_VMV_V_X = 0x5e000057 | V_OPIVX, + + OPC_VMVNR_V = 0x9e000057 | V_OPIVI, } RISCVInsn; /* @@ -698,6 +704,21 @@ static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg) case TCG_TYPE_I64: tcg_out_opc_imm(s, OPC_ADDI, ret, arg, 0); break; + case TCG_TYPE_V64: + case TCG_TYPE_V128: + case TCG_TYPE_V256: + { + int nf = get_vec_type_bytes(type) / riscv_vlenb; + + if (nf != 0) { + tcg_debug_assert(is_power_of_2(nf) && nf <= 8); + tcg_out_opc_vi(s, OPC_VMVNR_V, ret, arg, nf - 1, true); + } else { + riscv_set_vec_config_vl(s, type); + tcg_out_opc_vv(s, OPC_VMV_V_V, ret, TCG_REG_V0, arg, true); + } + } + break; default: g_assert_not_reached(); } @@ -1106,6 +1127,33 @@ static void tcg_out_addsub2(TCGContext *s, } } +static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece, + TCGReg dst, TCGReg src) +{ + riscv_set_vec_config_vl_vece(s, type, vece); + tcg_out_opc_vx(s, OPC_VMV_V_X, dst, TCG_REG_V0, src, true); + return true; +} + +static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece, + TCGReg dst, TCGReg base, intptr_t offset) +{ + tcg_out_ld(s, TCG_TYPE_REG, TCG_REG_TMP0, base, offset); + return tcg_out_dup_vec(s, type, vece, dst, TCG_REG_TMP0); +} + +static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece, + TCGReg dst, int64_t arg) +{ + if (arg < 16 && arg >= -16) { + riscv_set_vec_config_vl_vece(s, type, vece); + tcg_out_opc_vi(s, OPC_VMV_V_I, dst, TCG_REG_V0, arg, true); + return; + } + tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP0, arg); + 
tcg_out_dup_vec(s, type, vece, dst, TCG_REG_TMP0); +} + static const struct { RISCVInsn op; bool swap; @@ -2234,6 +2282,9 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc, a2 = args[2]; switch (opc) { + case INDEX_op_dupm_vec: + tcg_out_dupm_vec(s, type, vece, a0, a1, a2); + break; case INDEX_op_ld_vec: tcg_out_ld(s, type, a0, a1, a2); break; @@ -2405,6 +2456,8 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op) case INDEX_op_st_vec: return C_O0_I2(v, r); + case INDEX_op_dup_vec: + case INDEX_op_dupm_vec: case INDEX_op_ld_vec: return C_O1_I1(v, r); default: