diff mbox series

[v4,46/48] target/loongarch: Implement xvshuf xvperm{i} xvshuf4i xvextrins

Message ID 20230830084902.2113960-47-gaosong@loongson.cn (mailing list archive)
State New, archived
Headers show
Series Add LoongArch LASX instructions | expand

Commit Message

Song Gao Aug. 30, 2023, 8:49 a.m. UTC
This patch includes:
- XVSHUF.{B/H/W/D};
- XVPERM.W;
- XVSHUF4i.{B/H/W/D};
- XVPERMI.{W/D/Q};
- XVEXTRINS.{B/H/W/D}.

Signed-off-by: Song Gao <gaosong@loongson.cn>
---
 target/loongarch/helper.h                    |   3 +
 target/loongarch/vec.h                       |   2 +
 target/loongarch/insns.decode                |  21 ++++
 target/loongarch/disas.c                     |  21 ++++
 target/loongarch/vec_helper.c                | 112 +++++++++++++++----
 target/loongarch/insn_trans/trans_lasx.c.inc |  21 ++++
 6 files changed, 161 insertions(+), 19 deletions(-)
diff mbox series

Patch

diff --git a/target/loongarch/helper.h b/target/loongarch/helper.h
index fb489dda2d..b3b64a0215 100644
--- a/target/loongarch/helper.h
+++ b/target/loongarch/helper.h
@@ -709,7 +709,10 @@  DEF_HELPER_FLAGS_4(vshuf4i_h, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
 DEF_HELPER_FLAGS_4(vshuf4i_w, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
 DEF_HELPER_FLAGS_4(vshuf4i_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
 
+DEF_HELPER_FLAGS_4(vperm_w, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(vpermi_w, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(vpermi_d, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
+DEF_HELPER_FLAGS_4(vpermi_q, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
 
 DEF_HELPER_FLAGS_4(vextrins_b, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
 DEF_HELPER_FLAGS_4(vextrins_h, TCG_CALL_NO_RWG, void, ptr, ptr, i64, i32)
diff --git a/target/loongarch/vec.h b/target/loongarch/vec.h
index bc74effb7c..61e5b69c1e 100644
--- a/target/loongarch/vec.h
+++ b/target/loongarch/vec.h
@@ -93,4 +93,6 @@ 
 #define VSLE(a, b) (a <= b ? -1 : 0)
 #define VSLT(a, b) (a < b ? -1 : 0)
 
+#define SHF_POS(i, imm) (((i) & 0xfc) + (((imm) >> (2 * ((i) & 0x03))) & 0x03))
+
 #endif /* LOONGARCH_VEC_H */
diff --git a/target/loongarch/insns.decode b/target/loongarch/insns.decode
index a325b861c1..64b67ee9ac 100644
--- a/target/loongarch/insns.decode
+++ b/target/loongarch/insns.decode
@@ -2039,3 +2039,24 @@  xvilvh_b         0111 01010001 11000 ..... ..... .....    @vvv
 xvilvh_h         0111 01010001 11001 ..... ..... .....    @vvv
 xvilvh_w         0111 01010001 11010 ..... ..... .....    @vvv
 xvilvh_d         0111 01010001 11011 ..... ..... .....    @vvv
+
+xvshuf_b         0000 11010110 ..... ..... ..... .....    @vvvv
+xvshuf_h         0111 01010111 10101 ..... ..... .....    @vvv
+xvshuf_w         0111 01010111 10110 ..... ..... .....    @vvv
+xvshuf_d         0111 01010111 10111 ..... ..... .....    @vvv
+
+xvperm_w         0111 01010111 11010 ..... ..... .....    @vvv
+
+xvshuf4i_b       0111 01111001 00 ........ ..... .....    @vv_ui8
+xvshuf4i_h       0111 01111001 01 ........ ..... .....    @vv_ui8
+xvshuf4i_w       0111 01111001 10 ........ ..... .....    @vv_ui8
+xvshuf4i_d       0111 01111001 11 ........ ..... .....    @vv_ui8
+
+xvpermi_w        0111 01111110 01 ........ ..... .....    @vv_ui8
+xvpermi_d        0111 01111110 10 ........ ..... .....    @vv_ui8
+xvpermi_q        0111 01111110 11 ........ ..... .....    @vv_ui8
+
+xvextrins_d      0111 01111000 00 ........ ..... .....    @vv_ui8
+xvextrins_w      0111 01111000 01 ........ ..... .....    @vv_ui8
+xvextrins_h      0111 01111000 10 ........ ..... .....    @vv_ui8
+xvextrins_b      0111 01111000 11 ........ ..... .....    @vv_ui8
diff --git a/target/loongarch/disas.c b/target/loongarch/disas.c
index 9b6a07bbb0..a518c59772 100644
--- a/target/loongarch/disas.c
+++ b/target/loongarch/disas.c
@@ -2575,3 +2575,24 @@  INSN_LASX(xvilvh_b,          vvv)
 INSN_LASX(xvilvh_h,          vvv)
 INSN_LASX(xvilvh_w,          vvv)
 INSN_LASX(xvilvh_d,          vvv)
+
+INSN_LASX(xvshuf_b,          vvvv)
+INSN_LASX(xvshuf_h,          vvv)
+INSN_LASX(xvshuf_w,          vvv)
+INSN_LASX(xvshuf_d,          vvv)
+
+INSN_LASX(xvperm_w,          vvv)
+
+INSN_LASX(xvshuf4i_b,        vv_i)
+INSN_LASX(xvshuf4i_h,        vv_i)
+INSN_LASX(xvshuf4i_w,        vv_i)
+INSN_LASX(xvshuf4i_d,        vv_i)
+
+INSN_LASX(xvpermi_w,         vv_i)
+INSN_LASX(xvpermi_d,         vv_i)
+INSN_LASX(xvpermi_q,         vv_i)
+
+INSN_LASX(xvextrins_d,       vv_i)
+INSN_LASX(xvextrins_w,       vv_i)
+INSN_LASX(xvextrins_h,       vv_i)
+INSN_LASX(xvextrins_b,       vv_i)
diff --git a/target/loongarch/vec_helper.c b/target/loongarch/vec_helper.c
index 34be19891a..97058ac2b3 100644
--- a/target/loongarch/vec_helper.c
+++ b/target/loongarch/vec_helper.c
@@ -3261,17 +3261,24 @@  VILVH(vilvh_d, 128, D)
 void HELPER(vshuf_b)(void *vd, void *vj, void *vk, void *va, uint32_t desc)
 {
     int i, m;
-    VReg temp;
+    VReg temp = {};
     VReg *Vd = (VReg *)vd;
     VReg *Vj = (VReg *)vj;
     VReg *Vk = (VReg *)vk;
     VReg *Va = (VReg *)va;
+    int oprsz = simd_oprsz(desc);
 
-    m = LSX_LEN/8;
-    for (i = 0; i < m ; i++) {
+    m = LSX_LEN / 8;
+    for (i = 0; i < m; i++) {
         uint64_t k = (uint8_t)Va->B(i) % (2 * m);
         temp.B(i) = k < m ? Vk->B(k) : Vj->B(k - m);
     }
+    if (oprsz == 32) {
+        for(i = m; i < 2 * m; i++) {
+            uint64_t j = (uint8_t)Va->B(i) % (2 * m);
+            temp.B(i) = j < m ? Vk->B(j + m) : Vj->B(j);
+        }
+    }
     *Vd = temp;
 }
 
@@ -3279,16 +3286,23 @@  void HELPER(vshuf_b)(void *vd, void *vj, void *vk, void *va, uint32_t desc)
 void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \
 {                                                              \
     int i, m;                                                  \
-    VReg temp;                                                 \
+    VReg temp = {};                                            \
     VReg *Vd = (VReg *)vd;                                     \
     VReg *Vj = (VReg *)vj;                                     \
     VReg *Vk = (VReg *)vk;                                     \
+    int oprsz = simd_oprsz(desc);                              \
                                                                \
-    m = LSX_LEN/BIT;                                           \
+    m = LSX_LEN / BIT;                                         \
     for (i = 0; i < m; i++) {                                  \
-        uint64_t k  = ((uint8_t) Vd->E(i)) % (2 * m);          \
+        uint64_t k  = (uint8_t)Vd->E(i) % (2 * m);             \
         temp.E(i) = k < m ? Vk->E(k) : Vj->E(k - m);           \
     }                                                          \
+    if (oprsz == 32) {                                         \
+        for (i = m; i < 2 * m; i++) {                          \
+            uint64_t j = (uint8_t)Vd->E(i) % (2 * m);          \
+            temp.E(i) = j < m ? Vk->E(j + m): Vj->E(j);        \
+        }                                                      \
+    }                                                          \
     *Vd = temp;                                                \
 }
 
@@ -3299,14 +3313,20 @@  VSHUF(vshuf_d, 64, D)
 #define VSHUF4I(NAME, BIT, E)                                      \
 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
 {                                                                  \
-    int i;                                                         \
-    VReg temp;                                                     \
+    int i, max;                                                    \
+    VReg temp = {};                                                \
     VReg *Vd = (VReg *)vd;                                         \
     VReg *Vj = (VReg *)vj;                                         \
+    int oprsz = simd_oprsz(desc);                                  \
                                                                    \
-    for (i = 0; i < LSX_LEN/BIT; i++) {                            \
-         temp.E(i) = Vj->E(((i) & 0xfc) + (((imm) >>               \
-                           (2 * ((i) & 0x03))) & 0x03));           \
+    max = LSX_LEN / BIT;                                           \
+    for (i = 0; i < max; i++) {                                    \
+        temp.E(i) = Vj->E(SHF_POS(i, imm));                        \
+    }                                                              \
+    if (oprsz == 32) {                                             \
+        for (i = max; i < 2 * max; i++) {                          \
+            temp.E(i) = Vj->E(SHF_POS(i - max, imm) + max);        \
+        }                                                          \
     }                                                              \
     *Vd = temp;                                                    \
 }
@@ -3317,38 +3337,92 @@  VSHUF4I(vshuf4i_w, 32, W)
 
 void HELPER(vshuf4i_d)(void *vd, void *vj, uint64_t imm, uint32_t desc)
 {
+    int i;
+    VReg temp = {};
     VReg *Vd = (VReg *)vd;
     VReg *Vj = (VReg *)vj;
+    int oprsz = simd_oprsz(desc);
 
-    VReg temp;
-    temp.D(0) = (imm & 2 ? Vj : Vd)->D(imm & 1);
-    temp.D(1) = (imm & 8 ? Vj : Vd)->D((imm >> 2) & 1);
+    for (i = 0; i < oprsz / 16; i++) {
+        temp.D(2 * i) = (imm & 2 ? Vj : Vd)->D((imm & 1) + 2 * i);
+        temp.D(2 * i + 1) = (imm & 8 ? Vj : Vd)->D(((imm >> 2) & 1) + 2 * i);
+    }
+    *Vd = temp;
+}
+
+void HELPER(vperm_w)(void *vd, void *vj, void *vk, uint32_t desc)
+{
+    int i, m;
+    VReg temp = {};
+    VReg *Vd = (VReg *)vd;
+    VReg *Vj = (VReg *)vj;
+    VReg *Vk = (VReg *)vk;
+
+    m = LASX_LEN / 32;
+    for (i = 0; i < m ; i++) {
+        uint64_t k = (uint8_t)Vk->W(i) % 8;
+        temp.W(i) = Vj->W(k);
+    }
     *Vd = temp;
 }
 
 void HELPER(vpermi_w)(void *vd, void *vj, uint64_t imm, uint32_t desc)
+{
+    int i;
+    VReg temp = {};
+    VReg *Vd = (VReg *)vd;
+    VReg *Vj = (VReg *)vj;
+    int oprsz = simd_oprsz(desc);
+
+    for (i = 0; i < oprsz / 16; i++) {
+        temp.W(4 * i) = Vj->W((imm & 0x3) + 4 * i);
+        temp.W(4 * i + 1) = Vj->W(((imm >> 2) & 0x3) + 4 * i);
+        temp.W(4 * i + 2) = Vd->W(((imm >> 4) & 0x3) + 4 * i);
+        temp.W(4 * i + 3) = Vd->W(((imm >> 6) & 0x3) + 4 * i);
+    }
+    *Vd = temp;
+}
+
+void HELPER(vpermi_d)(void *vd, void *vj, uint64_t imm, uint32_t desc)
+{
+    VReg temp = {};
+    VReg *Vd = (VReg *)vd;
+    VReg *Vj = (VReg *)vj;
+
+    temp.D(0) = Vj->D(imm & 0x3);
+    temp.D(1) = Vj->D((imm >> 2) & 0x3);
+    temp.D(2) = Vj->D((imm >> 4) & 0x3);
+    temp.D(3) = Vj->D((imm >> 6) & 0x3);
+    *Vd = temp;
+}
+
+void HELPER(vpermi_q)(void *vd, void *vj, uint64_t imm, uint32_t desc)
 {
     VReg temp;
     VReg *Vd = (VReg *)vd;
     VReg *Vj = (VReg *)vj;
 
-    temp.W(0) = Vj->W(imm & 0x3);
-    temp.W(1) = Vj->W((imm >> 2) & 0x3);
-    temp.W(2) = Vd->W((imm >> 4) & 0x3);
-    temp.W(3) = Vd->W((imm >> 6) & 0x3);
+    temp.Q(0) = (imm & 0x3) > 1 ? Vd->Q((imm & 0x3) - 2) : Vj->Q(imm & 0x3);
+    temp.Q(1) = ((imm >> 4) & 0x3) > 1 ? Vd->Q(((imm >> 4) & 0x3) - 2) :
+                                         Vj->Q((imm >> 4) & 0x3);
     *Vd = temp;
 }
 
 #define VEXTRINS(NAME, BIT, E, MASK)                               \
 void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
 {                                                                  \
-    int ins, extr;                                                 \
+    int ins, extr, max;                                            \
     VReg *Vd = (VReg *)vd;                                         \
     VReg *Vj = (VReg *)vj;                                         \
+    int oprsz = simd_oprsz(desc);                                  \
                                                                    \
+    max = LSX_LEN / BIT;                                           \
     ins = (imm >> 4) & MASK;                                       \
     extr = imm & MASK;                                             \
     Vd->E(ins) = Vj->E(extr);                                      \
+    if (oprsz == 32) {                                             \
+        Vd->E(ins + max) = Vj->E(extr + max);                      \
+    }                                                              \
 }
 
 VEXTRINS(vextrins_b, 8, B, 0xf)
diff --git a/target/loongarch/insn_trans/trans_lasx.c.inc b/target/loongarch/insn_trans/trans_lasx.c.inc
index aa374f3a00..ebbbc5a6bb 100644
--- a/target/loongarch/insn_trans/trans_lasx.c.inc
+++ b/target/loongarch/insn_trans/trans_lasx.c.inc
@@ -951,3 +951,24 @@  TRANS(xvilvh_b, LASX, gen_vvv, 32, gen_helper_vilvh_b)
 TRANS(xvilvh_h, LASX, gen_vvv, 32, gen_helper_vilvh_h)
 TRANS(xvilvh_w, LASX, gen_vvv, 32, gen_helper_vilvh_w)
 TRANS(xvilvh_d, LASX, gen_vvv, 32, gen_helper_vilvh_d)
+
+TRANS(xvshuf_b, LASX, gen_vvvv, 32, gen_helper_vshuf_b)
+TRANS(xvshuf_h, LASX, gen_vvv, 32, gen_helper_vshuf_h)
+TRANS(xvshuf_w, LASX, gen_vvv, 32, gen_helper_vshuf_w)
+TRANS(xvshuf_d, LASX, gen_vvv, 32, gen_helper_vshuf_d)
+
+TRANS(xvperm_w, LASX, gen_vvv, 32,  gen_helper_vperm_w)
+
+TRANS(xvshuf4i_b, LASX, gen_vv_i, 32, gen_helper_vshuf4i_b)
+TRANS(xvshuf4i_h, LASX, gen_vv_i, 32, gen_helper_vshuf4i_h)
+TRANS(xvshuf4i_w, LASX, gen_vv_i, 32, gen_helper_vshuf4i_w)
+TRANS(xvshuf4i_d, LASX, gen_vv_i, 32, gen_helper_vshuf4i_d)
+
+TRANS(xvpermi_w, LASX, gen_vv_i, 32, gen_helper_vpermi_w)
+TRANS(xvpermi_d, LASX, gen_vv_i, 32, gen_helper_vpermi_d)
+TRANS(xvpermi_q, LASX, gen_vv_i, 32, gen_helper_vpermi_q)
+
+TRANS(xvextrins_b, LASX, gen_vv_i, 32, gen_helper_vextrins_b)
+TRANS(xvextrins_h, LASX, gen_vv_i, 32, gen_helper_vextrins_h)
+TRANS(xvextrins_w, LASX, gen_vv_i, 32, gen_helper_vextrins_w)
+TRANS(xvextrins_d, LASX, gen_vv_i, 32, gen_helper_vextrins_d)