diff mbox series

[RFC,v4,74/75] target/i386: convert pshuf(w, lw, hw, d), shuf(pd, ps) helpers to gvec style

Message ID 20190821172951.15333-75-jan.bobek@gmail.com (mailing list archive)
State New, archived
Headers show
Series rewrite MMX/SSE*/AVX/AVX2 vector instruction translation | expand

Commit Message

Jan Bobek Aug. 21, 2019, 5:29 p.m. UTC
Make these helpers suitable for use with tcg_gen_gvec_* functions.

Signed-off-by: Jan Bobek <jan.bobek@gmail.com>
---
 target/i386/ops_sse.h        | 141 ++++++++++++++++++++++++-----------
 target/i386/ops_sse_header.h |  12 +--
 target/i386/translate.c      |  34 ++++-----
 3 files changed, 119 insertions(+), 68 deletions(-)
diff mbox series

Patch

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 8172324e34..2e50d91a25 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -551,70 +551,123 @@  void glue(helper_maskmov, SUFFIX)(CPUX86State *env, Reg *a, Reg *b,
 }
 
 #if SHIFT == 0
-void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order)
+void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *a, uint32_t desc)
 {
-    Reg r;
+    const intptr_t oprsz = simd_oprsz(desc);
+    const intptr_t maxsz = simd_maxsz(desc);
+    const uint8_t ctrl = simd_data(desc);
 
-    r.W(0) = s->W(order & 3);
-    r.W(1) = s->W((order >> 2) & 3);
-    r.W(2) = s->W((order >> 4) & 3);
-    r.W(3) = s->W((order >> 6) & 3);
-    *d = r;
+    for (intptr_t i = 0; 4 * i * sizeof(uint16_t) < oprsz; ++i) {
+        const uint16_t t0 = a->W(4 * i + ((ctrl >> 0) & 3));
+        const uint16_t t1 = a->W(4 * i + ((ctrl >> 2) & 3));
+        const uint16_t t2 = a->W(4 * i + ((ctrl >> 4) & 3));
+        const uint16_t t3 = a->W(4 * i + ((ctrl >> 6) & 3));
+
+        d->W(4 * i + 0) = t0;
+        d->W(4 * i + 1) = t1;
+        d->W(4 * i + 2) = t2;
+        d->W(4 * i + 3) = t3;
+    }
+    glue(clear_high, SUFFIX)(d, oprsz, maxsz);
 }
 #else
-void helper_shufps(Reg *d, Reg *s, int order)
+void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *a, uint32_t desc)
 {
-    Reg r;
+    const intptr_t oprsz = simd_oprsz(desc);
+    const intptr_t maxsz = simd_maxsz(desc);
+    const uint8_t ctrl = simd_data(desc);
 
-    r.L(0) = d->L(order & 3);
-    r.L(1) = d->L((order >> 2) & 3);
-    r.L(2) = s->L((order >> 4) & 3);
-    r.L(3) = s->L((order >> 6) & 3);
-    *d = r;
+    for (intptr_t i = 0; 8 * i * sizeof(uint16_t) < oprsz; ++i) {
+        const uint16_t t0 = a->W(8 * i + ((ctrl >> 0) & 3));
+        const uint16_t t1 = a->W(8 * i + ((ctrl >> 2) & 3));
+        const uint16_t t2 = a->W(8 * i + ((ctrl >> 4) & 3));
+        const uint16_t t3 = a->W(8 * i + ((ctrl >> 6) & 3));
+
+        d->W(8 * i + 0) = t0;
+        d->W(8 * i + 1) = t1;
+        d->W(8 * i + 2) = t2;
+        d->W(8 * i + 3) = t3;
+        d->Q(2 * i + 1) = a->Q(2 * i + 1);
+    }
+    glue(clear_high, SUFFIX)(d, oprsz, maxsz);
 }
 
-void helper_shufpd(Reg *d, Reg *s, int order)
+void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *a, uint32_t desc)
 {
-    Reg r;
+    const intptr_t oprsz = simd_oprsz(desc);
+    const intptr_t maxsz = simd_maxsz(desc);
+    const uint8_t ctrl = simd_data(desc);
+
+    for (intptr_t i = 0; 8 * i * sizeof(uint16_t) < oprsz; ++i) {
+        const uint16_t t0 = a->W(8 * i + 4 + ((ctrl >> 0) & 3));
+        const uint16_t t1 = a->W(8 * i + 4 + ((ctrl >> 2) & 3));
+        const uint16_t t2 = a->W(8 * i + 4 + ((ctrl >> 4) & 3));
+        const uint16_t t3 = a->W(8 * i + 4 + ((ctrl >> 6) & 3));
 
-    r.Q(0) = d->Q(order & 1);
-    r.Q(1) = s->Q((order >> 1) & 1);
-    *d = r;
+        d->Q(2 * i + 0) = a->Q(2 * i + 0);
+        d->W(8 * i + 4) = t0;
+        d->W(8 * i + 5) = t1;
+        d->W(8 * i + 6) = t2;
+        d->W(8 * i + 7) = t3;
+    }
+    glue(clear_high, SUFFIX)(d, oprsz, maxsz);
 }
 
-void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order)
+void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *a, uint32_t desc)
 {
-    Reg r;
+    const intptr_t oprsz = simd_oprsz(desc);
+    const intptr_t maxsz = simd_maxsz(desc);
+    const uint8_t ctrl = simd_data(desc);
+
+    for (intptr_t i = 0; 4 * i * sizeof(uint32_t) < oprsz; ++i) {
+        const uint32_t t0 = a->L(4 * i + ((ctrl >> 0) & 3));
+        const uint32_t t1 = a->L(4 * i + ((ctrl >> 2) & 3));
+        const uint32_t t2 = a->L(4 * i + ((ctrl >> 4) & 3));
+        const uint32_t t3 = a->L(4 * i + ((ctrl >> 6) & 3));
+
+        d->L(4 * i + 0) = t0;
+        d->L(4 * i + 1) = t1;
+        d->L(4 * i + 2) = t2;
+        d->L(4 * i + 3) = t3;
 
-    r.L(0) = s->L(order & 3);
-    r.L(1) = s->L((order >> 2) & 3);
-    r.L(2) = s->L((order >> 4) & 3);
-    r.L(3) = s->L((order >> 6) & 3);
-    *d = r;
+    }
+    glue(clear_high, SUFFIX)(d, oprsz, maxsz);
 }
 
-void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order)
+void glue(helper_shufps, SUFFIX)(Reg *d, Reg *a, Reg *b, uint32_t desc)
 {
-    Reg r;
-
-    r.W(0) = s->W(order & 3);
-    r.W(1) = s->W((order >> 2) & 3);
-    r.W(2) = s->W((order >> 4) & 3);
-    r.W(3) = s->W((order >> 6) & 3);
-    r.Q(1) = s->Q(1);
-    *d = r;
+    const intptr_t oprsz = simd_oprsz(desc);
+    const intptr_t maxsz = simd_maxsz(desc);
+    const uint8_t ctrl = simd_data(desc);
+
+    for (intptr_t i = 0; 4 * i * sizeof(uint32_t) < oprsz; ++i) {
+        const uint32_t t0 = a->L(4 * i + ((ctrl >> 0) & 3));
+        const uint32_t t1 = a->L(4 * i + ((ctrl >> 2) & 3));
+        const uint32_t t2 = b->L(4 * i + ((ctrl >> 4) & 3));
+        const uint32_t t3 = b->L(4 * i + ((ctrl >> 6) & 3));
+
+        d->W(4 * i + 0) = t0;
+        d->W(4 * i + 1) = t1;
+        d->W(4 * i + 2) = t2;
+        d->W(4 * i + 3) = t3;
+    }
+    glue(clear_high, SUFFIX)(d, oprsz, maxsz);
 }
 
-void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
+void glue(helper_shufpd, SUFFIX)(Reg *d, Reg *a, Reg *b, uint32_t desc)
 {
-    Reg r;
-
-    r.Q(0) = s->Q(0);
-    r.W(4) = s->W(4 + (order & 3));
-    r.W(5) = s->W(4 + ((order >> 2) & 3));
-    r.W(6) = s->W(4 + ((order >> 4) & 3));
-    r.W(7) = s->W(4 + ((order >> 6) & 3));
-    *d = r;
+    const intptr_t oprsz = simd_oprsz(desc);
+    const intptr_t maxsz = simd_maxsz(desc);
+    const uint8_t ctrl = simd_data(desc);
+
+    for (intptr_t i = 0; 2 * i * sizeof(uint64_t) < oprsz; ++i) {
+        const uint64_t t0 = a->Q(2 * i + ((ctrl >> 0) & 1));
+        const uint64_t t1 = b->Q(2 * i + ((ctrl >> 1) & 1));
+
+        d->Q(2 * i + 0) = t0;
+        d->Q(2 * i + 1) = t1;
+    }
+    glue(clear_high, SUFFIX)(d, oprsz, maxsz);
 }
 #endif
 
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index ee8bd4c1af..207d41e248 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -78,13 +78,13 @@  DEF_HELPER_4(glue(psadbw, SUFFIX), void, Reg, Reg, Reg, i32)
 DEF_HELPER_4(glue(maskmov, SUFFIX), void, env, Reg, Reg, tl)
 
 #if SHIFT == 0
-DEF_HELPER_3(glue(pshufw, SUFFIX), void, Reg, Reg, int)
+DEF_HELPER_3(glue(pshufw, SUFFIX), void, Reg, Reg, i32)
 #else
-DEF_HELPER_3(shufps, void, Reg, Reg, int)
-DEF_HELPER_3(shufpd, void, Reg, Reg, int)
-DEF_HELPER_3(glue(pshufd, SUFFIX), void, Reg, Reg, int)
-DEF_HELPER_3(glue(pshuflw, SUFFIX), void, Reg, Reg, int)
-DEF_HELPER_3(glue(pshufhw, SUFFIX), void, Reg, Reg, int)
+DEF_HELPER_3(glue(pshuflw, SUFFIX), void, Reg, Reg, i32)
+DEF_HELPER_3(glue(pshufhw, SUFFIX), void, Reg, Reg, i32)
+DEF_HELPER_3(glue(pshufd, SUFFIX), void, Reg, Reg, i32)
+DEF_HELPER_4(glue(shufps, SUFFIX), void, Reg, Reg, Reg, i32)
+DEF_HELPER_4(glue(shufpd, SUFFIX), void, Reg, Reg, Reg, i32)
 #endif
 
 #if SHIFT == 1
diff --git a/target/i386/translate.c b/target/i386/translate.c
index 3554086336..bb4120a848 100644
--- a/target/i386/translate.c
+++ b/target/i386/translate.c
@@ -2763,8 +2763,6 @@  static const SSEFunc_0_epp sse_op_table1[256][4] = {
     [0x5b] = { gen_helper_cvtdq2ps, gen_helper_cvtps2dq, gen_helper_cvttps2dq },
 
     [0xc2] = SSE_FOP(cmpeq),
-    [0xc6] = { (SSEFunc_0_epp)gen_helper_shufps,
-               (SSEFunc_0_epp)gen_helper_shufpd }, /* XXX: casts */
 
     /* SSSE3, SSE4, MOVBE, CRC32, BMI1, BMI2, ADX.  */
     [0x38] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL },
@@ -6971,22 +6969,22 @@  DEF_GEN_INSN3_HELPER_EPP(pshufb, pshufb_mmx, Pq, Pq, Qq)
 DEF_GEN_INSN3_HELPER_EPP(pshufb, pshufb_xmm, Vdq, Vdq, Wdq)
 DEF_GEN_INSN3_HELPER_EPP(vpshufb, pshufb_xmm, Vdq, Hdq, Wdq)
 DEF_GEN_INSN3_HELPER_EPP(vpshufb, pshufb_xmm, Vqq, Hqq, Wqq)
-DEF_GEN_INSN3_HELPER_PPI(pshufw, pshufw_mmx, Pq, Qq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(pshuflw, pshuflw_xmm, Vdq, Wdq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(vpshuflw, pshuflw_xmm, Vdq, Wdq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(vpshuflw, pshuflw_xmm, Vqq, Wqq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(pshufhw, pshufhw_xmm, Vdq, Wdq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(vpshufhw, pshufhw_xmm, Vdq, Wdq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(vpshufhw, pshufhw_xmm, Vqq, Wqq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(pshufd, pshufd_xmm, Vdq, Wdq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(vpshufd, pshufd_xmm, Vdq, Wdq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(vpshufd, pshufd_xmm, Vqq, Wqq, Ib)
-DEF_GEN_INSN4_HELPER_PPI(shufps, shufps, Vdq, Vdq, Wdq, Ib)
-DEF_GEN_INSN4_HELPER_PPI(vshufps, shufps, Vdq, Hdq, Wdq, Ib)
-DEF_GEN_INSN4_HELPER_PPI(vshufps, shufps, Vqq, Hqq, Wqq, Ib)
-DEF_GEN_INSN4_HELPER_PPI(shufpd, shufpd, Vdq, Vdq, Wdq, Ib)
-DEF_GEN_INSN4_HELPER_PPI(vshufpd, shufpd, Vdq, Hdq, Wdq, Ib)
-DEF_GEN_INSN4_HELPER_PPI(vshufpd, shufpd, Vqq, Hqq, Wqq, Ib)
+DEF_GEN_INSN3_GVEC(pshufw, Pq, Qq, Ib, 2i_ool, MM_OPRSZ, MM_MAXSZ, pshufw_mmx)
+DEF_GEN_INSN3_GVEC(pshuflw, Vdq, Wdq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshuflw_xmm)
+DEF_GEN_INSN3_GVEC(vpshuflw, Vdq, Wdq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshuflw_xmm)
+DEF_GEN_INSN3_GVEC(vpshuflw, Vqq, Wqq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshuflw_xmm)
+DEF_GEN_INSN3_GVEC(pshufhw, Vdq, Wdq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshufhw_xmm)
+DEF_GEN_INSN3_GVEC(vpshufhw, Vdq, Wdq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshufhw_xmm)
+DEF_GEN_INSN3_GVEC(vpshufhw, Vqq, Wqq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshufhw_xmm)
+DEF_GEN_INSN3_GVEC(pshufd, Vdq, Wdq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshufd_xmm)
+DEF_GEN_INSN3_GVEC(vpshufd, Vdq, Wdq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshufd_xmm)
+DEF_GEN_INSN3_GVEC(vpshufd, Vqq, Wqq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshufd_xmm)
+DEF_GEN_INSN4_GVEC(shufps, Vdq, Vdq, Wdq, Ib, 3i_ool, XMM_OPRSZ, XMM_MAXSZ, shufps_xmm)
+DEF_GEN_INSN4_GVEC(vshufps, Vdq, Hdq, Wdq, Ib, 3i_ool, XMM_OPRSZ, XMM_MAXSZ, shufps_xmm)
+DEF_GEN_INSN4_GVEC(vshufps, Vqq, Hqq, Wqq, Ib, 3i_ool, XMM_OPRSZ, XMM_MAXSZ, shufps_xmm)
+DEF_GEN_INSN4_GVEC(shufpd, Vdq, Vdq, Wdq, Ib, 3i_ool, XMM_OPRSZ, XMM_MAXSZ, shufpd_xmm)
+DEF_GEN_INSN4_GVEC(vshufpd, Vdq, Hdq, Wdq, Ib, 3i_ool, XMM_OPRSZ, XMM_MAXSZ, shufpd_xmm)
+DEF_GEN_INSN4_GVEC(vshufpd, Vqq, Hqq, Wqq, Ib, 3i_ool, XMM_OPRSZ, XMM_MAXSZ, shufpd_xmm)
 
 DEF_GEN_INSN4_HELPER_EPPI(blendps, blendps_xmm, Vdq, Vdq, Wdq, Ib)
 DEF_GEN_INSN4_HELPER_EPPI(vblendps, blendps_xmm, Vdq, Hdq, Wdq, Ib)