@@ -551,70 +551,123 @@ void glue(helper_maskmov, SUFFIX)(CPUX86State *env, Reg *a, Reg *b,
}
#if SHIFT == 0
-void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order)
+void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *a, uint32_t desc) /* Shuffle the W elements of a into d, four at a time, as selected by the control byte packed in desc. */
{
- Reg r;
+ const intptr_t oprsz = simd_oprsz(desc); /* byte count that bounds the loop below */
+ const intptr_t maxsz = simd_maxsz(desc); /* only forwarded to clear_high */
+ const uint8_t ctrl = simd_data(desc); /* four 2-bit source indices, lowest field selects destination element 0 */
- r.W(0) = s->W(order & 3);
- r.W(1) = s->W((order >> 2) & 3);
- r.W(2) = s->W((order >> 4) & 3);
- r.W(3) = s->W((order >> 6) & 3);
- *d = r;
+ for (intptr_t i = 0; 4 * i * sizeof(uint16_t) < oprsz; ++i) { /* one iteration per group of four W elements */
+ const uint16_t t0 = a->W(4 * i + ((ctrl >> 0) & 3)); /* all four sources are read before any store, so the old temporary Reg is not needed */
+ const uint16_t t1 = a->W(4 * i + ((ctrl >> 2) & 3));
+ const uint16_t t2 = a->W(4 * i + ((ctrl >> 4) & 3));
+ const uint16_t t3 = a->W(4 * i + ((ctrl >> 6) & 3));
+
+ d->W(4 * i + 0) = t0;
+ d->W(4 * i + 1) = t1;
+ d->W(4 * i + 2) = t2;
+ d->W(4 * i + 3) = t3;
+ }
+ glue(clear_high, SUFFIX)(d, oprsz, maxsz); /* NOTE(review): presumably clears d between oprsz and maxsz; clear_high is defined elsewhere - confirm */
}
#else
-void helper_shufps(Reg *d, Reg *s, int order)
+void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *a, uint32_t desc) /* Per group of eight W elements: shuffle the low four by the control byte in desc, copy the high half unchanged. */
{
- Reg r;
+ const intptr_t oprsz = simd_oprsz(desc); /* byte count that bounds the loop below */
+ const intptr_t maxsz = simd_maxsz(desc); /* only forwarded to clear_high */
+ const uint8_t ctrl = simd_data(desc); /* four 2-bit source indices, lowest field selects destination element 0 */
- r.L(0) = d->L(order & 3);
- r.L(1) = d->L((order >> 2) & 3);
- r.L(2) = s->L((order >> 4) & 3);
- r.L(3) = s->L((order >> 6) & 3);
- *d = r;
+ for (intptr_t i = 0; 8 * i * sizeof(uint16_t) < oprsz; ++i) { /* one iteration per group of eight W elements */
+ const uint16_t t0 = a->W(8 * i + ((ctrl >> 0) & 3)); /* sources come only from W elements 0..3 of the group; all are read before any store */
+ const uint16_t t1 = a->W(8 * i + ((ctrl >> 2) & 3));
+ const uint16_t t2 = a->W(8 * i + ((ctrl >> 4) & 3));
+ const uint16_t t3 = a->W(8 * i + ((ctrl >> 6) & 3));
+
+ d->W(8 * i + 0) = t0;
+ d->W(8 * i + 1) = t1;
+ d->W(8 * i + 2) = t2;
+ d->W(8 * i + 3) = t3;
+ d->Q(2 * i + 1) = a->Q(2 * i + 1); /* odd Q element of the group is passed through from a */
+ }
+ glue(clear_high, SUFFIX)(d, oprsz, maxsz); /* NOTE(review): presumably clears d between oprsz and maxsz; clear_high is defined elsewhere - confirm */
}
-void helper_shufpd(Reg *d, Reg *s, int order)
+void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *a, uint32_t desc) /* Per group of eight W elements: copy the low half unchanged, shuffle the high four by the control byte in desc. */
{
- Reg r;
+ const intptr_t oprsz = simd_oprsz(desc); /* byte count that bounds the loop below */
+ const intptr_t maxsz = simd_maxsz(desc); /* only forwarded to clear_high */
+ const uint8_t ctrl = simd_data(desc); /* four 2-bit source indices, lowest field selects destination element 4 of the group */
+
+ for (intptr_t i = 0; 8 * i * sizeof(uint16_t) < oprsz; ++i) { /* one iteration per group of eight W elements */
+ const uint16_t t0 = a->W(8 * i + 4 + ((ctrl >> 0) & 3)); /* sources come only from W elements 4..7 of the group; all are read before any store */
+ const uint16_t t1 = a->W(8 * i + 4 + ((ctrl >> 2) & 3));
+ const uint16_t t2 = a->W(8 * i + 4 + ((ctrl >> 4) & 3));
+ const uint16_t t3 = a->W(8 * i + 4 + ((ctrl >> 6) & 3));
- r.Q(0) = d->Q(order & 1);
- r.Q(1) = s->Q((order >> 1) & 1);
- *d = r;
+ d->Q(2 * i + 0) = a->Q(2 * i + 0); /* even Q element of the group is passed through from a */
+ d->W(8 * i + 4) = t0;
+ d->W(8 * i + 5) = t1;
+ d->W(8 * i + 6) = t2;
+ d->W(8 * i + 7) = t3;
+ }
+ glue(clear_high, SUFFIX)(d, oprsz, maxsz); /* NOTE(review): presumably clears d between oprsz and maxsz; clear_high is defined elsewhere - confirm */
}
-void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order)
+void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *a, uint32_t desc) /* Shuffle the L elements of a into d, four at a time, as selected by the control byte packed in desc. */
{
- Reg r;
+ const intptr_t oprsz = simd_oprsz(desc); /* byte count that bounds the loop below */
+ const intptr_t maxsz = simd_maxsz(desc); /* only forwarded to clear_high */
+ const uint8_t ctrl = simd_data(desc); /* four 2-bit source indices, lowest field selects destination element 0 */
+
+ for (intptr_t i = 0; 4 * i * sizeof(uint32_t) < oprsz; ++i) { /* one iteration per group of four L elements */
+ const uint32_t t0 = a->L(4 * i + ((ctrl >> 0) & 3)); /* all four sources are read before any store, so the old temporary Reg is not needed */
+ const uint32_t t1 = a->L(4 * i + ((ctrl >> 2) & 3));
+ const uint32_t t2 = a->L(4 * i + ((ctrl >> 4) & 3));
+ const uint32_t t3 = a->L(4 * i + ((ctrl >> 6) & 3));
+
+ d->L(4 * i + 0) = t0;
+ d->L(4 * i + 1) = t1;
+ d->L(4 * i + 2) = t2;
+ d->L(4 * i + 3) = t3;
- r.L(0) = s->L(order & 3);
- r.L(1) = s->L((order >> 2) & 3);
- r.L(2) = s->L((order >> 4) & 3);
- r.L(3) = s->L((order >> 6) & 3);
- *d = r;
+ }
+ glue(clear_high, SUFFIX)(d, oprsz, maxsz); /* NOTE(review): presumably clears d between oprsz and maxsz; clear_high is defined elsewhere - confirm */
}
-void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order)
+void glue(helper_shufps, SUFFIX)(Reg *d, Reg *a, Reg *b, uint32_t desc) /* Per group of four L elements: d[0..1] are picked from a and d[2..3] from b, by the control byte packed in desc. */
{
- Reg r;
-
- r.W(0) = s->W(order & 3);
- r.W(1) = s->W((order >> 2) & 3);
- r.W(2) = s->W((order >> 4) & 3);
- r.W(3) = s->W((order >> 6) & 3);
- r.Q(1) = s->Q(1);
- *d = r;
+ const intptr_t oprsz = simd_oprsz(desc); /* byte count that bounds the loop below */
+ const intptr_t maxsz = simd_maxsz(desc); /* only forwarded to clear_high */
+ const uint8_t ctrl = simd_data(desc); /* four 2-bit source indices, lowest field selects destination element 0 */
+
+ for (intptr_t i = 0; 4 * i * sizeof(uint32_t) < oprsz; ++i) { /* one iteration per group of four L elements */
+ const uint32_t t0 = a->L(4 * i + ((ctrl >> 0) & 3)); /* all four sources are read before any store, so d may be the same register as a or b */
+ const uint32_t t1 = a->L(4 * i + ((ctrl >> 2) & 3));
+ const uint32_t t2 = b->L(4 * i + ((ctrl >> 4) & 3));
+ const uint32_t t3 = b->L(4 * i + ((ctrl >> 6) & 3));
+
+ d->L(4 * i + 0) = t0; /* store through L, matching the 32-bit loads; W would truncate the value and hit the wrong element */
+ d->L(4 * i + 1) = t1;
+ d->L(4 * i + 2) = t2;
+ d->L(4 * i + 3) = t3;
+ }
+ glue(clear_high, SUFFIX)(d, oprsz, maxsz); /* NOTE(review): presumably clears d between oprsz and maxsz; clear_high is defined elsewhere - confirm */
}
-void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
+void glue(helper_shufpd, SUFFIX)(Reg *d, Reg *a, Reg *b, uint32_t desc) /* Per pair of Q elements: d[0] is picked from a and d[1] from b, by the control byte packed in desc. */
{
- Reg r;
-
- r.Q(0) = s->Q(0);
- r.W(4) = s->W(4 + (order & 3));
- r.W(5) = s->W(4 + ((order >> 2) & 3));
- r.W(6) = s->W(4 + ((order >> 4) & 3));
- r.W(7) = s->W(4 + ((order >> 6) & 3));
- *d = r;
+ const intptr_t oprsz = simd_oprsz(desc); /* byte count that bounds the loop below */
+ const intptr_t maxsz = simd_maxsz(desc); /* only forwarded to clear_high */
+ const uint8_t ctrl = simd_data(desc); /* only bits 0 and 1 are consulted, and the same two bits are reused for every pair */
+
+ for (intptr_t i = 0; 2 * i * sizeof(uint64_t) < oprsz; ++i) { /* one iteration per pair of Q elements */
+ const uint64_t t0 = a->Q(2 * i + ((ctrl >> 0) & 1)); /* both sources are read before any store */
+ const uint64_t t1 = b->Q(2 * i + ((ctrl >> 1) & 1));
+
+ d->Q(2 * i + 0) = t0;
+ d->Q(2 * i + 1) = t1;
+ }
+ glue(clear_high, SUFFIX)(d, oprsz, maxsz); /* NOTE(review): presumably clears d between oprsz and maxsz; clear_high is defined elsewhere - confirm */
}
#endif
@@ -78,13 +78,13 @@ DEF_HELPER_4(glue(psadbw, SUFFIX), void, Reg, Reg, Reg, i32)
DEF_HELPER_4(glue(maskmov, SUFFIX), void, env, Reg, Reg, tl)
#if SHIFT == 0
-DEF_HELPER_3(glue(pshufw, SUFFIX), void, Reg, Reg, int)
+DEF_HELPER_3(glue(pshufw, SUFFIX), void, Reg, Reg, i32) /* last argument changes from int to i32 to carry the descriptor word */
#else
-DEF_HELPER_3(shufps, void, Reg, Reg, int)
-DEF_HELPER_3(shufpd, void, Reg, Reg, int)
-DEF_HELPER_3(glue(pshufd, SUFFIX), void, Reg, Reg, int)
-DEF_HELPER_3(glue(pshuflw, SUFFIX), void, Reg, Reg, int)
-DEF_HELPER_3(glue(pshufhw, SUFFIX), void, Reg, Reg, int)
+DEF_HELPER_3(glue(pshuflw, SUFFIX), void, Reg, Reg, i32)
+DEF_HELPER_3(glue(pshufhw, SUFFIX), void, Reg, Reg, i32)
+DEF_HELPER_3(glue(pshufd, SUFFIX), void, Reg, Reg, i32)
+DEF_HELPER_4(glue(shufps, SUFFIX), void, Reg, Reg, Reg, i32) /* now SUFFIX-glued, with separate destination and two source Reg operands */
+DEF_HELPER_4(glue(shufpd, SUFFIX), void, Reg, Reg, Reg, i32) /* same argument shape as shufps */
#endif
#if SHIFT == 1
@@ -2763,8 +2763,6 @@ static const SSEFunc_0_epp sse_op_table1[256][4] = {
[0x5b] = { gen_helper_cvtdq2ps, gen_helper_cvtps2dq, gen_helper_cvttps2dq },
[0xc2] = SSE_FOP(cmpeq),
- [0xc6] = { (SSEFunc_0_epp)gen_helper_shufps,
- (SSEFunc_0_epp)gen_helper_shufpd }, /* XXX: casts */
/* SSSE3, SSE4, MOVBE, CRC32, BMI1, BMI2, ADX. */
[0x38] = { SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL, SSE_SPECIAL },
@@ -6971,22 +6969,22 @@ DEF_GEN_INSN3_HELPER_EPP(pshufb, pshufb_mmx, Pq, Pq, Qq)
DEF_GEN_INSN3_HELPER_EPP(pshufb, pshufb_xmm, Vdq, Vdq, Wdq)
DEF_GEN_INSN3_HELPER_EPP(vpshufb, pshufb_xmm, Vdq, Hdq, Wdq)
DEF_GEN_INSN3_HELPER_EPP(vpshufb, pshufb_xmm, Vqq, Hqq, Wqq)
-DEF_GEN_INSN3_HELPER_PPI(pshufw, pshufw_mmx, Pq, Qq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(pshuflw, pshuflw_xmm, Vdq, Wdq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(vpshuflw, pshuflw_xmm, Vdq, Wdq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(vpshuflw, pshuflw_xmm, Vqq, Wqq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(pshufhw, pshufhw_xmm, Vdq, Wdq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(vpshufhw, pshufhw_xmm, Vdq, Wdq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(vpshufhw, pshufhw_xmm, Vqq, Wqq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(pshufd, pshufd_xmm, Vdq, Wdq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(vpshufd, pshufd_xmm, Vdq, Wdq, Ib)
-DEF_GEN_INSN3_HELPER_PPI(vpshufd, pshufd_xmm, Vqq, Wqq, Ib)
-DEF_GEN_INSN4_HELPER_PPI(shufps, shufps, Vdq, Vdq, Wdq, Ib)
-DEF_GEN_INSN4_HELPER_PPI(vshufps, shufps, Vdq, Hdq, Wdq, Ib)
-DEF_GEN_INSN4_HELPER_PPI(vshufps, shufps, Vqq, Hqq, Wqq, Ib)
-DEF_GEN_INSN4_HELPER_PPI(shufpd, shufpd, Vdq, Vdq, Wdq, Ib)
-DEF_GEN_INSN4_HELPER_PPI(vshufpd, shufpd, Vdq, Hdq, Wdq, Ib)
-DEF_GEN_INSN4_HELPER_PPI(vshufpd, shufpd, Vqq, Hqq, Wqq, Ib)
+DEF_GEN_INSN3_GVEC(pshufw, Pq, Qq, Ib, 2i_ool, MM_OPRSZ, MM_MAXSZ, pshufw_mmx) /* one-source shuffles move from the HELPER_PPI form to the GVEC form with explicit operation and maximum sizes */
+DEF_GEN_INSN3_GVEC(pshuflw, Vdq, Wdq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshuflw_xmm)
+DEF_GEN_INSN3_GVEC(vpshuflw, Vdq, Wdq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshuflw_xmm)
+DEF_GEN_INSN3_GVEC(vpshuflw, Vqq, Wqq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshuflw_xmm) /* NOTE(review): this and the other Vqq/Wqq forms below pass the same XMM_OPRSZ/XMM_MAXSZ as the Vdq forms - confirm that is intended for the wider operand */
+DEF_GEN_INSN3_GVEC(pshufhw, Vdq, Wdq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshufhw_xmm)
+DEF_GEN_INSN3_GVEC(vpshufhw, Vdq, Wdq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshufhw_xmm)
+DEF_GEN_INSN3_GVEC(vpshufhw, Vqq, Wqq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshufhw_xmm)
+DEF_GEN_INSN3_GVEC(pshufd, Vdq, Wdq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshufd_xmm)
+DEF_GEN_INSN3_GVEC(vpshufd, Vdq, Wdq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshufd_xmm)
+DEF_GEN_INSN3_GVEC(vpshufd, Vqq, Wqq, Ib, 2i_ool, XMM_OPRSZ, XMM_MAXSZ, pshufd_xmm)
+DEF_GEN_INSN4_GVEC(shufps, Vdq, Vdq, Wdq, Ib, 3i_ool, XMM_OPRSZ, XMM_MAXSZ, shufps_xmm) /* two-source shuffles use the 3i_ool form and the SUFFIX-glued helper names shufps_xmm / shufpd_xmm */
+DEF_GEN_INSN4_GVEC(vshufps, Vdq, Hdq, Wdq, Ib, 3i_ool, XMM_OPRSZ, XMM_MAXSZ, shufps_xmm)
+DEF_GEN_INSN4_GVEC(vshufps, Vqq, Hqq, Wqq, Ib, 3i_ool, XMM_OPRSZ, XMM_MAXSZ, shufps_xmm)
+DEF_GEN_INSN4_GVEC(shufpd, Vdq, Vdq, Wdq, Ib, 3i_ool, XMM_OPRSZ, XMM_MAXSZ, shufpd_xmm)
+DEF_GEN_INSN4_GVEC(vshufpd, Vdq, Hdq, Wdq, Ib, 3i_ool, XMM_OPRSZ, XMM_MAXSZ, shufpd_xmm)
+DEF_GEN_INSN4_GVEC(vshufpd, Vqq, Hqq, Wqq, Ib, 3i_ool, XMM_OPRSZ, XMM_MAXSZ, shufpd_xmm)
DEF_GEN_INSN4_HELPER_EPPI(blendps, blendps_xmm, Vdq, Vdq, Wdq, Ib)
DEF_GEN_INSN4_HELPER_EPPI(vblendps, blendps_xmm, Vdq, Hdq, Wdq, Ib)
Make these helpers suitable for use with tcg_gen_gvec_* functions.

Signed-off-by: Jan Bobek <jan.bobek@gmail.com>
---
 target/i386/ops_sse.h        | 141 ++++++++++++++++++++++++-----------
 target/i386/ops_sse_header.h |  12 +--
 target/i386/translate.c      |  34 ++++-----
 3 files changed, 119 insertions(+), 68 deletions(-)