diff mbox series

[RFC,v4,75/75] target/i386: convert pmovmskb/movmskps/movmskpd helpers to gvec style

Message ID 20190821172951.15333-76-jan.bobek@gmail.com (mailing list archive)
State New, archived
Headers show
Series rewrite MMX/SSE*/AVX/AVX2 vector instruction translation | expand

Commit Message

Jan Bobek Aug. 21, 2019, 5:29 p.m. UTC
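Make these helpers suitable for use with tcg_gen_gvec_* functions: drop
the CPUX86State argument, take a gvec descriptor (desc) instead, and
bound the per-element loop by simd_oprsz(desc). Each helper is provided
in two variants, one returning a 32-bit result (...d) and one returning
a 64-bit result (...q).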

Signed-off-by: Jan Bobek <jan.bobek@gmail.com>
---
 target/i386/ops_sse.h        |  74 ++++++++++----------
 target/i386/ops_sse_header.h |   9 ++-
 target/i386/translate.c      | 132 ++++++-----------------------------
 3 files changed, 65 insertions(+), 150 deletions(-)

Comments

Richard Henderson Aug. 21, 2019, 11:53 p.m. UTC | #1
On 8/21/19 10:29 AM, Jan Bobek wrote:
> +    for (intptr_t i = 0; i * sizeof(uint8_t) < oprsz; ++i) {
> +        const uint8_t t = a->B(i) & (1 << 7);
> +        ret |= i < 8 ? t >> (7 - i) : t << (i - 7);

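You can avoid this direction-dependent shift (right for i < 8, left
otherwise) by first reducing the byte to its top bit and then always
shifting left by i: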

  uint32_t t = a->B(i) >> 7;
  ret |= t << i;

> +uint64_t glue(helper_pmovmskbq, SUFFIX)(Reg *a, uint32_t desc)
> +{
> +    return glue(helper_pmovmskbd, SUFFIX)(a, desc);
>  }
...
> +DEF_GEN_INSN2_GVEC(vpmovmskb, Gd, Uqq, sd1_ool, XMM_OPRSZ, XMM_MAXSZ, pmovmskbd_xmm)
> +DEF_GEN_INSN2_GVEC(vpmovmskb, Gq, Uqq, sq1_ool, XMM_OPRSZ, XMM_MAXSZ, pmovmskbq_xmm)

What is the difference between these two?

Given that we aren't attempting AVX-512, uint32_t is sufficient for all of the
bytes of a YMM register (32 bytes, so the mask never needs more than 32 bits).

I have a feeling that some of this should simply use target_ulong, so that a
direct assignment to the general register can be done without extra extensions
within the generated code.


r~
diff mbox series

Patch

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 2e50d91a25..82562c9473 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -1169,52 +1169,56 @@  void helper_comisd(CPUX86State *env, Reg *d, Reg *s)
     CC_SRC = comis_eflags[ret + 1];
 }
 
-uint32_t helper_movmskps(CPUX86State *env, Reg *s)
+uint32_t helper_movmskpsd(Reg *a, uint32_t desc)
 {
-    int b0, b1, b2, b3;
+    const intptr_t oprsz = simd_oprsz(desc);
 
-    b0 = s->ZMM_L(0) >> 31;
-    b1 = s->ZMM_L(1) >> 31;
-    b2 = s->ZMM_L(2) >> 31;
-    b3 = s->ZMM_L(3) >> 31;
-    return b0 | (b1 << 1) | (b2 << 2) | (b3 << 3);
+    uint32_t ret = 0;
+    for (intptr_t i = 0; i * sizeof(uint32_t) < oprsz; ++i) {
+        const uint32_t t = a->ZMM_L(i) & (1UL << 31);
+        ret |= t >> (31 - i);
+    }
+    return ret;
 }
 
-uint32_t helper_movmskpd(CPUX86State *env, Reg *s)
+uint64_t helper_movmskpsq(Reg *a, uint32_t desc)
 {
-    int b0, b1;
+    return helper_movmskpsd(a, desc);
+}
+
+uint32_t helper_movmskpdd(Reg *a, uint32_t desc)
+{
+    const intptr_t oprsz = simd_oprsz(desc);
 
-    b0 = s->ZMM_L(1) >> 31;
-    b1 = s->ZMM_L(3) >> 31;
-    return b0 | (b1 << 1);
+    uint32_t ret = 0;
+    for (intptr_t i = 0; i * sizeof(uint64_t) < oprsz; ++i) {
+        const uint64_t t = a->ZMM_Q(i) & (1ULL << 63);
+        ret |= t >> (63 - i);
+    }
+    return ret;
 }
 
+uint64_t helper_movmskpdq(Reg *a, uint32_t desc)
+{
+    return helper_movmskpdd(a, desc);
+}
 #endif
 
-uint32_t glue(helper_pmovmskb, SUFFIX)(CPUX86State *env, Reg *s)
+uint32_t glue(helper_pmovmskbd, SUFFIX)(Reg *a, uint32_t desc)
 {
-    uint32_t val;
-
-    val = 0;
-    val |= (s->B(0) >> 7);
-    val |= (s->B(1) >> 6) & 0x02;
-    val |= (s->B(2) >> 5) & 0x04;
-    val |= (s->B(3) >> 4) & 0x08;
-    val |= (s->B(4) >> 3) & 0x10;
-    val |= (s->B(5) >> 2) & 0x20;
-    val |= (s->B(6) >> 1) & 0x40;
-    val |= (s->B(7)) & 0x80;
-#if SHIFT == 1
-    val |= (s->B(8) << 1) & 0x0100;
-    val |= (s->B(9) << 2) & 0x0200;
-    val |= (s->B(10) << 3) & 0x0400;
-    val |= (s->B(11) << 4) & 0x0800;
-    val |= (s->B(12) << 5) & 0x1000;
-    val |= (s->B(13) << 6) & 0x2000;
-    val |= (s->B(14) << 7) & 0x4000;
-    val |= (s->B(15) << 8) & 0x8000;
-#endif
-    return val;
+    const intptr_t oprsz = simd_oprsz(desc);
+
+    uint32_t ret = 0;
+    for (intptr_t i = 0; i * sizeof(uint8_t) < oprsz; ++i) {
+        const uint8_t t = a->B(i) & (1 << 7);
+        ret |= i < 8 ? t >> (7 - i) : t << (i - 7);
+    }
+    return ret;
+}
+
+uint64_t glue(helper_pmovmskbq, SUFFIX)(Reg *a, uint32_t desc)
+{
+    return glue(helper_pmovmskbd, SUFFIX)(a, desc);
 }
 
 void glue(helper_packsswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h
index 207d41e248..59ac1f28e3 100644
--- a/target/i386/ops_sse_header.h
+++ b/target/i386/ops_sse_header.h
@@ -178,11 +178,14 @@  DEF_HELPER_3(ucomiss, void, env, Reg, Reg)
 DEF_HELPER_3(comiss, void, env, Reg, Reg)
 DEF_HELPER_3(ucomisd, void, env, Reg, Reg)
 DEF_HELPER_3(comisd, void, env, Reg, Reg)
-DEF_HELPER_2(movmskps, i32, env, Reg)
-DEF_HELPER_2(movmskpd, i32, env, Reg)
+DEF_HELPER_2(movmskpsd, i32, Reg, i32)
+DEF_HELPER_2(movmskpsq, i64, Reg, i32)
+DEF_HELPER_2(movmskpdd, i32, Reg, i32)
+DEF_HELPER_2(movmskpdq, i64, Reg, i32)
 #endif
 
-DEF_HELPER_2(glue(pmovmskb, SUFFIX), i32, env, Reg)
+DEF_HELPER_2(glue(pmovmskbd, SUFFIX), i32, Reg, i32)
+DEF_HELPER_2(glue(pmovmskbq, SUFFIX), i64, Reg, i32)
 DEF_HELPER_3(glue(packsswb, SUFFIX), void, env, Reg, Reg)
 DEF_HELPER_3(glue(packuswb, SUFFIX), void, env, Reg, Reg)
 DEF_HELPER_3(glue(packssdw, SUFFIX), void, env, Reg, Reg)
diff --git a/target/i386/translate.c b/target/i386/translate.c
index bb4120a848..8f891b6e47 100644
--- a/target/i386/translate.c
+++ b/target/i386/translate.c
@@ -3339,20 +3339,6 @@  static void gen_sse(CPUX86State *env, DisasContext *s, int b)
                 goto illegal_op;
             }
             break;
-        case 0x050: /* movmskps */
-            rm = (modrm & 7) | REX_B(s);
-            tcg_gen_addi_ptr(s->ptr0, cpu_env,
-                             offsetof(CPUX86State,xmm_regs[rm]));
-            gen_helper_movmskps(s->tmp2_i32, cpu_env, s->ptr0);
-            tcg_gen_extu_i32_tl(cpu_regs[reg], s->tmp2_i32);
-            break;
-        case 0x150: /* movmskpd */
-            rm = (modrm & 7) | REX_B(s);
-            tcg_gen_addi_ptr(s->ptr0, cpu_env,
-                             offsetof(CPUX86State,xmm_regs[rm]));
-            gen_helper_movmskpd(s->tmp2_i32, cpu_env, s->ptr0);
-            tcg_gen_extu_i32_tl(cpu_regs[reg], s->tmp2_i32);
-            break;
         case 0x02a: /* cvtpi2ps */
         case 0x12a: /* cvtpi2pd */
             gen_helper_enter_mmx(cpu_env);
@@ -3524,24 +3510,6 @@  static void gen_sse(CPUX86State *env, DisasContext *s, int b)
             gen_op_movq(s, offsetof(CPUX86State, fpregs[reg & 7].mmx),
                         offsetof(CPUX86State,xmm_regs[rm].ZMM_Q(0)));
             break;
-        case 0xd7: /* pmovmskb */
-        case 0x1d7:
-            if (mod != 3)
-                goto illegal_op;
-            if (b1) {
-                rm = (modrm & 7) | REX_B(s);
-                tcg_gen_addi_ptr(s->ptr0, cpu_env,
-                                 offsetof(CPUX86State, xmm_regs[rm]));
-                gen_helper_pmovmskb_xmm(s->tmp2_i32, cpu_env, s->ptr0);
-            } else {
-                rm = (modrm & 7);
-                tcg_gen_addi_ptr(s->ptr0, cpu_env,
-                                 offsetof(CPUX86State, fpregs[rm].mmx));
-                gen_helper_pmovmskb_mmx(s->tmp2_i32, cpu_env, s->ptr0);
-            }
-            reg = ((modrm >> 3) & 7) | REX_R(s);
-            tcg_gen_extu_i32_tl(cpu_regs[reg], s->tmp2_i32);
-            break;
 
         case 0x138:
         case 0x038:
@@ -5773,88 +5741,28 @@  GEN_INSN2(vmovhpd, Mq, Vdq)
     gen_insn2(movhpd, Mq, Vdq)(env, s, arg1, arg2);
 }
 
-DEF_GEN_INSN2_HELPER_DEP(pmovmskb, pmovmskb_mmx, Gd, Nq)
-GEN_INSN2(pmovmskb, Gq, Nq)
-{
-    const TCGv_i32 arg1_r32 = tcg_temp_new_i32();
-    gen_insn2(pmovmskb, Gd, Nq)(env, s, arg1_r32, arg2);
-    tcg_gen_extu_i32_i64(arg1, arg1_r32);
-    tcg_temp_free_i32(arg1_r32);
-}
-DEF_GEN_INSN2_HELPER_DEP(pmovmskb, pmovmskb_xmm, Gd, Udq)
-GEN_INSN2(pmovmskb, Gq, Udq)
-{
-    const TCGv_i32 arg1_r32 = tcg_temp_new_i32();
-    gen_insn2(pmovmskb, Gd, Udq)(env, s, arg1_r32, arg2);
-    tcg_gen_extu_i32_i64(arg1, arg1_r32);
-    tcg_temp_free_i32(arg1_r32);
-}
-DEF_GEN_INSN2_HELPER_DEP(vpmovmskb, pmovmskb_xmm, Gd, Udq)
-GEN_INSN2(vpmovmskb, Gq, Udq)
-{
-    const TCGv_i32 arg1_r32 = tcg_temp_new_i32();
-    gen_insn2(vpmovmskb, Gd, Udq)(env, s, arg1_r32, arg2);
-    tcg_gen_extu_i32_i64(arg1, arg1_r32);
-    tcg_temp_free_i32(arg1_r32);
-}
-DEF_GEN_INSN2_HELPER_DEP(vpmovmskb, pmovmskb_xmm, Gd, Uqq)
-GEN_INSN2(vpmovmskb, Gq, Uqq)
-{
-    const TCGv_i32 arg1_r32 = tcg_temp_new_i32();
-    gen_insn2(vpmovmskb, Gd, Uqq)(env, s, arg1_r32, arg2);
-    tcg_gen_extu_i32_i64(arg1, arg1_r32);
-    tcg_temp_free_i32(arg1_r32);
-}
+DEF_GEN_INSN2_GVEC(pmovmskb, Gd, Nq, sd1_ool, MM_OPRSZ, MM_MAXSZ, pmovmskbd_mmx)
+DEF_GEN_INSN2_GVEC(pmovmskb, Gq, Nq, sq1_ool, MM_OPRSZ, MM_MAXSZ, pmovmskbq_mmx)
+DEF_GEN_INSN2_GVEC(pmovmskb, Gd, Udq, sd1_ool, XMM_OPRSZ, XMM_MAXSZ, pmovmskbd_xmm)
+DEF_GEN_INSN2_GVEC(pmovmskb, Gq, Udq, sq1_ool, XMM_OPRSZ, XMM_MAXSZ, pmovmskbq_xmm)
+DEF_GEN_INSN2_GVEC(vpmovmskb, Gd, Udq, sd1_ool, XMM_OPRSZ, XMM_MAXSZ, pmovmskbd_xmm)
+DEF_GEN_INSN2_GVEC(vpmovmskb, Gq, Udq, sq1_ool, XMM_OPRSZ, XMM_MAXSZ, pmovmskbq_xmm)
+DEF_GEN_INSN2_GVEC(vpmovmskb, Gd, Uqq, sd1_ool, XMM_OPRSZ, XMM_MAXSZ, pmovmskbd_xmm)
+DEF_GEN_INSN2_GVEC(vpmovmskb, Gq, Uqq, sq1_ool, XMM_OPRSZ, XMM_MAXSZ, pmovmskbq_xmm)
 
-DEF_GEN_INSN2_HELPER_DEP(movmskps, movmskps, Gd, Udq)
-GEN_INSN2(movmskps, Gq, Udq)
-{
-    const TCGv_i32 arg1_r32 = tcg_temp_new_i32();
-    gen_insn2(movmskps, Gd, Udq)(env, s, arg1_r32, arg2);
-    tcg_gen_extu_i32_i64(arg1, arg1_r32);
-    tcg_temp_free_i32(arg1_r32);
-}
-DEF_GEN_INSN2_HELPER_DEP(vmovmskps, movmskps, Gd, Udq)
-GEN_INSN2(vmovmskps, Gq, Udq)
-{
-    const TCGv_i32 arg1_r32 = tcg_temp_new_i32();
-    gen_insn2(vmovmskps, Gd, Udq)(env, s, arg1_r32, arg2);
-    tcg_gen_extu_i32_i64(arg1, arg1_r32);
-    tcg_temp_free_i32(arg1_r32);
-}
-DEF_GEN_INSN2_HELPER_DEP(vmovmskps, movmskps, Gd, Uqq)
-GEN_INSN2(vmovmskps, Gq, Uqq)
-{
-    const TCGv_i32 arg1_r32 = tcg_temp_new_i32();
-    gen_insn2(vmovmskps, Gd, Uqq)(env, s, arg1_r32, arg2);
-    tcg_gen_extu_i32_i64(arg1, arg1_r32);
-    tcg_temp_free_i32(arg1_r32);
-}
+DEF_GEN_INSN2_GVEC(movmskps, Gd, Udq, sd1_ool, XMM_OPRSZ, XMM_MAXSZ, movmskpsd)
+DEF_GEN_INSN2_GVEC(movmskps, Gq, Udq, sq1_ool, XMM_OPRSZ, XMM_MAXSZ, movmskpsq)
+DEF_GEN_INSN2_GVEC(vmovmskps, Gd, Udq, sd1_ool, XMM_OPRSZ, XMM_MAXSZ, movmskpsd)
+DEF_GEN_INSN2_GVEC(vmovmskps, Gq, Udq, sq1_ool, XMM_OPRSZ, XMM_MAXSZ, movmskpsq)
+DEF_GEN_INSN2_GVEC(vmovmskps, Gd, Uqq, sd1_ool, XMM_OPRSZ, XMM_MAXSZ, movmskpsd)
+DEF_GEN_INSN2_GVEC(vmovmskps, Gq, Uqq, sq1_ool, XMM_OPRSZ, XMM_MAXSZ, movmskpsq)
 
-DEF_GEN_INSN2_HELPER_DEP(movmskpd, movmskpd, Gd, Udq)
-GEN_INSN2(movmskpd, Gq, Udq)
-{
-    const TCGv_i32 arg1_r32 = tcg_temp_new_i32();
-    gen_insn2(movmskpd, Gd, Udq)(env, s, arg1_r32, arg2);
-    tcg_gen_extu_i32_i64(arg1, arg1_r32);
-    tcg_temp_free_i32(arg1_r32);
-}
-DEF_GEN_INSN2_HELPER_DEP(vmovmskpd, movmskpd, Gd, Udq)
-GEN_INSN2(vmovmskpd, Gq, Udq)
-{
-    const TCGv_i32 arg1_r32 = tcg_temp_new_i32();
-    gen_insn2(vmovmskpd, Gd, Udq)(env, s, arg1_r32, arg2);
-    tcg_gen_extu_i32_i64(arg1, arg1_r32);
-    tcg_temp_free_i32(arg1_r32);
-}
-DEF_GEN_INSN2_HELPER_DEP(vmovmskpd, movmskpd, Gd, Uqq)
-GEN_INSN2(vmovmskpd, Gq, Uqq)
-{
-    const TCGv_i32 arg1_r32 = tcg_temp_new_i32();
-    gen_insn2(vmovmskpd, Gd, Uqq)(env, s, arg1_r32, arg2);
-    tcg_gen_extu_i32_i64(arg1, arg1_r32);
-    tcg_temp_free_i32(arg1_r32);
-}
+DEF_GEN_INSN2_GVEC(movmskpd, Gd, Udq, sd1_ool, XMM_OPRSZ, XMM_MAXSZ, movmskpdd)
+DEF_GEN_INSN2_GVEC(movmskpd, Gq, Udq, sq1_ool, XMM_OPRSZ, XMM_MAXSZ, movmskpdq)
+DEF_GEN_INSN2_GVEC(vmovmskpd, Gd, Udq, sd1_ool, XMM_OPRSZ, XMM_MAXSZ, movmskpdd)
+DEF_GEN_INSN2_GVEC(vmovmskpd, Gq, Udq, sq1_ool, XMM_OPRSZ, XMM_MAXSZ, movmskpdq)
+DEF_GEN_INSN2_GVEC(vmovmskpd, Gd, Uqq, sd1_ool, XMM_OPRSZ, XMM_MAXSZ, movmskpdd)
+DEF_GEN_INSN2_GVEC(vmovmskpd, Gq, Uqq, sq1_ool, XMM_OPRSZ, XMM_MAXSZ, movmskpdq)
 
 GEN_INSN2(lddqu, Vdq, Mdq)
 {