
[v8,09/50] x86emul: support AVX512{F,BW} integer unpack insns

Message ID 5C8B813F020000780021F164@prv1-mh.provo.novell.com (mailing list archive)
State New, archived
Series x86emul: remaining AVX512 support

Commit Message

Jan Beulich March 15, 2019, 10:41 a.m. UTC
There's once again one extra twobyte_table[] entry which gets its Disp8
shift value set right away, without support for the insn getting
implemented just yet, so as to avoid needlessly splitting groups of
entries.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v8: Re-base.
v6: Re-base over changes earlier in the series.
v4: Move OVR() additions into __AVX512VL__ conditional.
v3: New.
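
The Disp8 shift values referred to above drive EVEX compressed-displacement
scaling. As a minimal sketch of the idea, assuming a d8s_vl-style entry
(scale by the full vector length) and not reproducing the emulator's actual
decode code:

#include <stdint.h>

/*
 * Minimal sketch, not the emulator's code: with a d8s_vl-style Disp8
 * shift, the 8-bit displacement of an EVEX-encoded memory operand is
 * scaled by the full vector length, i.e. by 16, 32 or 64 bytes
 * depending on EVEX.L'L (here "evex_lr").
 */
static int32_t scale_evex_disp8(int8_t disp8, unsigned int evex_lr)
{
    return (int32_t)disp8 << (4 + evex_lr);   /* disp8 * (16 << L'L) */
}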

Comments

Andrew Cooper March 15, 2019, 6:21 p.m. UTC | #1
On 15/03/2019 10:41, Jan Beulich wrote:
> @@ -6681,6 +6681,12 @@ x86_emulate(
>      case X86EMUL_OPC_EVEX_66(0x0f, 0xf6): /* vpsadbw [xyz]mm/mem,[xyz]mm,[xyz]mm */
>          generate_exception_if(evex.opmsk, EXC_UD);
>          /* fall through */
> +    case X86EMUL_OPC_EVEX_66(0x0f, 0x60): /* vpunpcklbw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
> +    case X86EMUL_OPC_EVEX_66(0x0f, 0x61): /* vpunpcklwd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
> +    case X86EMUL_OPC_EVEX_66(0x0f, 0x68): /* vpunpckhbw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
> +    case X86EMUL_OPC_EVEX_66(0x0f, 0x69): /* vpunpckhwd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
> +        op_bytes = 16 << evex.lr;
> +        /* fall through */

If this setting of op_bytes is safe to do for vpsadbw, how does the
emulation currently work?

~Andrew

>      case X86EMUL_OPC_EVEX_66(0x0f, 0xd1): /* vpsrlw xmm/m128,[xyz]mm,[xyz]mm{k} */
>      case X86EMUL_OPC_EVEX_66(0x0f, 0xe1): /* vpsraw xmm/m128,[xyz]mm,[xyz]mm{k} */
>      case X86EMUL_OPC_EVEX_66(0x0f, 0xf1): /* vpsllw xmm/m128,[xyz]mm,[xyz]mm{k} */
Jan Beulich March 18, 2019, 9:55 a.m. UTC | #2
>>> On 15.03.19 at 19:21, <andrew.cooper3@citrix.com> wrote:
> On 15/03/2019 10:41, Jan Beulich wrote:
>> @@ -6681,6 +6681,12 @@ x86_emulate(
>>      case X86EMUL_OPC_EVEX_66(0x0f, 0xf6): /* vpsadbw [xyz]mm/mem,[xyz]mm,[xyz]mm */
>>          generate_exception_if(evex.opmsk, EXC_UD);
>>          /* fall through */
>> +    case X86EMUL_OPC_EVEX_66(0x0f, 0x60): /* vpunpcklbw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
>> +    case X86EMUL_OPC_EVEX_66(0x0f, 0x61): /* vpunpcklwd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
>> +    case X86EMUL_OPC_EVEX_66(0x0f, 0x68): /* vpunpckhbw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
>> +    case X86EMUL_OPC_EVEX_66(0x0f, 0x69): /* vpunpckhwd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
>> +        op_bytes = 16 << evex.lr;
>> +        /* fall through */
> 
> If this setting of op_bytes is safe to do for vpsadbw, how does the
> emulation currently work?

The setting is redundant for VPSADBW (there it gets set by virtue
of its table entry saying simd_packed_int), but it's necessary for
VPUNPCK* as their table entries use simd_other, which is necessary
because of the memory access pattern of PUNPCKL*. In fact the
PUNPCKH* entries could equally well use simd_packed_int, but
that would then call for their case labels to get moved away from
the PUNPCKL* ones, and I slightly prefer them to be kept together.

Jan
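
The distinction can be illustrated with a hypothetical helper (assumed
names only, not the emulator's actual decode path): a simd_packed_int
table entry has its access width derived from EVEX.L'L generically,
while a simd_other entry leaves op_bytes for the individual case label
to set, hence the explicit op_bytes = 16 << evex.lr above.

enum simd_opsize_kind { simd_other_kind, simd_packed_int_kind };

/*
 * Hypothetical helper, assumed names: simd_packed_int entries get
 * their access width derived from EVEX.L'L generically, while
 * simd_other entries leave op_bytes to be set by the individual case
 * label, as done above for the VPUNPCK{L,H}{BW,WD} cases.
 */
static unsigned int derive_op_bytes(enum simd_opsize_kind kind,
                                    unsigned int evex_lr)
{
    return kind == simd_packed_int_kind ? 16u << evex_lr
                                        : 0 /* set by the case label */;
}
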
Andrew Cooper May 20, 2019, 12:11 p.m. UTC | #3
On 18/03/2019 09:55, Jan Beulich wrote:
>>>> On 15.03.19 at 19:21, <andrew.cooper3@citrix.com> wrote:
>> On 15/03/2019 10:41, Jan Beulich wrote:
>>> @@ -6681,6 +6681,12 @@ x86_emulate(
>>>      case X86EMUL_OPC_EVEX_66(0x0f, 0xf6): /* vpsadbw [xyz]mm/mem,[xyz]mm,[xyz]mm */
>>>          generate_exception_if(evex.opmsk, EXC_UD);
>>>          /* fall through */
>>> +    case X86EMUL_OPC_EVEX_66(0x0f, 0x60): /* vpunpcklbw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
>>> +    case X86EMUL_OPC_EVEX_66(0x0f, 0x61): /* vpunpcklwd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
>>> +    case X86EMUL_OPC_EVEX_66(0x0f, 0x68): /* vpunpckhbw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
>>> +    case X86EMUL_OPC_EVEX_66(0x0f, 0x69): /* vpunpckhwd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
>>> +        op_bytes = 16 << evex.lr;
>>> +        /* fall through */
>> If this setting of op_bytes is safe to do for vpsadbw, how does the
>> emulation currently work?
> The setting is redundant for VPSADBW (there it gets set by virtue
> of its table entry saying simd_packed_int), but it's necessary for
> VPUNPCK* as their table entries use simd_other, which is necessary
> because of the memory access pattern of PUNPCKL*. In fact the
> PUNPCKH* entries could equally well use simd_packed_int, but
> that would then call for their case labels to get moved away from
> the PUNPCKL* ones, and I slightly prefer them to be kept together.

Ok.

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
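
For reference, the interleave_lo()/interleave_hi() helpers added to
tools/tests/x86_emulator/simd.c in the patch below map onto
VPUNPCKLDQ/VPUNPCKHDQ. A plain-C model of what they are expected to
compute for 16-byte vectors of dwords (a sketch, not the harness code
itself):

#include <stdint.h>

/* Four packed dwords, i.e. one 128-bit lane. */
typedef struct { uint32_t e[4]; } v4si;

/* Model of VPUNPCKLDQ: interleave the low dwords of x and y. */
static v4si interleave_lo_model(v4si x, v4si y)
{
    return (v4si){ { x.e[0], y.e[0], x.e[1], y.e[1] } };
}

/* Model of VPUNPCKHDQ: interleave the high dwords of x and y. */
static v4si interleave_hi_model(v4si x, v4si y)
{
    return (v4si){ { x.e[2], y.e[2], x.e[3], y.e[3] } };
}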

Patch

--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -229,6 +229,10 @@  static const struct test avx512f_all[] =
     INSN(pternlog,     66, 0f3a, 25,    vl,     dq, vl),
     INSN(ptestm,       66, 0f38, 27,    vl,     dq, vl),
     INSN(ptestnm,      f3, 0f38, 27,    vl,     dq, vl),
+    INSN(punpckhdq,    66,   0f, 6a,    vl,      d, vl),
+    INSN(punpckhqdq,   66,   0f, 6d,    vl,      q, vl),
+    INSN(punpckldq,    66,   0f, 62,    vl,      d, vl),
+    INSN(punpcklqdq,   66,   0f, 6c,    vl,      q, vl),
     INSN(pxor,         66,   0f, ef,    vl,     dq, vl),
     INSN_PFP(shuf,           0f, c6),
     INSN_FP(sqrt,            0f, 51),
@@ -327,6 +331,10 @@  static const struct test avx512bw_all[]
     INSN(psubw,       66,   0f, f9,    vl,    w, vl),
     INSN(ptestm,      66, 0f38, 26,    vl,   bw, vl),
     INSN(ptestnm,     f3, 0f38, 26,    vl,   bw, vl),
+    INSN(punpckhbw,   66,   0f, 68,    vl,    b, vl),
+    INSN(punpckhwd,   66,   0f, 69,    vl,    w, vl),
+    INSN(punpcklbw,   66,   0f, 60,    vl,    b, vl),
+    INSN(punpcklwd,   66,   0f, 61,    vl,    w, vl),
 };
 
 static const struct test avx512bw_128[] = {
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -300,6 +300,10 @@  static inline bool _to_bool(byte_vec_t b
     asm ( "vpbroadcastd %k1, %0" : "=v" (t_) : "r" (x) ); \
     t_; \
 })
+#  if VEC_SIZE == 16
+#   define interleave_hi(x, y) ((vec_t)B(punpckhdq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
+#   define interleave_lo(x, y) ((vec_t)B(punpckldq, _mask, (vsi_t)(x), (vsi_t)(y), (vsi_t)undef(), ~0))
+#  endif
 #  define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
                               (0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
 #  define shrink1(x) ((half_t)B(pmovqd, _mask, (vdi_t)(x), (vsi_half_t){}, ~0))
@@ -317,6 +321,10 @@  static inline bool _to_bool(byte_vec_t b
     t_; \
 })
 #  endif
+#  if VEC_SIZE == 16
+#   define interleave_hi(x, y) ((vec_t)B(punpckhqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+#   define interleave_lo(x, y) ((vec_t)B(punpcklqdq, _mask, (vdi_t)(x), (vdi_t)(y), (vdi_t)undef(), ~0))
+#  endif
 #  define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101))
 # endif
 # if INT_SIZE == 4
--- a/tools/tests/x86_emulator/simd.h
+++ b/tools/tests/x86_emulator/simd.h
@@ -252,6 +252,10 @@  OVR(pmovzxwq);
 OVR(pmulld);
 OVR(pmuldq);
 OVR(pmuludq);
+OVR(punpckhdq);
+OVR(punpckhqdq);
+OVR(punpckldq);
+OVR(punpcklqdq);
 # endif
 
 # undef OVR_VFP
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -312,10 +312,10 @@  static const struct twobyte_table {
     [0x58 ... 0x59] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl },
     [0x5a ... 0x5b] = { DstImplicit|SrcMem|ModRM|Mov, simd_other },
     [0x5c ... 0x5f] = { DstImplicit|SrcMem|ModRM, simd_any_fp, d8s_vl },
-    [0x60 ... 0x62] = { DstImplicit|SrcMem|ModRM, simd_other },
+    [0x60 ... 0x62] = { DstImplicit|SrcMem|ModRM, simd_other, d8s_vl },
     [0x63 ... 0x67] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
-    [0x68 ... 0x6a] = { DstImplicit|SrcMem|ModRM, simd_other },
-    [0x6b ... 0x6d] = { DstImplicit|SrcMem|ModRM, simd_packed_int },
+    [0x68 ... 0x6a] = { DstImplicit|SrcMem|ModRM, simd_other, d8s_vl },
+    [0x6b ... 0x6d] = { DstImplicit|SrcMem|ModRM, simd_packed_int, d8s_vl },
     [0x6e] = { DstImplicit|SrcMem|ModRM|Mov, simd_none, d8s_dq64 },
     [0x6f] = { DstImplicit|SrcMem|ModRM|Mov, simd_packed_int, d8s_vl },
     [0x70] = { SrcImmByte|ModRM|TwoOp, simd_other },
@@ -6681,6 +6681,12 @@  x86_emulate(
     case X86EMUL_OPC_EVEX_66(0x0f, 0xf6): /* vpsadbw [xyz]mm/mem,[xyz]mm,[xyz]mm */
         generate_exception_if(evex.opmsk, EXC_UD);
         /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x60): /* vpunpcklbw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x61): /* vpunpcklwd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x68): /* vpunpckhbw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x69): /* vpunpckhwd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        op_bytes = 16 << evex.lr;
+        /* fall through */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xd1): /* vpsrlw xmm/m128,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xe1): /* vpsraw xmm/m128,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xf1): /* vpsllw xmm/m128,[xyz]mm,[xyz]mm{k} */
@@ -6708,6 +6714,13 @@  x86_emulate(
         elem_bytes = 1 << (b & 1);
         goto avx512f_no_sae;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x62): /* vpunpckldq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x6a): /* vpunpckhdq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        generate_exception_if(evex.w, EXC_UD);
+        fault_suppression = false;
+        op_bytes = 16 << evex.lr;
+        goto avx512f_no_sae;
+
     case X86EMUL_OPC_EVEX_F3(0x0f38, 0x26): /* vptestnm{b,w} [xyz]mm/mem,[xyz]mm,k{k} */
     case X86EMUL_OPC_EVEX_F3(0x0f38, 0x27): /* vptestnm{d,q} [xyz]mm/mem,[xyz]mm,k{k} */
         op_bytes = 16 << evex.lr;
@@ -6734,6 +6747,10 @@  x86_emulate(
         avx512_vlen_check(false);
         goto simd_zmm;
 
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x6c): /* vpunpcklqdq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f, 0x6d): /* vpunpckhqdq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        fault_suppression = false;
+        /* fall through */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xd4): /* vpaddq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f, 0xf4): /* vpmuludq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x28): /* vpmuldq [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */