From patchwork Fri Mar 15 10:58:22 2019
X-Patchwork-Submitter: Jan Beulich
X-Patchwork-Id: 10854515
Message-Id: <5C8B854E020000780021F24B@prv1-mh.provo.novell.com>
Date: Fri, 15 Mar 2019 04:58:22 -0600
From: Jan Beulich
To: xen-devel <xen-devel@lists.xenproject.org>
Cc: George Dunlap, Andrew Cooper, Wei Liu, Roger Pau Monne
References: <5B6BF83602000078001DC548@prv1-mh.provo.novell.com>
 <5C8B7EC0020000780021F10B@prv1-mh.provo.novell.com>
In-Reply-To: <5C8B7EC0020000780021F10B@prv1-mh.provo.novell.com>
Subject: [Xen-devel] [PATCH v8 31/50] x86emul: support remaining misc AVX512{F,BW} insns
List-Id: Xen developer discussion

This completes AVX512BW support in the insn emulator, and leaves just the
scatter/gather insns open in the AVX512F set.

Signed-off-by: Jan Beulich
Acked-by: Andrew Cooper
---
v5: New.
---
TBD: The *blendm* inline functions don't reliably produce the intended
     insns, as the corresponding masked moves are about as good a fit for
     the compiler when it looks for an instruction matching the intended
     operation. We'd need to switch to inline assembly if we wanted to
     guarantee that those insns actually get tested. Thoughts?
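Not part of the patch, but to illustrate the inline assembly alternative
mentioned above: a minimal sketch (the helper name, constraints and brace
escapes are illustrative only, and the snippet isn't build-tested against
the harness) of requesting the blend insn by name, so the compiler can't
substitute a masked move:

/* Illustrative only; build with -mavx512f. */
#include <immintrin.h>

static inline __m512 blendmps_forced(__mmask16 k, __m512 a, __m512 b)
{
    __m512 r;

    /*
     * Ask for VBLENDMPS explicitly: r[i] = k[i] ? b[i] : a[i].
     * "v" allows any EVEX-encodable vector register, "Yk" any mask
     * register other than %k0; %{ and %} emit literal braces.
     */
    asm ( "vblendmps %[s2], %[s1], %[dst]%{%[msk]%}"
          : [dst] "=v" (r)
          : [s1] "v" (a), [s2] "v" (b), [msk] "Yk" (k) );

    return r;
}

With something along these lines used in place of the builtin-based mix(),
the harness would be guaranteed to exercise the vblendmps encoding itself.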
--- a/tools/tests/x86_emulator/evex-disp8.c
+++ b/tools/tests/x86_emulator/evex-disp8.c
@@ -105,6 +105,8 @@ enum esz {
 
 static const struct test avx512f_all[] = {
     INSN_FP(add, 0f, 58),
+    INSN(align, 66, 0f3a, 03, vl, dq, vl),
+    INSN(blendm, 66, 0f38, 65, vl, sd, vl),
     INSN(broadcastss, 66, 0f38, 18, el, d, el),
     INSN_FP(cmp, 0f, c2),
     INSN(comisd, 66, 0f, 2f, el, q, el),
@@ -207,6 +209,7 @@ static const struct test avx512f_all[] =
     INSN(paddq, 66, 0f, d4, vl, q, vl),
     INSN(pand, 66, 0f, db, vl, dq, vl),
     INSN(pandn, 66, 0f, df, vl, dq, vl),
+    INSN(pblendm, 66, 0f38, 64, vl, dq, vl),
 // pbroadcast, 66, 0f38, 7c, dq64
     INSN(pbroadcastd, 66, 0f38, 58, el, d, el),
     INSN(pbroadcastq, 66, 0f38, 59, el, q, el),
@@ -354,6 +357,7 @@ static const struct test avx512f_512[] =
 };
 
 static const struct test avx512bw_all[] = {
+    INSN(dbpsadbw, 66, 0f3a, 42, vl, b, vl),
     INSN(movdqu8, f2, 0f, 6f, vl, b, vl),
     INSN(movdqu8, f2, 0f, 7f, vl, b, vl),
     INSN(movdqu16, f2, 0f, 6f, vl, w, vl),
@@ -373,6 +377,7 @@ static const struct test avx512bw_all[]
     INSN(palignr, 66, 0f3a, 0f, vl, b, vl),
     INSN(pavgb, 66, 0f, e0, vl, b, vl),
     INSN(pavgw, 66, 0f, e3, vl, w, vl),
+    INSN(pblendm, 66, 0f38, 66, vl, bw, vl),
     INSN(pbroadcastb, 66, 0f38, 78, el, b, el),
 // pbroadcastb, 66, 0f38, 7a, b
     INSN(pbroadcastw, 66, 0f38, 79, el_2, b, vl),
--- a/tools/tests/x86_emulator/simd.c
+++ b/tools/tests/x86_emulator/simd.c
@@ -297,7 +297,7 @@ static inline vec_t movlhps(vec_t x, vec
 # define max(x, y) BR_(maxps, _mask, x, y, undef(), ~0)
 # define min(x, y) BR_(minps, _mask, x, y, undef(), ~0)
 # endif
-# define mix(x, y) B(movaps, _mask, x, y, (0b0101010101010101 & ALL_TRUE))
+# define mix(x, y) B(blendmps_, _mask, x, y, (0b1010101010101010 & ALL_TRUE))
 # define scale(x, y) BR(scalefps, _mask, x, y, undef(), ~0)
 # if VEC_SIZE == 64 && defined(__AVX512ER__)
 # define recip(x) BR(rcp28ps, _mask, x, undef(), ~0)
@@ -370,7 +370,7 @@ static inline vec_t movlhps(vec_t x, vec
 # define max(x, y) BR_(maxpd, _mask, x, y, undef(), ~0)
 # define min(x, y) BR_(minpd, _mask, x, y, undef(), ~0)
 # endif
-# define mix(x, y) B(movapd, _mask, x, y, 0b01010101)
+# define mix(x, y) B(blendmpd_, _mask, x, y, 0b10101010)
 # define scale(x, y) BR(scalefpd, _mask, x, y, undef(), ~0)
 # if VEC_SIZE == 64 && defined(__AVX512ER__)
 # define recip(x) BR(rcp28pd, _mask, x, undef(), ~0)
@@ -564,8 +564,9 @@ static inline vec_t movlhps(vec_t x, vec
                               0b00011011, (vsi_t)undef(), ~0))
 # define swap2(x) ((vec_t)B_(permvarsi, _mask, (vsi_t)(x), (vsi_t)(inv - 1), (vsi_t)undef(), ~0))
 # endif
-# define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \
-                             (0b0101010101010101 & ((1 << ELEM_COUNT) - 1))))
+# define mix(x, y) ((vec_t)B(blendmd_, _mask, (vsi_t)(x), (vsi_t)(y), \
+                             (0b1010101010101010 & ((1 << ELEM_COUNT) - 1))))
+# define rotr(x, n) ((vec_t)B(alignd, _mask, (vsi_t)(x), (vsi_t)(x), n, (vsi_t)undef(), ~0))
 # define shrink1(x) ((half_t)B(pmovqd, _mask, (vdi_t)(x), (vsi_half_t){}, ~0))
 # elif INT_SIZE == 8 || UINT_SIZE == 8
 # define broadcast(x) ({ \
@@ -602,7 +603,8 @@ static inline vec_t movlhps(vec_t x, vec
                               0b01001110, (vsi_t)undef(), ~0))
 # define swap2(x) ((vec_t)B(permvardi, _mask, (vdi_t)(x), (vdi_t)(inv - 1), (vdi_t)undef(), ~0))
 # endif
-# define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101))
+# define mix(x, y) ((vec_t)B(blendmq_, _mask, (vdi_t)(x), (vdi_t)(y), 0b10101010))
+# define rotr(x, n) ((vec_t)B(alignq, _mask, (vdi_t)(x), (vdi_t)(x), n, (vdi_t)undef(), ~0))
 # if VEC_SIZE == 32
 # define swap3(x) ((vec_t)B_(permdi, _mask, (vdi_t)(x), 0b00011011, (vdi_t)undef(), ~0))
 # elif VEC_SIZE == 64
@@ -654,8 +656,8 @@ static inline vec_t movlhps(vec_t x, vec
 # define interleave_hi(x, y) ((vec_t)B(vpermi2varqi, _mask, (vqi_t)(x), interleave_hi, (vqi_t)(y), ~0))
 # define interleave_lo(x, y) ((vec_t)B(vpermt2varqi, _mask, interleave_lo, (vqi_t)(x), (vqi_t)(y), ~0))
 # endif
-# define mix(x, y) ((vec_t)B(movdquqi, _mask, (vqi_t)(x), (vqi_t)(y), \
-                             (0b0101010101010101010101010101010101010101010101010101010101010101LL & ALL_TRUE)))
+# define mix(x, y) ((vec_t)B(blendmb_, _mask, (vqi_t)(x), (vqi_t)(y), \
+                             (0b1010101010101010101010101010101010101010101010101010101010101010LL & ALL_TRUE)))
 # define shrink1(x) ((half_t)B(pmovwb, _mask, (vhi_t)(x), (vqi_half_t){}, ~0))
 # define shrink2(x) ((quarter_t)B(pmovdb, _mask, (vsi_t)(x), (vqi_quarter_t){}, ~0))
 # define shrink3(x) ((eighth_t)B(pmovqb, _mask, (vdi_t)(x), (vqi_eighth_t){}, ~0))
@@ -687,8 +689,8 @@ static inline vec_t movlhps(vec_t x, vec
 # define interleave_hi(x, y) ((vec_t)B(vpermi2varhi, _mask, (vhi_t)(x), interleave_hi, (vhi_t)(y), ~0))
 # define interleave_lo(x, y) ((vec_t)B(vpermt2varhi, _mask, interleave_lo, (vhi_t)(x), (vhi_t)(y), ~0))
 # endif
-# define mix(x, y) ((vec_t)B(movdquhi, _mask, (vhi_t)(x), (vhi_t)(y), \
-                             (0b01010101010101010101010101010101 & ALL_TRUE)))
+# define mix(x, y) ((vec_t)B(blendmw_, _mask, (vhi_t)(x), (vhi_t)(y), \
+                             (0b10101010101010101010101010101010 & ALL_TRUE)))
 # define shrink1(x) ((half_t)B(pmovdw, _mask, (vsi_t)(x), (vhi_half_t){}, ~0))
 # define shrink2(x) ((quarter_t)B(pmovqw, _mask, (vdi_t)(x), (vhi_quarter_t){}, ~0))
 # define swap2(x) ((vec_t)B(permvarhi, _mask, (vhi_t)(x), (vhi_t)(inv - 1), (vhi_t)undef(), ~0))
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -484,6 +484,7 @@ static const struct ext0f38_table {
     [0x5b] = { .simd_size = simd_256, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x62] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_bw },
     [0x63] = { .simd_size = simd_packed_int, .to_mem = 1, .two_op = 1, .d8s = d8s_bw },
+    [0x64 ... 0x66] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x75 ... 0x76] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x77] = { .simd_size = simd_packed_fp, .d8s = d8s_vl },
     [0x78] = { .simd_size = simd_other, .two_op = 1 },
@@ -550,6 +551,7 @@ static const struct ext0f3a_table {
     [0x00] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl },
     [0x01] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
     [0x02] = { .simd_size = simd_packed_int },
+    [0x03] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x04 ... 0x05] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
     [0x06] = { .simd_size = simd_packed_fp },
     [0x08 ... 0x09] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl },
@@ -581,8 +583,7 @@ static const struct ext0f3a_table {
     [0x3b] = { .simd_size = simd_256, .to_mem = 1, .two_op = 1, .d8s = d8s_vl_by_2 },
     [0x3e ... 0x3f] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x40 ... 0x41] = { .simd_size = simd_packed_fp },
-    [0x42] = { .simd_size = simd_packed_int },
-    [0x43] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
+    [0x42 ... 0x43] = { .simd_size = simd_packed_int, .d8s = d8s_vl },
     [0x44] = { .simd_size = simd_packed_int },
     [0x46] = { .simd_size = simd_packed_int },
     [0x48 ... 0x49] = { .simd_size = simd_packed_fp, .four_op = 1 },
@@ -6204,6 +6205,8 @@ x86_emulate(
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x47): /* vpsllv{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x4c): /* vrcp14p{s,d} [xyz]mm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x4e): /* vrsqrt14p{s,d} [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x64): /* vpblendm{d,q} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x65): /* vblendmp{s,d} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     avx512f_no_sae:
         host_and_vcpu_must_have(avx512f);
         generate_exception_if(ea.type != OP_MEM && evex.brs, EXC_UD);
@@ -6961,6 +6964,7 @@ x86_emulate(
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x0b): /* vpmulhrsw [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x1c): /* vpabsb [xyz]mm/mem,[xyz]mm{k} */
     case X86EMUL_OPC_EVEX_66(0x0f38, 0x1d): /* vpabsw [xyz]mm/mem,[xyz]mm{k} */
+    case X86EMUL_OPC_EVEX_66(0x0f38, 0x66): /* vpblendm{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
         host_and_vcpu_must_have(avx512bw);
         generate_exception_if(evex.brs, EXC_UD);
         elem_bytes = 1 << (b & 1);
@@ -8130,10 +8134,12 @@ x86_emulate(
         goto simd_0f_to_gpr;
 
     CASE_SIMD_PACKED_FP(_EVEX, 0x0f, 0xc6): /* vshufp{s,d} $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
-        fault_suppression = false;
         generate_exception_if(evex.w != (evex.pfx & VEX_PREFIX_DOUBLE_MASK),
                               EXC_UD);
         /* fall through */
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x03): /* valign{d,q} $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        fault_suppression = false;
+        /* fall through */
     case X86EMUL_OPC_EVEX_66(0x0f3a, 0x25): /* vpternlog{d,q} $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
     avx512f_imm8_no_sae:
         host_and_vcpu_must_have(avx512f);
@@ -9471,6 +9477,9 @@ x86_emulate(
         insn_bytes = PFX_BYTES + 4;
         break;
 
+    case X86EMUL_OPC_EVEX_66(0x0f3a, 0x42): /* vdbpsadbw $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
+        generate_exception_if(evex.w, EXC_UD);
+        /* fall through */
     case X86EMUL_OPC_EVEX_66(0x0f3a, 0x0f): /* vpalignr $imm8,[xyz]mm/mem,[xyz]mm,[xyz]mm{k} */
         fault_suppression = false;
        goto avx512bw_imm;
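
For reference (again not part of the patch): the new rotr() helpers in
simd.c use valignd/valignq with both sources set to the same vector, which
amounts to a right rotation by n elements. A scalar sketch of the intended
behaviour (names here are illustrative, not taken from the harness):

#include <stdint.h>

/* Model of rotr(x, n) for 32-bit elements: dst[i] = src[(i + n) % count]. */
static void rotr_ref(uint32_t *dst, const uint32_t *src,
                     unsigned int n, unsigned int count)
{
    for ( unsigned int i = 0; i < count; ++i )
        dst[i] = src[(i + n) % count];
}

This matches VALIGND's definition: the two sources are concatenated, shifted
right by imm8 elements, and the low half is kept, so passing the same
register for both sources yields a rotation.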