From patchwork Fri Mar 15 10:39:01 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Jan Beulich X-Patchwork-Id: 10854455 Return-Path: Received: from mail.wl.linuxfoundation.org (pdx-wl-mail.web.codeaurora.org [172.30.200.125]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id E85841575 for ; Fri, 15 Mar 2019 10:40:50 +0000 (UTC) Received: from mail.wl.linuxfoundation.org (localhost [127.0.0.1]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id CD5F72A933 for ; Fri, 15 Mar 2019 10:40:50 +0000 (UTC) Received: by mail.wl.linuxfoundation.org (Postfix, from userid 486) id C0C862A937; Fri, 15 Mar 2019 10:40:50 +0000 (UTC) X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on pdx-wl-mail.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-5.2 required=2.0 tests=BAYES_00,MAILING_LIST_MULTI, RCVD_IN_DNSWL_MED autolearn=ham version=3.3.1 Received: from lists.xenproject.org (lists.xenproject.org [192.237.175.120]) (using TLSv1.2 with cipher AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by mail.wl.linuxfoundation.org (Postfix) with ESMTPS id 06AAF2A933 for ; Fri, 15 Mar 2019 10:40:50 +0000 (UTC) Received: from localhost ([127.0.0.1] helo=lists.xenproject.org) by lists.xenproject.org with esmtp (Exim 4.89) (envelope-from ) id 1h4kEz-0003K1-1T; Fri, 15 Mar 2019 10:39:05 +0000 Received: from us1-rack-dfw2.inumbo.com ([104.130.134.6]) by lists.xenproject.org with esmtp (Exim 4.89) (envelope-from ) id 1h4kEy-0003Ji-0l for xen-devel@lists.xenproject.org; Fri, 15 Mar 2019 10:39:04 +0000 X-Inumbo-ID: 8c55f094-470e-11e9-bc90-bc764e045a96 Received: from prv1-mh.provo.novell.com (unknown [137.65.248.33]) by us1-rack-dfw2.inumbo.com (Halon) with ESMTPS id 8c55f094-470e-11e9-bc90-bc764e045a96; Fri, 15 Mar 2019 10:39:02 +0000 (UTC) Received: from INET-PRV1-MTA by prv1-mh.provo.novell.com with Novell_GroupWise; Fri, 15 Mar 2019 04:39:01 -0600 Message-Id: <5C8B80C5020000780021F122@prv1-mh.provo.novell.com> X-Mailer: Novell GroupWise Internet Agent 18.1.0 Date: Fri, 15 Mar 2019 04:39:01 -0600 From: "Jan Beulich" To: "xen-devel" References: <5B6BF83602000078001DC548@prv1-mh.provo.novell.com> <5C8B7EC0020000780021F10B@prv1-mh.provo.novell.com> In-Reply-To: <5C8B7EC0020000780021F10B@prv1-mh.provo.novell.com> Mime-Version: 1.0 Content-Disposition: inline Subject: [Xen-devel] [PATCH v8 05/50] x86emul: support AVX512{F, BW, DQ} integer broadcast insns X-BeenThere: xen-devel@lists.xenproject.org X-Mailman-Version: 2.1.23 Precedence: list List-Id: Xen developer discussion List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Cc: George Dunlap , Andrew Cooper , Wei Liu , Roger Pau Monne Errors-To: xen-devel-bounces@lists.xenproject.org Sender: "Xen-devel" X-Virus-Scanned: ClamAV using ClamSMTP Note that the pbroadcastw table entry in evex-disp8.c is slightly different from what one would expect, due to it requiring EVEX.W to be zero. Signed-off-by: Jan Beulich --- v7: Use dummy output in invoke_stub(). Re-base. v3: New. --- a/tools/tests/x86_emulator/evex-disp8.c +++ b/tools/tests/x86_emulator/evex-disp8.c @@ -164,6 +164,9 @@ static const struct test avx512f_all[] = INSN(paddq, 66, 0f, d4, vl, q, vl), INSN(pand, 66, 0f, db, vl, dq, vl), INSN(pandn, 66, 0f, df, vl, dq, vl), +// pbroadcast, 66, 0f38, 7c, dq64 + INSN(pbroadcastd, 66, 0f38, 58, el, d, el), + INSN(pbroadcastq, 66, 0f38, 59, el, q, el), INSN(pcmp, 66, 0f3a, 1f, vl, dq, vl), INSN(pcmpeqd, 66, 0f, 76, vl, d, vl), INSN(pcmpeqq, 66, 0f38, 29, vl, q, vl), @@ -222,6 +225,7 @@ static const struct test avx512f_128[] = static const struct test avx512f_no128[] = { INSN(broadcastf32x4, 66, 0f38, 1a, el_4, d, vl), + INSN(broadcasti32x4, 66, 0f38, 5a, el_4, d, vl), INSN(broadcastsd, 66, 0f38, 19, el, q, el), INSN(extractf32x4, 66, 0f3a, 19, el_4, d, vl), INSN(extracti32x4, 66, 0f3a, 39, el_4, d, vl), @@ -231,6 +235,7 @@ static const struct test avx512f_no128[] static const struct test avx512f_512[] = { INSN(broadcastf64x4, 66, 0f38, 1b, el_4, q, vl), + INSN(broadcasti64x4, 66, 0f38, 5b, el_4, q, vl), INSN(extractf64x4, 66, 0f3a, 1b, el_4, q, vl), INSN(extracti64x4, 66, 0f3a, 3b, el_4, q, vl), INSN(insertf64x4, 66, 0f3a, 1a, el_4, q, vl), @@ -250,6 +255,10 @@ static const struct test avx512bw_all[] INSN(paddw, 66, 0f, fd, vl, w, vl), INSN(pavgb, 66, 0f, e0, vl, b, vl), INSN(pavgw, 66, 0f, e3, vl, w, vl), + INSN(pbroadcastb, 66, 0f38, 78, el, b, el), +// pbroadcastb, 66, 0f38, 7a, b + INSN(pbroadcastw, 66, 0f38, 79, el_2, b, vl), +// pbroadcastw, 66, 0f38, 7b, b INSN(pcmp, 66, 0f3a, 3f, vl, bw, vl), INSN(pcmpeqb, 66, 0f, 74, vl, b, vl), INSN(pcmpeqw, 66, 0f, 75, vl, w, vl), @@ -301,6 +310,7 @@ static const struct test avx512bw_128[] static const struct test avx512dq_all[] = { INSN_PFP(and, 0f, 54), INSN_PFP(andn, 0f, 55), + INSN(broadcasti32x2, 66, 0f38, 59, el_2, d, vl), INSN_PFP(or, 0f, 56), INSN(pmullq, 66, 0f38, 40, vl, q, vl), INSN_PFP(xor, 0f, 57), @@ -314,6 +324,7 @@ static const struct test avx512dq_128[] static const struct test avx512dq_no128[] = { INSN(broadcastf32x2, 66, 0f38, 19, el_2, d, vl), INSN(broadcastf64x2, 66, 0f38, 1a, el_2, q, vl), + INSN(broadcasti64x2, 66, 0f38, 5a, el_2, q, vl), INSN(extractf64x2, 66, 0f3a, 19, el_2, q, vl), INSN(extracti64x2, 66, 0f3a, 39, el_2, q, vl), INSN(insertf64x2, 66, 0f3a, 18, el_2, q, vl), @@ -322,6 +333,7 @@ static const struct test avx512dq_no128[ static const struct test avx512dq_512[] = { INSN(broadcastf32x8, 66, 0f38, 1b, el_8, d, vl), + INSN(broadcasti32x8, 66, 0f38, 5b, el_8, d, vl), INSN(extractf32x8, 66, 0f3a, 1b, el_8, d, vl), INSN(extracti32x8, 66, 0f3a, 3b, el_8, d, vl), INSN(insertf32x8, 66, 0f3a, 1a, el_8, d, vl), --- a/tools/tests/x86_emulator/simd.c +++ b/tools/tests/x86_emulator/simd.c @@ -278,9 +278,33 @@ static inline bool _to_bool(byte_vec_t b #if (INT_SIZE == 4 || UINT_SIZE == 4 || INT_SIZE == 8 || UINT_SIZE == 8) && \ defined(__AVX512F__) && (VEC_SIZE == 64 || defined(__AVX512VL__)) # if INT_SIZE == 4 || UINT_SIZE == 4 +# define broadcast(x) ({ \ + vec_t t_; \ + asm ( "%{evex%} vpbroadcastd %1, %0" \ + : "=v" (t_) : "m" (*(int[1]){ x }) ); \ + t_; \ +}) +# define broadcast2(x) ({ \ + vec_t t_; \ + asm ( "vpbroadcastd %k1, %0" : "=v" (t_) : "r" (x) ); \ + t_; \ +}) # define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \ (0b0101010101010101 & ((1 << ELEM_COUNT) - 1)))) # elif INT_SIZE == 8 || UINT_SIZE == 8 +# define broadcast(x) ({ \ + vec_t t_; \ + asm ( "%{evex%} vpbroadcastq %1, %0" \ + : "=v" (t_) : "m" (*(long long[1]){ x }) ); \ + t_; \ +}) +# ifdef __x86_64__ +# define broadcast2(x) ({ \ + vec_t t_; \ + asm ( "vpbroadcastq %1, %0" : "=v" (t_) : "r" ((x) + 0ULL) ); \ + t_; \ +}) +# endif # define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101)) # endif # if INT_SIZE == 4 @@ -977,10 +1001,14 @@ int simd_test(void) if ( !eq(swap2(src), inv) ) return __LINE__; #endif -#if defined(broadcast) +#ifdef broadcast if ( !eq(broadcast(ELEM_COUNT + 1), src + inv) ) return __LINE__; #endif +#ifdef broadcast2 + if ( !eq(broadcast2(ELEM_COUNT + 1), src + inv) ) return __LINE__; +#endif + #if defined(interleave_lo) && defined(interleave_hi) touch(src); x = interleave_lo(inv, src); --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -454,9 +454,13 @@ static const struct ext0f38_table { [0x40] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, [0x41] = { .simd_size = simd_packed_int, .two_op = 1 }, [0x45 ... 0x47] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, - [0x58 ... 0x59] = { .simd_size = simd_other, .two_op = 1 }, - [0x5a] = { .simd_size = simd_128, .two_op = 1 }, - [0x78 ... 0x79] = { .simd_size = simd_other, .two_op = 1 }, + [0x58] = { .simd_size = simd_other, .two_op = 1, .d8s = 2 }, + [0x59] = { .simd_size = simd_other, .two_op = 1, .d8s = 3 }, + [0x5a] = { .simd_size = simd_128, .two_op = 1, .d8s = 4 }, + [0x5b] = { .simd_size = simd_256, .two_op = 1, .d8s = d8s_vl_by_2 }, + [0x78] = { .simd_size = simd_other, .two_op = 1 }, + [0x79] = { .simd_size = simd_other, .two_op = 1, .d8s = 1 }, + [0x7a ... 0x7c] = { .simd_size = simd_none, .two_op = 1 }, [0x8c] = { .simd_size = simd_packed_int }, [0x8e] = { .simd_size = simd_packed_int, .to_mem = 1 }, [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 }, @@ -2636,6 +2640,11 @@ x86_decode_0f38( ctxt->opcode |= MASK_INSR(vex.pfx, X86EMUL_OPC_PFX_MASK); break; + case X86EMUL_OPC_EVEX_66(0, 0x7a): /* vpbroadcastb */ + case X86EMUL_OPC_EVEX_66(0, 0x7b): /* vpbroadcastw */ + case X86EMUL_OPC_EVEX_66(0, 0x7c): /* vpbroadcast{d,q} */ + break; + case 0xf0: /* movbe / crc32 */ state->desc |= repne_prefix() ? ByteOp : Mov; if ( rep_prefix() ) @@ -8233,6 +8242,8 @@ x86_emulate( goto avx512f_no_sae; case X86EMUL_OPC_EVEX_66(0x0f38, 0x18): /* vbroadcastss xmm/m32,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x58): /* vpbroadcastd xmm/m32,[xyz]mm{k} */ + op_bytes = elem_bytes; generate_exception_if(evex.w || evex.brs, EXC_UD); avx512_broadcast: /* @@ -8252,17 +8263,27 @@ x86_emulate( case X86EMUL_OPC_EVEX_66(0x0f38, 0x1b): /* vbroadcastf32x8 m256,zmm{k} */ /* vbroadcastf64x4 m256,zmm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x5b): /* vbroadcasti32x8 m256,zmm{k} */ + /* vbroadcasti64x4 m256,zmm{k} */ generate_exception_if(ea.type != OP_MEM || evex.lr != 2, EXC_UD); /* fall through */ case X86EMUL_OPC_EVEX_66(0x0f38, 0x19): /* vbroadcastsd xmm/m64,{y,z}mm{k} */ /* vbroadcastf32x2 xmm/m64,{y,z}mm{k} */ - generate_exception_if(!evex.lr || evex.brs, EXC_UD); + generate_exception_if(!evex.lr, EXC_UD); + /* fall through */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x59): /* vpbroadcastq xmm/m64,[xyz]mm{k} */ + /* vbroadcasti32x2 xmm/m64,[xyz]mm{k} */ + if ( b == 0x59 ) + op_bytes = 8; + generate_exception_if(evex.brs, EXC_UD); if ( !evex.w ) host_and_vcpu_must_have(avx512dq); goto avx512_broadcast; case X86EMUL_OPC_EVEX_66(0x0f38, 0x1a): /* vbroadcastf32x4 m128,{y,z}mm{k} */ /* vbroadcastf64x2 m128,{y,z}mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x5a): /* vbroadcasti32x4 m128,{y,z}mm{k} */ + /* vbroadcasti64x2 m128,{y,z}mm{k} */ generate_exception_if(ea.type != OP_MEM || !evex.lr || evex.brs, EXC_UD); if ( evex.w ) @@ -8456,6 +8477,45 @@ x86_emulate( generate_exception_if(ea.type != OP_MEM || !vex.l || vex.w, EXC_UD); goto simd_0f_avx2; + case X86EMUL_OPC_EVEX_66(0x0f38, 0x78): /* vpbroadcastb xmm/m8,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x79): /* vpbroadcastw xmm/m16,[xyz]mm{k} */ + host_and_vcpu_must_have(avx512bw); + generate_exception_if(evex.w || evex.brs, EXC_UD); + op_bytes = elem_bytes = 1 << (b & 1); + /* See the comment at the avx512_broadcast label. */ + op_mask |= !(b & 1 ? !(uint32_t)op_mask : !op_mask); + goto avx512f_no_sae; + + case X86EMUL_OPC_EVEX_66(0x0f38, 0x7a): /* vpbroadcastb r32,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x7b): /* vpbroadcastw r32,[xyz]mm{k} */ + host_and_vcpu_must_have(avx512bw); + generate_exception_if(evex.w, EXC_UD); + /* fall through */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x7c): /* vpbroadcast{d,q} reg,[xyz]mm{k} */ + generate_exception_if((ea.type != OP_REG || evex.brs || + evex.reg != 0xf || !evex.RX), + EXC_UD); + host_and_vcpu_must_have(avx512f); + avx512_vlen_check(false); + get_fpu(X86EMUL_FPU_zmm); + + opc = init_evex(stub); + opc[0] = b; + /* Convert GPR source to %rAX. */ + evex.b = 1; + if ( !mode_64bit() ) + evex.w = 0; + opc[1] = modrm & 0xf8; + insn_bytes = EVEX_PFX_BYTES + 2; + opc[2] = 0xc3; + + copy_EVEX(opc, evex); + invoke_stub("", "", "=g" (dummy) : "a" (src.val)); + + put_stub(stub); + ASSERT(!state->simd_size); + break; + case X86EMUL_OPC_VEX_66(0x0f38, 0x8c): /* vpmaskmov{d,q} mem,{x,y}mm,{x,y}mm */ case X86EMUL_OPC_VEX_66(0x0f38, 0x8e): /* vpmaskmov{d,q} {x,y}mm,{x,y}mm,mem */ generate_exception_if(ea.type != OP_MEM, EXC_UD);