From patchwork Fri Mar 15 10:46:04 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Jan Beulich X-Patchwork-Id: 10854479 Return-Path: Received: from mail.wl.linuxfoundation.org (pdx-wl-mail.web.codeaurora.org [172.30.200.125]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 6CF6615AC for ; Fri, 15 Mar 2019 10:47:50 +0000 (UTC) Received: from mail.wl.linuxfoundation.org (localhost [127.0.0.1]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id 511292A934 for ; Fri, 15 Mar 2019 10:47:50 +0000 (UTC) Received: by mail.wl.linuxfoundation.org (Postfix, from userid 486) id 456A42A937; Fri, 15 Mar 2019 10:47:50 +0000 (UTC) X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on pdx-wl-mail.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-5.2 required=2.0 tests=BAYES_00,MAILING_LIST_MULTI, RCVD_IN_DNSWL_MED autolearn=ham version=3.3.1 Received: from lists.xenproject.org (lists.xenproject.org [192.237.175.120]) (using TLSv1.2 with cipher AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by mail.wl.linuxfoundation.org (Postfix) with ESMTPS id 659F62A934 for ; Fri, 15 Mar 2019 10:47:49 +0000 (UTC) Received: from localhost ([127.0.0.1] helo=lists.xenproject.org) by lists.xenproject.org with esmtp (Exim 4.89) (envelope-from ) id 1h4kLp-0005MQ-8n; Fri, 15 Mar 2019 10:46:09 +0000 Received: from all-amaz-eas1.inumbo.com ([34.197.232.57] helo=us1-amaz-eas2.inumbo.com) by lists.xenproject.org with esmtp (Exim 4.89) (envelope-from ) id 1h4kLn-0005M6-Ly for xen-devel@lists.xenproject.org; Fri, 15 Mar 2019 10:46:07 +0000 X-Inumbo-ID: 87f83496-470f-11e9-84e6-a7c41e3a5e29 Received: from prv1-mh.provo.novell.com (unknown [137.65.248.33]) by us1-amaz-eas2.inumbo.com (Halon) with ESMTPS id 87f83496-470f-11e9-84e6-a7c41e3a5e29; Fri, 15 Mar 2019 10:46:04 +0000 (UTC) Received: from INET-PRV1-MTA by prv1-mh.provo.novell.com with Novell_GroupWise; Fri, 15 Mar 2019 04:46:03 -0600 Message-Id: <5C8B826C020000780021F1C1@prv1-mh.provo.novell.com> X-Mailer: Novell GroupWise Internet Agent 18.1.0 Date: Fri, 15 Mar 2019 04:46:04 -0600 From: "Jan Beulich" To: "xen-devel" References: <5B6BF83602000078001DC548@prv1-mh.provo.novell.com> <5C8B7EC0020000780021F10B@prv1-mh.provo.novell.com> In-Reply-To: <5C8B7EC0020000780021F10B@prv1-mh.provo.novell.com> Mime-Version: 1.0 Content-Disposition: inline Subject: [Xen-devel] [PATCH v8 17/50] x86emul: support AVX512{F, BW, _VBMI} permute insns X-BeenThere: xen-devel@lists.xenproject.org X-Mailman-Version: 2.1.23 Precedence: list List-Id: Xen developer discussion List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Cc: George Dunlap , Andrew Cooper , Wei Liu , Roger Pau Monne Errors-To: xen-devel-bounces@lists.xenproject.org Sender: "Xen-devel" X-Virus-Scanned: ClamAV using ClamSMTP Signed-off-by: Jan Beulich Acked-by: Andrew Cooper --- v7: Re-base. v5: Re-base over changes earlier in the series. v4: New. --- a/tools/tests/x86_emulator/evex-disp8.c +++ b/tools/tests/x86_emulator/evex-disp8.c @@ -178,6 +178,10 @@ static const struct test avx512f_all[] = INSN(pcmpu, 66, 0f3a, 1e, vl, dq, vl), INSN(permi2, 66, 0f38, 76, vl, dq, vl), INSN(permi2, 66, 0f38, 77, vl, sd, vl), + INSN(permilpd, 66, 0f38, 0d, vl, q, vl), + INSN(permilpd, 66, 0f3a, 05, vl, q, vl), + INSN(permilps, 66, 0f38, 0c, vl, d, vl), + INSN(permilps, 66, 0f3a, 04, vl, d, vl), INSN(permt2, 66, 0f38, 7e, vl, dq, vl), INSN(permt2, 66, 0f38, 7f, vl, sd, vl), INSN(pmaxs, 66, 0f38, 3d, vl, dq, vl), @@ -278,6 +282,10 @@ static const struct test avx512f_no128[] INSN(extracti32x4, 66, 0f3a, 39, el_4, d, vl), INSN(insertf32x4, 66, 0f3a, 18, el_4, d, vl), INSN(inserti32x4, 66, 0f3a, 38, el_4, d, vl), + INSN(perm, 66, 0f38, 36, vl, dq, vl), + INSN(perm, 66, 0f38, 16, vl, sd, vl), + INSN(permpd, 66, 0f3a, 01, vl, q, vl), + INSN(permq, 66, 0f3a, 00, vl, q, vl), INSN(shuff32x4, 66, 0f3a, 23, vl, d, vl), INSN(shuff64x2, 66, 0f3a, 23, vl, q, vl), INSN(shufi32x4, 66, 0f3a, 43, vl, d, vl), @@ -316,6 +324,7 @@ static const struct test avx512bw_all[] INSN(pcmpgtb, 66, 0f, 64, vl, b, vl), INSN(pcmpgtw, 66, 0f, 65, vl, w, vl), INSN(pcmpu, 66, 0f3a, 3e, vl, bw, vl), + INSN(permw, 66, 0f38, 8d, vl, w, vl), INSN(permi2w, 66, 0f38, 75, vl, w, vl), INSN(permt2w, 66, 0f38, 7d, vl, w, vl), INSN(pmaddwd, 66, 0f, f5, vl, w, vl), @@ -412,6 +421,7 @@ static const struct test avx512dq_512[] }; static const struct test avx512_vbmi_all[] = { + INSN(permb, 66, 0f38, 8d, vl, b, vl), INSN(permi2b, 66, 0f38, 75, vl, b, vl), INSN(permt2b, 66, 0f38, 7d, vl, b, vl), }; --- a/tools/tests/x86_emulator/simd.c +++ b/tools/tests/x86_emulator/simd.c @@ -186,6 +186,7 @@ static inline bool _to_bool(byte_vec_t b # define interleave_hi(x, y) B(unpckhps, _mask, x, y, undef(), ~0) # define interleave_lo(x, y) B(unpcklps, _mask, x, y, undef(), ~0) # define swap(x) B(shufps, _mask, x, x, 0b00011011, undef(), ~0) +# define swap2(x) B_(vpermilps, _mask, x, 0b00011011, undef(), ~0) # else # define broadcast_quartet(x) B(broadcastf32x4_, _mask, x, undef(), ~0) # define insert_pair(x, y, p) \ @@ -200,6 +201,10 @@ static inline bool _to_bool(byte_vec_t b vec_t t_ = B(shuf_f32x4_, _mask, x, x, VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0); \ B(shufps, _mask, t_, t_, 0b00011011, undef(), ~0); \ }) +# define swap2(x) B(vpermilps, _mask, \ + B(shuf_f32x4_, _mask, x, x, \ + VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0), \ + 0b00011011, undef(), ~0) # endif # elif FLOAT_SIZE == 8 # if VEC_SIZE >= 32 @@ -233,6 +238,7 @@ static inline bool _to_bool(byte_vec_t b # define interleave_hi(x, y) B(unpckhpd, _mask, x, y, undef(), ~0) # define interleave_lo(x, y) B(unpcklpd, _mask, x, y, undef(), ~0) # define swap(x) B(shufpd, _mask, x, x, 0b01, undef(), ~0) +# define swap2(x) B_(vpermilpd, _mask, x, 0b01, undef(), ~0) # else # define interleave_hi(x, y) B(vpermi2varpd, _mask, x, interleave_hi, y, ~0) # define interleave_lo(x, y) B(vpermt2varpd, _mask, interleave_lo, x, y, ~0) @@ -240,6 +246,10 @@ static inline bool _to_bool(byte_vec_t b vec_t t_ = B(shuf_f64x2_, _mask, x, x, VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0); \ B(shufpd, _mask, t_, t_, 0b01010101, undef(), ~0); \ }) +# define swap2(x) B(vpermilpd, _mask, \ + B(shuf_f64x2_, _mask, x, x, \ + VEC_SIZE == 32 ? 0b01 : 0b00011011, undef(), ~0), \ + 0b01010101, undef(), ~0) # endif # endif #elif FLOAT_SIZE == 4 && defined(__SSE__) @@ -405,6 +415,7 @@ static inline bool _to_bool(byte_vec_t b B(shuf_i32x4_, _mask, (vsi_t)(x), (vsi_t)(x), \ VEC_SIZE == 32 ? 0b01 : 0b00011011, (vsi_t)undef(), ~0), \ 0b00011011, (vsi_t)undef(), ~0)) +# define swap2(x) ((vec_t)B_(permvarsi, _mask, (vsi_t)(x), (vsi_t)(inv - 1), (vsi_t)undef(), ~0)) # endif # define mix(x, y) ((vec_t)B(movdqa32_, _mask, (vsi_t)(x), (vsi_t)(y), \ (0b0101010101010101 & ((1 << ELEM_COUNT) - 1)))) @@ -442,8 +453,17 @@ static inline bool _to_bool(byte_vec_t b (vsi_t)B(shuf_i64x2_, _mask, (vdi_t)(x), (vdi_t)(x), \ VEC_SIZE == 32 ? 0b01 : 0b00011011, (vdi_t)undef(), ~0), \ 0b01001110, (vsi_t)undef(), ~0)) +# define swap2(x) ((vec_t)B(permvardi, _mask, (vdi_t)(x), (vdi_t)(inv - 1), (vdi_t)undef(), ~0)) # endif # define mix(x, y) ((vec_t)B(movdqa64_, _mask, (vdi_t)(x), (vdi_t)(y), 0b01010101)) +# if VEC_SIZE == 32 +# define swap3(x) ((vec_t)B_(permdi, _mask, (vdi_t)(x), 0b00011011, (vdi_t)undef(), ~0)) +# elif VEC_SIZE == 64 +# define swap3(x) ({ \ + vdi_t t_ = B_(permdi, _mask, (vdi_t)(x), 0b00011011, (vdi_t)undef(), ~0); \ + B(shuf_i64x2_, _mask, t_, t_, 0b01001110, (vdi_t)undef(), ~0); \ +}) +# endif # endif # if INT_SIZE == 4 # define max(x, y) B(pmaxsd, _mask, x, y, undef(), ~0) @@ -489,6 +509,9 @@ static inline bool _to_bool(byte_vec_t b # define shrink1(x) ((half_t)B(pmovwb, _mask, (vhi_t)(x), (vqi_half_t){}, ~0)) # define shrink2(x) ((quarter_t)B(pmovdb, _mask, (vsi_t)(x), (vqi_quarter_t){}, ~0)) # define shrink3(x) ((eighth_t)B(pmovqb, _mask, (vdi_t)(x), (vqi_eighth_t){}, ~0)) +# ifdef __AVX512VBMI__ +# define swap2(x) ((vec_t)B(permvarqi, _mask, (vqi_t)(x), (vqi_t)(inv - 1), (vqi_t)undef(), ~0)) +# endif # elif INT_SIZE == 2 || UINT_SIZE == 2 # define broadcast(x) ({ \ vec_t t_; \ @@ -517,6 +540,7 @@ static inline bool _to_bool(byte_vec_t b (0b01010101010101010101010101010101 & ALL_TRUE))) # define shrink1(x) ((half_t)B(pmovdw, _mask, (vsi_t)(x), (vhi_half_t){}, ~0)) # define shrink2(x) ((quarter_t)B(pmovqw, _mask, (vdi_t)(x), (vhi_quarter_t){}, ~0)) +# define swap2(x) ((vec_t)B(permvarhi, _mask, (vhi_t)(x), (vhi_t)(inv - 1), (vhi_t)undef(), ~0)) # endif # if INT_SIZE == 1 # define max(x, y) ((vec_t)B(pmaxsb, _mask, (vqi_t)(x), (vqi_t)(y), (vqi_t)undef(), ~0)) @@ -1325,6 +1349,12 @@ int simd_test(void) if ( !eq(swap2(src), inv) ) return __LINE__; #endif +#ifdef swap3 + touch(src); + if ( !eq(swap3(src), inv) ) return __LINE__; + touch(src); +#endif + #ifdef broadcast if ( !eq(broadcast(ELEM_COUNT + 1), src + inv) ) return __LINE__; #endif --- a/tools/tests/x86_emulator/simd.h +++ b/tools/tests/x86_emulator/simd.h @@ -275,6 +275,8 @@ OVR(movlps); OVR_VFP(movnt); OVR_VFP(movu); OVR_FP(mul); +OVR_VFP(perm); +OVR_VFP(permil); OVR_VFP(shuf); OVR_INT(sll); OVR_DQ(sllv); @@ -331,6 +333,8 @@ OVR(movntdq); OVR(movntdqa); OVR(movshdup); OVR(movsldup); +OVR(permd); +OVR(permq); OVR(pmovsxbd); OVR(pmovsxbq); OVR(pmovsxdq); --- a/xen/arch/x86/x86_emulate/x86_emulate.c +++ b/xen/arch/x86/x86_emulate/x86_emulate.c @@ -434,7 +434,8 @@ static const struct ext0f38_table { } ext0f38_table[256] = { [0x00] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, [0x01 ... 0x0b] = { .simd_size = simd_packed_int }, - [0x0c ... 0x0f] = { .simd_size = simd_packed_fp }, + [0x0c ... 0x0d] = { .simd_size = simd_packed_fp, .d8s = d8s_vl }, + [0x0e ... 0x0f] = { .simd_size = simd_packed_fp }, [0x10 ... 0x12] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, [0x13] = { .simd_size = simd_other, .two_op = 1 }, [0x14 ... 0x16] = { .simd_size = simd_packed_fp, .d8s = d8s_vl }, @@ -477,6 +478,7 @@ static const struct ext0f38_table { [0x7d ... 0x7e] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, [0x7f] = { .simd_size = simd_packed_fp, .d8s = d8s_vl }, [0x8c] = { .simd_size = simd_packed_int }, + [0x8d] = { .simd_size = simd_packed_int, .d8s = d8s_vl }, [0x8e] = { .simd_size = simd_packed_int, .to_mem = 1 }, [0x90 ... 0x93] = { .simd_size = simd_other, .vsib = 1 }, [0x96 ... 0x98] = { .simd_size = simd_packed_fp, .d8s = d8s_vl }, @@ -522,10 +524,10 @@ static const struct ext0f3a_table { uint8_t four_op:1; disp8scale_t d8s:4; } ext0f3a_table[256] = { - [0x00] = { .simd_size = simd_packed_int, .two_op = 1 }, - [0x01] = { .simd_size = simd_packed_fp, .two_op = 1 }, + [0x00] = { .simd_size = simd_packed_int, .two_op = 1, .d8s = d8s_vl }, + [0x01] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl }, [0x02] = { .simd_size = simd_packed_int }, - [0x04 ... 0x05] = { .simd_size = simd_packed_fp, .two_op = 1 }, + [0x04 ... 0x05] = { .simd_size = simd_packed_fp, .two_op = 1, .d8s = d8s_vl }, [0x06] = { .simd_size = simd_packed_fp }, [0x08 ... 0x09] = { .simd_size = simd_packed_fp, .two_op = 1 }, [0x0a ... 0x0b] = { .simd_size = simd_scalar_opc }, @@ -8102,6 +8104,9 @@ x86_emulate( case X86EMUL_OPC_EVEX_66(0x0f, 0xf2): /* vpslld xmm/m128,[xyz]mm,[xyz]mm{k} */ case X86EMUL_OPC_EVEX_66(0x0f, 0xf3): /* vpsllq xmm/m128,[xyz]mm,[xyz]mm{k} */ generate_exception_if(evex.brs, EXC_UD); + /* fall through */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x0c): /* vpermilps [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x0d): /* vpermilpd [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ fault_suppression = false; if ( b == 0xe2 ) goto avx512f_no_sae; @@ -8447,6 +8452,12 @@ x86_emulate( generate_exception_if(!vex.l || vex.w, EXC_UD); goto simd_0f_avx2; + case X86EMUL_OPC_EVEX_66(0x0f38, 0x16): /* vpermp{s,d} {y,z}mm/mem,{y,z}mm,{y,z}mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x36): /* vperm{d,q} {y,z}mm/mem,{y,z}mm,{y,z}mm{k} */ + generate_exception_if(!evex.lr, EXC_UD); + fault_suppression = false; + goto avx512f_no_sae; + case X86EMUL_OPC_VEX_66(0x0f38, 0x20): /* vpmovsxbw xmm/mem,{x,y}mm */ case X86EMUL_OPC_VEX_66(0x0f38, 0x21): /* vpmovsxbd xmm/mem,{x,y}mm */ case X86EMUL_OPC_VEX_66(0x0f38, 0x22): /* vpmovsxbq xmm/mem,{x,y}mm */ @@ -8652,6 +8663,7 @@ x86_emulate( case X86EMUL_OPC_EVEX_66(0x0f38, 0x75): /* vpermi2{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ case X86EMUL_OPC_EVEX_66(0x0f38, 0x7d): /* vpermt2{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f38, 0x8d): /* vperm{b,w} [xyz]mm/mem,[xyz]mm,[xyz]mm{k} */ if ( !evex.w ) host_and_vcpu_must_have(avx512_vbmi); else @@ -9077,6 +9089,12 @@ x86_emulate( generate_exception_if(!vex.l || !vex.w, EXC_UD); goto simd_0f_imm8_avx2; + case X86EMUL_OPC_EVEX_66(0x0f3a, 0x00): /* vpermq $imm8,{y,z}mm/mem,{y,z}mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f3a, 0x01): /* vpermpd $imm8,{y,z}mm/mem,{y,z}mm{k} */ + generate_exception_if(!evex.lr || !evex.w, EXC_UD); + fault_suppression = false; + goto avx512f_imm8_no_sae; + case X86EMUL_OPC_VEX_66(0x0f3a, 0x38): /* vinserti128 $imm8,xmm/m128,ymm,ymm */ case X86EMUL_OPC_VEX_66(0x0f3a, 0x39): /* vextracti128 $imm8,ymm,xmm/m128 */ case X86EMUL_OPC_VEX_66(0x0f3a, 0x46): /* vperm2i128 $imm8,ymm/m256,ymm,ymm */ @@ -9096,6 +9114,12 @@ x86_emulate( generate_exception_if(vex.w, EXC_UD); goto simd_0f_imm8_avx; + case X86EMUL_OPC_EVEX_66(0x0f3a, 0x04): /* vpermilps $imm8,[xyz]mm/mem,[xyz]mm{k} */ + case X86EMUL_OPC_EVEX_66(0x0f3a, 0x05): /* vpermilpd $imm8,[xyz]mm/mem,[xyz]mm{k} */ + generate_exception_if(evex.w != (b & 1), EXC_UD); + fault_suppression = false; + goto avx512f_imm8_no_sae; + case X86EMUL_OPC_66(0x0f3a, 0x08): /* roundps $imm8,xmm/m128,xmm */ case X86EMUL_OPC_66(0x0f3a, 0x09): /* roundpd $imm8,xmm/m128,xmm */ case X86EMUL_OPC_66(0x0f3a, 0x0a): /* roundss $imm8,xmm/m128,xmm */