From patchwork Fri Mar 15 11:08:05 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Patchwork-Submitter: Jan Beulich X-Patchwork-Id: 10854559 Return-Path: Received: from mail.wl.linuxfoundation.org (pdx-wl-mail.web.codeaurora.org [172.30.200.125]) by pdx-korg-patchwork-2.web.codeaurora.org (Postfix) with ESMTP id 3C6D314DE for ; Fri, 15 Mar 2019 11:09:55 +0000 (UTC) Received: from mail.wl.linuxfoundation.org (localhost [127.0.0.1]) by mail.wl.linuxfoundation.org (Postfix) with ESMTP id 1C8E22A94C for ; Fri, 15 Mar 2019 11:09:55 +0000 (UTC) Received: by mail.wl.linuxfoundation.org (Postfix, from userid 486) id 0F1222A94E; Fri, 15 Mar 2019 11:09:55 +0000 (UTC) X-Spam-Checker-Version: SpamAssassin 3.3.1 (2010-03-16) on pdx-wl-mail.web.codeaurora.org X-Spam-Level: X-Spam-Status: No, score=-5.2 required=2.0 tests=BAYES_00,MAILING_LIST_MULTI, RCVD_IN_DNSWL_MED autolearn=ham version=3.3.1 Received: from lists.xenproject.org (lists.xenproject.org [192.237.175.120]) (using TLSv1.2 with cipher AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by mail.wl.linuxfoundation.org (Postfix) with ESMTPS id 0A1302A94C for ; Fri, 15 Mar 2019 11:09:54 +0000 (UTC) Received: from localhost ([127.0.0.1] helo=lists.xenproject.org) by lists.xenproject.org with esmtp (Exim 4.89) (envelope-from ) id 1h4khA-0002G2-VE; Fri, 15 Mar 2019 11:08:12 +0000 Received: from us1-rack-dfw2.inumbo.com ([104.130.134.6]) by lists.xenproject.org with esmtp (Exim 4.89) (envelope-from ) id 1h4kh8-0002Fg-Vq for xen-devel@lists.xenproject.org; Fri, 15 Mar 2019 11:08:11 +0000 X-Inumbo-ID: 9dac069d-4712-11e9-bc90-bc764e045a96 Received: from prv1-mh.provo.novell.com (unknown [137.65.248.33]) by us1-rack-dfw2.inumbo.com (Halon) with ESMTPS id 9dac069d-4712-11e9-bc90-bc764e045a96; Fri, 15 Mar 2019 11:08:09 +0000 (UTC) Received: from INET-PRV1-MTA by prv1-mh.provo.novell.com with Novell_GroupWise; Fri, 15 Mar 2019 05:08:08 -0600 Message-Id: 
<5C8B8795020000780021F32C@prv1-mh.provo.novell.com> X-Mailer: Novell GroupWise Internet Agent 18.1.0 Date: Fri, 15 Mar 2019 05:08:05 -0600 From: "Jan Beulich" To: "xen-devel" References: <5B6BF83602000078001DC548@prv1-mh.provo.novell.com> <5C8B7EC0020000780021F10B@prv1-mh.provo.novell.com> In-Reply-To: <5C8B7EC0020000780021F10B@prv1-mh.provo.novell.com> Mime-Version: 1.0 Content-Disposition: inline Subject: [Xen-devel] [PATCH v8 49/50] x86emul: add a SHA test case to the harness X-BeenThere: xen-devel@lists.xenproject.org X-Mailman-Version: 2.1.23 Precedence: list List-Id: Xen developer discussion List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Cc: George Dunlap , Andrew Cooper , Wei Liu , Roger Pau Monne Errors-To: xen-devel-bounces@lists.xenproject.org Sender: "Xen-devel" X-Virus-Scanned: ClamAV using ClamSMTP Also use this for AVX512VL VPRO{L,R}{,V}D as well as some further shifts testing. Signed-off-by: Jan Beulich --- v8: New. --- a/tools/tests/x86_emulator/Makefile +++ b/tools/tests/x86_emulator/Makefile @@ -20,8 +20,9 @@ SIMD := 3dnow sse sse2 sse4 avx avx2 xop FMA := fma4 fma SG := avx2-sg avx512f-sg avx512vl-sg AES := ssse3-aes avx-aes avx2-vaes avx512bw-vaes +SHA := sse4-sha avx-sha avx512f-sha GF := sse2-gf avx2-gf avx512bw-gf -TESTCASES := blowfish $(SIMD) $(FMA) $(SG) $(AES) $(GF) +TESTCASES := blowfish $(SIMD) $(FMA) $(SG) $(AES) $(SHA) $(GF) OPMASK := avx512f avx512dq avx512bw @@ -148,6 +149,10 @@ define simd-aes-defs $(1)-cflags := $(foreach vec,$($(patsubst %-aes,sse,$(1))-vecs) $($(patsubst %-vaes,%,$(1))-vecs), \ "-D_$(vec) -maes $(addprefix -m,$(subst -,$(space),$(1))) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec)") endef +define simd-sha-defs +$(1)-cflags := $(foreach vec,$(sse-vecs), \ + "-D_$(vec) $(addprefix -m,$(subst -,$(space),$(1))) -Os -DVEC_SIZE=$(vec)") +endef define simd-gf-defs $(1)-cflags := $(foreach vec,$($(1:-gf=)-vecs), \ "-D_$(vec) -mgfni -m$(1:-gf=) $(call non-sse,$(1)) -Os -DVEC_SIZE=$(vec)") @@ -159,6 +164,7 @@ 
endef $(foreach flavor,$(SIMD) $(FMA),$(eval $(call simd-defs,$(flavor)))) $(foreach flavor,$(SG),$(eval $(call simd-sg-defs,$(flavor)))) $(foreach flavor,$(AES),$(eval $(call simd-aes-defs,$(flavor)))) +$(foreach flavor,$(SHA),$(eval $(call simd-sha-defs,$(flavor)))) $(foreach flavor,$(GF),$(eval $(call simd-gf-defs,$(flavor)))) $(foreach flavor,$(OPMASK),$(eval $(call opmask-defs,$(flavor)))) @@ -212,10 +218,13 @@ $(addsuffix .c,$(SG)): $(addsuffix .c,$(AES)): ln -sf simd-aes.c $@ +$(addsuffix .c,$(SHA)): + ln -sf simd-sha.c $@ + $(addsuffix .c,$(GF)): ln -sf simd-gf.c $@ -$(addsuffix .h,$(SIMD) $(FMA) $(SG) $(AES) $(GF)): simd.h +$(addsuffix .h,$(SIMD) $(FMA) $(SG) $(AES) $(SHA) $(GF)): simd.h xop.h avx512f.h: simd-fma.c --- /dev/null +++ b/tools/tests/x86_emulator/simd-sha.c @@ -0,0 +1,392 @@ +#define INT_SIZE 4 + +#include "simd.h" +ENTRY(sha_test); + +#define SHA(op, a...) __builtin_ia32_sha ## op(a) + +#ifdef __AVX512F__ +# define ALL_TRUE (~0ULL >> (64 - ELEM_COUNT)) +# define eq(x, y) (B(pcmpeqd, _mask, x, y, -1) == ALL_TRUE) +# define blend(x, y, sel) B(movdqa32_, _mask, y, x, sel) +# define rot_c(f, r, x, n) B(pro ## f ## d, _mask, x, n, undef(), ~0) +# define rot_s(f, r, x, n) ({ /* gcc does not support embedded broadcast */ \ + vec_t r_; \ + asm ( "vpro" #f "vd %2%{1to%c3%}, %1, %0" \ + : "=v" (r_) \ + : "v" (x), "m" (n), "i" (ELEM_COUNT) ); \ + r_; \ +}) +# define rot_v(d, x, n) B(pro ## d ## vd, _mask, x, n, undef(), ~0) +# define shift_s(d, x, n) ({ \ + vec_t r_; \ + asm ( "vps" #d "lvd %2%{1to%c3%}, %1, %0" \ + : "=v" (r_) \ + : "v" (x), "m" (n), "i" (ELEM_COUNT) ); \ + r_; \ +}) +# define vshift(d, x, n) ({ /* gcc does not allow memory operands */ \ + vec_t r_; \ + asm ( "vps" #d "ldq %2, %1, %0" \ + : "=v" (r_) : "m" (x), "i" ((n) * ELEM_SIZE) ); \ + r_; \ +}) +#else +# define to_bool(cmp) (__builtin_ia32_pmovmskb128(cmp) == 0xffff) +# define eq(x, y) to_bool((x) == (y)) +# define blend(x, y, sel) \ + ((vec_t)__builtin_ia32_pblendw128((vhi_t)(x), 
(vhi_t)(y), \ + ((sel) & 1 ? 0x03 : 0) | \ + ((sel) & 2 ? 0x0c : 0) | \ + ((sel) & 4 ? 0x30 : 0) | \ + ((sel) & 8 ? 0xc0 : 0))) +# define rot_c(f, r, x, n) (sh ## f ## _c(x, n) | sh ## r ## _c(x, 32 - (n))) +# define rot_s(f, r, x, n) ({ /* gcc does not allow memory operands */ \ + vec_t r_, t_, n_ = (vec_t){ 32 } - (n); \ + asm ( "ps" #f "ld %2, %0; ps" #r "ld %3, %1; por %1, %0" \ + : "=&x" (r_), "=&x" (t_) \ + : "m" (n), "m" (n_), "0" (x), "1" (x) ); \ + r_; \ +}) +static inline unsigned int rotl(unsigned int x, unsigned int n) +{ + return (x << (n & 0x1f)) | (x >> ((32 - n) & 0x1f)); +} +static inline unsigned int rotr(unsigned int x, unsigned int n) +{ + return (x >> (n & 0x1f)) | (x << ((32 - n) & 0x1f)); +} +# define rot_v(d, x, n) ({ \ + vec_t t_; \ + unsigned int i_; \ + for ( i_ = 0; i_ < ELEM_COUNT; ++i_ ) \ + t_[i_] = rot ## d((x)[i_], (n)[i_]); \ + t_; \ +}) +# define shift_s(d, x, n) ({ \ + vec_t r_; \ + asm ( "ps" #d "ld %1, %0" : "=&x" (r_) : "m" (n), "0" (x) ); \ + r_; \ +}) +# define vshift(d, x, n) \ + (vec_t)(__builtin_ia32_ps ## d ## ldqi128((vdi_t)(x), (n) * ELEM_SIZE * 8)) +#endif + +#define alignr(x, y, n) ((vec_t)__builtin_ia32_palignr128((vdi_t)(x), (vdi_t)(y), (n) * 8)) +#define hadd(x, y) __builtin_ia32_phaddd128(x, y) +#define rol_c(x, n) rot_c(l, r, x, n) +#define rol_s(x, n) rot_s(l, r, x, n) +#define rol_v(x, n...) rot_v(l, x, n) +#define ror_c(x, n) rot_c(r, l, x, n) +#define ror_s(x, n) rot_s(r, l, x, n) +#define ror_v(x, n...) 
rot_v(r, x, n) +#define shl_c(x, n) __builtin_ia32_pslldi128(x, n) +#define shl_s(x, n) shift_s(l, x, n) +#define shr_c(x, n) __builtin_ia32_psrldi128(x, n) +#define shr_s(x, n) shift_s(r, x, n) +#define shuf(x, s) __builtin_ia32_pshufd(x, s) +#define swap(x) shuf(x, 0b00011011) +#define vshl(x, n) vshift(l, x, n) +#define vshr(x, n) vshift(r, x, n) + +static inline vec_t sha256_sigma0(vec_t w) +{ + vec_t res; + + touch(w); + res = ror_c(w, 7); + touch(w); + res ^= rol_c(w, 14); + touch(w); + res ^= shr_c(w, 3); + touch(w); + + return res; +} + +static inline vec_t sha256_sigma1(vec_t w) +{ + vec_t _17 = { 17 }, _19 = { 19 }, _10 = { 10 }; + + return ror_s(w, _17) ^ ror_s(w, _19) ^ shr_s(w, _10); +} + +static inline vec_t sha256_Sigma0(vec_t w) +{ + vec_t res, n1 = { 0, 0, 2, 2 }, n2 = { 0, 0, 13, 13 }, n3 = { 0, 0, 10, 10 }; + + touch(n1); + res = ror_v(w, n1); + touch(n2); + res ^= ror_v(w, n2); + touch(n3); + + return res ^ rol_v(w, n3); +} + +static inline vec_t sha256_Sigma1(vec_t w) +{ + return ror_c(w, 6) ^ ror_c(w, 11) ^ rol_c(w, 7); +} + +int sha_test(void) +{ + unsigned int i; + vec_t src, one = { 1 }; + vqi_t raw = {}; + + for ( i = 1; i < VEC_SIZE; ++i ) + raw[i] = i; + src = (vec_t)raw; + + for ( i = 0; i < 256; i += VEC_SIZE ) + { + vec_t x, y, tmp, hash = -src; + vec_t a, b, c, d, e, g, h; + unsigned int k, r; + + touch(src); + x = SHA(1msg1, hash, src); + touch(src); + y = hash ^ alignr(hash, src, 8); + touch(src); + + if ( !eq(x, y) ) return __LINE__; + + touch(src); + x = SHA(1msg2, hash, src); + touch(src); + tmp = hash ^ alignr(src, hash, 12); + touch(tmp); + y = rol_c(tmp, 1); + tmp = hash ^ alignr(src, y, 12); + touch(tmp); + y = rol_c(tmp, 1); + + if ( !eq(x, y) ) return __LINE__; + + touch(src); + x = SHA(1msg2, hash, src); + touch(src); + tmp = rol_s(hash ^ alignr(src, hash, 12), one); + y = rol_s(hash ^ alignr(src, tmp, 12), one); + + if ( !eq(x, y) ) return __LINE__; + + touch(src); + x = SHA(1nexte, hash, src); + touch(src); + 
touch(hash); + tmp = rol_c(hash, 30); + tmp[2] = tmp[1] = tmp[0] = 0; + + if ( !eq(x, src + tmp) ) return __LINE__; + + /* + * SHA1RNDS4 + * + * SRC1 = { A0, B0, C0, D0 } + * SRC2 = W' = { W[0]E0, W[1], W[2], W[3] } + * + * (NB that the notation is not C-like, i.e. elements are listed + * high-to-low everywhere in this comment.) + * + * In order to pick a simple rounds function, an immediate value of + * 1 is used; 3 would also be a possibility. + * + * Applying + * + * A1 = ROL5(A0) + (B0 ^ C0 ^ D0) + W'[0] + K + * E1 = D0 + * D1 = C0 + * C1 = ROL30(B0) + * B1 = A0 + * + * iteratively four times and resolving round variable values to + * A and B0, C0, and D0 we get + * + * A4 = ROL5(A3) + (A2 ^ ROL30(A1) ^ ROL30(A0)) + W'[3] + ROL30(B0) + K + * A3 = ROL5(A2) + (A1 ^ ROL30(A0) ^ ROL30(B0)) + W'[2] + C0 + K + * A2 = ROL5(A1) + (A0 ^ ROL30(B0) ^ C0 ) + W'[1] + D0 + K + * A1 = ROL5(A0) + (B0 ^ C0 ^ D0 ) + W'[0] + K + * + * (respective per-column variable names: + * y a b c d src e k + * ) + * + * with + * + * B4 = A3 + * C4 = ROL30(A2) + * D4 = ROL30(A1) + * E4 = ROL30(A0) + * + * and hence + * + * DST = { A4, A3, ROL30(A2), ROL30(A1) } + */ + + touch(src); + x = SHA(1rnds4, hash, src, 1); + touch(src); + + a = vshr(hash, 3); + b = vshr(hash, 2); + touch(hash); + d = rol_c(hash, 30); + touch(hash); + d = blend(d, hash, 0b0011); + c = vshr(d, 1); + e = vshl(d, 1); + tmp = (vec_t){}; + k = rol_c(SHA(1rnds4, tmp, tmp, 1), 2)[0]; + + for ( r = 0; r < 4; ++r ) + { + y = rol_c(a, 5) + (b ^ c ^ d) + swap(src) + e + k; + + switch ( r ) + { + case 0: + c[3] = rol_c(y, 30)[0]; + /* fall through */ + case 1: + b[r + 2] = y[r]; + /* fall through */ + case 2: + a[r + 1] = y[r]; + break; + } + + switch ( r ) + { + case 3: + if ( a[3] != y[2] ) return __LINE__; + /* fall through */ + case 2: + if ( a[2] != y[1] ) return __LINE__; + if ( b[3] != y[1] ) return __LINE__; + /* fall through */ + case 1: + if ( a[1] != y[0] ) return __LINE__; + if ( b[2] != y[0] ) return __LINE__; + if ( 
c[3] != rol_c(y, 30)[0] ) return __LINE__; + break; + } + } + + a = blend(rol_c(y, 30), y, 0b1100); + + if ( !eq(x, a) ) return __LINE__; + + touch(src); + x = SHA(256msg1, hash, src); + touch(src); + y = hash + sha256_sigma0(alignr(src, hash, 4)); + + if ( !eq(x, y) ) return __LINE__; + + touch(src); + x = SHA(256msg2, hash, src); + touch(src); + tmp = hash + sha256_sigma1(alignr(hash, src, 8)); + y = hash + sha256_sigma1(alignr(tmp, src, 8)); + + if ( !eq(x, y) ) return __LINE__; + + /* + * SHA256RNDS2 + * + * SRC1 = { C0, D0, G0, H0 } + * SRC2 = { A0, B0, E0, F0 } + * XMM0 = W' = { ?, ?, WK1, WK0 } + * + * (NB that the notation again is not C-like, i.e. elements are listed + * high-to-low everywhere in this comment.) + * + * Ch(E,F,G) = (E & F) ^ (~E & G) + * Maj(A,B,C) = (A & B) ^ (A & C) ^ (B & C) + * + * Σ0(A) = ROR2(A) ^ ROR13(A) ^ ROR22(A) + * Σ1(E) = ROR6(E) ^ ROR11(E) ^ ROR25(E) + * + * Applying + * + * A1 = Ch(E0, F0, G0) + Σ1(E0) + WK0 + H0 + Maj(A0, B0, C0) + Σ0(A0) + * B1 = A0 + * C1 = B0 + * D1 = C0 + * E1 = Ch(E0, F0, G0) + Σ1(E0) + WK0 + H0 + D0 + * F1 = E0 + * G1 = F0 + * H1 = G0 + * + * iteratively twice and resolving round variable values to + * A / E and B0, C0, D0, F0, G0, and H0 we get + * + * A2 = Ch(E1, E0, F0) + Σ1(E1) + WK1 + G0 + Maj(A1, A0, B0) + Σ0(A1) + * A1 = Ch(E0, F0, G0) + Σ1(E0) + WK0 + H0 + Maj(A0, B0, C0) + Σ0(A0) + * E2 = Ch(E1, E0, F0) + Σ1(E1) + WK1 + G0 + C0 + * E1 = Ch(E0, F0, G0) + Σ1(E0) + WK0 + H0 + D0 + * + * with + * + * B2 = A1 + * F2 = E1 + * + * and hence + * + * DST = { A2, A1, E2, E1 } + * + * which we can simplify a little, by letting A0, B0, and E0 be zero + * and F0 = ~G0, and by then utilizing + * + * Ch(0, 0, x) = x + * Ch(x, 0, y) = ~x & y + * Maj(x, 0, 0) = Maj(0, x, 0) = Maj(0, 0, x) = 0 + * + * A2 = (~E1 & F0) + Σ1(E1) + WK1 + G0 + Σ0(A1) + * A1 = (~E0 & G0) + Σ1(E0) + WK0 + H0 + Σ0(A0) + * E2 = (~E1 & F0) + Σ1(E1) + WK1 + G0 + C0 + * E1 = (~E0 & G0) + Σ1(E0) + WK0 + H0 + D0 + * + * (respective
per-column variable names: + * y e g e src h d + * ) + */ + + tmp = (vec_t){ ~hash[1] }; + touch(tmp); + x = SHA(256rnds2, hash, tmp, src); + touch(tmp); + + e = y = (vec_t){}; + d = alignr(y, hash, 8); + g = (vec_t){ hash[1], tmp[0], hash[1], tmp[0] }; + h = shuf(hash, 0b01000100); + + for ( r = 0; r < 2; ++r ) + { + y = (~e & g) + sha256_Sigma1(e) + shuf(src, 0b01000100) + + h + sha256_Sigma0(d); + + if ( !r ) + { + d[3] = y[2]; + e[3] = e[1] = y[0]; + } + else if ( d[3] != y[2] ) + return __LINE__; + else if ( e[1] != y[0] ) + return __LINE__; + else if ( e[3] != y[0] ) + return __LINE__; + } + + if ( !eq(x, y) ) return __LINE__; + + src += 0x01010101 * VEC_SIZE; + } + + return 0; +} --- a/tools/tests/x86_emulator/test_x86_emulator.c +++ b/tools/tests/x86_emulator/test_x86_emulator.c @@ -14,8 +14,10 @@ asm ( ".pushsection .test, \"ax\", @prog #include "sse2-gf.h" #include "ssse3-aes.h" #include "sse4.h" +#include "sse4-sha.h" #include "avx.h" #include "avx-aes.h" +#include "avx-sha.h" #include "fma4.h" #include "fma.h" #include "avx2.h" @@ -28,6 +30,7 @@ asm ( ".pushsection .test, \"ax\", @prog #include "avx512bw-opmask.h" #include "avx512f.h" #include "avx512f-sg.h" +#include "avx512f-sha.h" #include "avx512vl-sg.h" #include "avx512bw.h" #include "avx512bw-vaes.h" @@ -155,6 +158,21 @@ static bool simd_check_avx512vbmi_vl(voi return cpu_has_avx512_vbmi && cpu_has_avx512vl; } +static bool simd_check_sse4_sha(void) +{ + return cpu_has_sha && cpu_has_sse4_2; +} + +static bool simd_check_avx_sha(void) +{ + return cpu_has_sha && cpu_has_avx; +} + +static bool simd_check_avx512f_sha_vl(void) +{ + return cpu_has_sha && cpu_has_avx512vl; +} + static bool simd_check_avx2_vaes(void) { return cpu_has_aesni && cpu_has_vaes && cpu_has_avx2; @@ -450,6 +468,9 @@ static const struct { AVX512VL(_VBMI+VL u16x8, avx512vbmi, 16u2), AVX512VL(_VBMI+VL s16x16, avx512vbmi, 32i2), AVX512VL(_VBMI+VL u16x16, avx512vbmi, 32u2), + SIMD(SHA, sse4_sha, 16), + SIMD(AVX+SHA, avx_sha, 16), + 
AVX512VL(VL+SHA, avx512f_sha, 16), SIMD(VAES (VEX/x32), avx2_vaes, 32), SIMD(VAES (EVEX/x64), avx512bw_vaes, 64), AVX512VL(VL+VAES (x16), avx512bw_vaes, 16), --- a/tools/tests/x86_emulator/x86-emulate.h +++ b/tools/tests/x86_emulator/x86-emulate.h @@ -142,6 +142,7 @@ static inline bool xcr0_mask(uint64_t ma #define cpu_has_avx512_ifma (cp.feat.avx512_ifma && xcr0_mask(0xe6)) #define cpu_has_avx512er (cp.feat.avx512er && xcr0_mask(0xe6)) #define cpu_has_avx512cd (cp.feat.avx512cd && xcr0_mask(0xe6)) +#define cpu_has_sha cp.feat.sha #define cpu_has_avx512bw (cp.feat.avx512bw && xcr0_mask(0xe6)) #define cpu_has_avx512vl (cp.feat.avx512vl && xcr0_mask(0xe6)) #define cpu_has_avx512_vbmi (cp.feat.avx512_vbmi && xcr0_mask(0xe6))