@@ -11,4 +11,5 @@ AFLAGS_chacha20-mips.o += -O2 # This is required to fill the branch delay slots
obj-$(CONFIG_ZINC_CHACHA20) += zinc_chacha20.o
zinc_poly1305-y := poly1305/poly1305.o
+zinc_poly1305-$(CONFIG_ZINC_ARCH_X86_64) += poly1305/poly1305-x86_64.o
obj-$(CONFIG_ZINC_POLY1305) += zinc_poly1305.o
new file mode 100644
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/*
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#include <asm/cpufeature.h>
+#include <asm/processor.h>
+#include <asm/intel-family.h>
+
+/*
+ * Entry points implemented in assembly (poly1305-x86_64.S).  In every routine
+ * ctx is the opaque state that the C code in this file views as
+ * struct poly1305_arch_internal.
+ */
+
+/* Scalar routines; the C wrappers fall back to these when SIMD is not used. */
+asmlinkage void poly1305_init_x86_64(void *ctx,
+				     const u8 key[POLY1305_KEY_SIZE]);
+asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
+				       const size_t len, const u32 padbit);
+asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+				     const u32 nonce[4]);
+
+/* AVX routines; assembled only under CONFIG_AS_AVX. */
+asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp,
+				    const size_t len, const u32 padbit);
+asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+				  const u32 nonce[4]);
+
+/* AVX2 routine; assembled only under CONFIG_AS_AVX2. */
+asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp,
+				     const size_t len, const u32 padbit);
+
+/* AVX-512 routine; assembled only under CONFIG_AS_AVX512. */
+asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp,
+				       const size_t len, const u32 padbit);
+
+/* Which vector implementations may be used; assigned by poly1305_fpu_init(). */
+static bool poly1305_use_avx __ro_after_init;
+static bool poly1305_use_avx2 __ro_after_init;
+static bool poly1305_use_avx512 __ro_after_init;
+
+/*
+ * Init-time table of pointers to the three flags above.
+ * NOTE(review): nothing in this file reads the table; presumably it is
+ * consumed by the code that includes this file (e.g. a self-test that toggles
+ * each implementation) -- confirm against the includer.
+ */
+static bool *const poly1305_nobs[] __initconst = {
+	&poly1305_use_avx,
+	&poly1305_use_avx2,
+	&poly1305_use_avx512,
+};
+
+/*
+ * Decide, once at init time, which vector implementations may be used.
+ *
+ * Each level requires the CPUID feature bits of all lower levels plus the
+ * matching extended register state to be enabled (checked with
+ * cpu_has_xfeatures()).  AVX-512 is additionally refused on Skylake-X.
+ */
+static void __init poly1305_fpu_init(void)
+{
+	const bool cpu_avx = boot_cpu_has(X86_FEATURE_AVX);
+	const bool cpu_avx2 = cpu_avx && boot_cpu_has(X86_FEATURE_AVX2);
+	const bool cpu_avx512f = cpu_avx2 &&
+				 boot_cpu_has(X86_FEATURE_AVX512F);
+	/*
+	 * Model numbers are only meaningful within one vendor and family, and
+	 * INTEL_FAM6_SKYLAKE_X is an Intel family 6 model.  Match on all three
+	 * so that an unrelated CPU that happens to report the same model
+	 * number is not denied the AVX-512 path.
+	 */
+	const bool is_skylake_x =
+		boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+		boot_cpu_data.x86 == 6 &&
+		boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X;
+
+	poly1305_use_avx =
+		cpu_avx &&
+		cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+	poly1305_use_avx2 =
+		cpu_avx2 &&
+		cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+	poly1305_use_avx512 =
+		cpu_avx512f &&
+		cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM |
+				  XFEATURE_MASK_AVX512, NULL) &&
+		/* Skylake downclocks unacceptably much when using zmm. */
+		!is_skylake_x;
+}
+
+/*
+ * Set up the Poly1305 state in ctx from key by delegating to the scalar
+ * assembly initialiser; no vector-specific setup happens here.
+ *
+ * Always returns true.
+ */
+static inline bool
+poly1305_init_arch(void *ctx, const u8 key[POLY1305_KEY_SIZE])
+{
+	poly1305_init_x86_64(ctx, key);
+
+	return true;
+}
+
+/*
+ * C view of the opaque ctx that is handed to the assembly routines.  The
+ * assembly addresses these fields by fixed byte offsets, so member order and
+ * sizes must not change.
+ */
+struct poly1305_arch_internal {
+	/* Accumulator, bytes 0..23, in one of two representations. */
+	union {
+		struct {
+			/* Five limbs, as used by the vector code. */
+			u32 h[5];
+			/* Non-zero while the accumulator is held in h[]. */
+			u32 is_base2_26;
+		};
+		/* Three 64-bit words, as used by the scalar code. */
+		u64 hs[3];
+	};
+	/* Clamped key words, stored at bytes 24 and 32 by the init routine. */
+	u64 r[2];
+	u64 pad;
+	/*
+	 * Table starting at byte 48 that the AVX initialisation code fills.
+	 * NOTE(review): the member names suggest limbs of r^1..r^4 -- confirm
+	 * against the assembly before relying on that.
+	 */
+	struct {
+		u32 r2;
+		u32 r1;
+		u32 r4;
+		u32 r3;
+	} rn[9];
+};
+
+/*
+ * The vector code keeps the accumulator in base 2^26, the scalar code in base
+ * 2^64.  When a state last touched by the vector code has to be continued by
+ * the scalar code -- because the caller invoked the update function from two
+ * different contexts -- the accumulator must first be converted back.
+ *
+ * One could argue from the implementation invariants that the initial carry
+ * propagation alone is enough.  Since this path is not performance critical,
+ * the full reduction is done anyway to leave no room for doubt.
+ *
+ * Does nothing when the state is already in base 2^64.
+ */
+static void convert_to_base2_64(void *ctx)
+{
+	struct poly1305_arch_internal *state = ctx;
+	u32 h0, h1, h2, h3, h4, cy;
+	u64 d0, d1, d2;
+
+	if (!state->is_base2_26)
+		return;
+
+	/*
+	 * Work on copies: h[] and hs[] share storage, so the limbs are read
+	 * out completely before any 64-bit word is written back.
+	 */
+	h0 = state->h[0];
+	h1 = state->h[1];
+	h2 = state->h[2];
+	h3 = state->h[3];
+	h4 = state->h[4];
+
+	/* Propagate carries so that the four low limbs fit in 26 bits. */
+	h1 += h0 >> 26;
+	h0 &= 0x3ffffff;
+	h2 += h1 >> 26;
+	h1 &= 0x3ffffff;
+	h3 += h2 >> 26;
+	h2 &= 0x3ffffff;
+	h4 += h3 >> 26;
+	h3 &= 0x3ffffff;
+
+	/* Repack 5 x 26 bits into 64 + 64 + remaining high bits. */
+	d0 = ((u64)h2 << 52) | ((u64)h1 << 26) | h0;
+	d1 = ((u64)h4 << 40) | ((u64)h3 << 14) | (h2 >> 12);
+	d2 = h4 >> 24;
+
+/* Branch-free unsigned "a < b": yields 1 if a < b, otherwise 0. */
+#define ULT(a, b) \
+	(((a) ^ (((a) ^ (b)) | (((a) - (b)) ^ (b)))) >> (sizeof(a) * 8 - 1))
+	/* Fold everything above bit 130 back in, multiplied by 5. */
+	cy = (d2 >> 2) + (d2 & ~3ULL);
+	d2 &= 3;
+	d0 += cy;
+	cy = ULT(d0, cy);
+	d1 += cy;
+	d2 += ULT(d1, cy);
+#undef ULT
+
+	state->hs[0] = d0;
+	state->hs[1] = d1;
+	state->hs[2] = d2;
+	state->is_base2_26 = 0;
+}
+
+/*
+ * Absorb len bytes from inp into the Poly1305 state in ctx, using the widest
+ * implementation that is permitted right now.  Always returns true.
+ *
+ * The vector code runs only if AVX was enabled at init time, the input is at
+ * least 18 blocks long or the state is already in base 2^26, and SIMD may be
+ * used in the current context.  Otherwise the state is converted back to base
+ * 2^64 if necessary and the scalar code processes the whole input.
+ */
+static inline bool poly1305_blocks_arch(void *ctx, const u8 *inp,
+					size_t len, const u32 padbit,
+					simd_context_t *simd_context)
+{
+	struct poly1305_arch_internal *state = ctx;
+	size_t chunk;
+
+	/* SIMD disables preemption, so relax after processing each page. */
+	BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE ||
+		     PAGE_SIZE % POLY1305_BLOCK_SIZE);
+
+	if (IS_ENABLED(CONFIG_AS_AVX) && poly1305_use_avx &&
+	    (len >= (POLY1305_BLOCK_SIZE * 18) || state->is_base2_26) &&
+	    simd_use(simd_context)) {
+		do {
+			/* Feed at most one page per call. */
+			chunk = min_t(size_t, len, PAGE_SIZE);
+
+			if (IS_ENABLED(CONFIG_AS_AVX512) &&
+			    poly1305_use_avx512)
+				poly1305_blocks_avx512(ctx, inp, chunk,
+						       padbit);
+			else if (IS_ENABLED(CONFIG_AS_AVX2) &&
+				 poly1305_use_avx2)
+				poly1305_blocks_avx2(ctx, inp, chunk, padbit);
+			else
+				poly1305_blocks_avx(ctx, inp, chunk, padbit);
+
+			inp += chunk;
+			len -= chunk;
+			/* Only relax between chunks, not after the last. */
+			if (len)
+				simd_relax(simd_context);
+		} while (len);
+
+		return true;
+	}
+
+	/* Scalar fallback; it needs the accumulator in base 2^64. */
+	convert_to_base2_64(ctx);
+	poly1305_blocks_x86_64(ctx, inp, len, padbit);
+
+	return true;
+}
+
+/*
+ * Finalise the computation: write the tag for the state in ctx, combined with
+ * nonce, to mac.  Always returns true.
+ *
+ * The AVX emit routine is used only when AVX was enabled at init time, the
+ * accumulator is currently in base 2^26 and SIMD may be used in this context.
+ * In every other case the accumulator is converted to base 2^64 (a no-op if
+ * it already is) and the scalar emit routine runs.
+ */
+static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE],
+				      const u32 nonce[4],
+				      simd_context_t *simd_context)
+{
+	struct poly1305_arch_internal *state = ctx;
+
+	if (IS_ENABLED(CONFIG_AS_AVX) && poly1305_use_avx &&
+	    state->is_base2_26 && simd_use(simd_context)) {
+		poly1305_emit_avx(ctx, mac, nonce);
+		return true;
+	}
+
+	convert_to_base2_64(ctx);
+	poly1305_emit_x86_64(ctx, mac, nonce);
+
+	return true;
+}
similarity index 58%
rename from lib/zinc/poly1305/poly1305-x86_64-cryptogams.S
rename to lib/zinc/poly1305/poly1305-x86_64.S
@@ -1,22 +1,27 @@
/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
/*
+ * Copyright (C) 2017 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
+ * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
* Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ *
+ * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS.
*/
-.text
-
+#include <linux/linkage.h>
+.section .rodata.cst192.Lconst, "aM", @progbits, 192 /* read-only, mergeable; entity size 192 equals the size of the table below (4 * 32 + 64 bytes) */
+.align 64
+.Lconst: /* Constants for the vector code, which loads this address into %rcx and indexes it by byte offset. */
+.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 /* offset 0: 24-bit mask in each 64-bit lane; NOTE(review): not referenced in the hunks visible here */
+.long 16777216,0,16777216,0,16777216,0,16777216,0 /* offset 32: 1 << 24 in each 64-bit lane; OR-ed (vpor) into the top 26-bit limb of freshly loaded input */
+.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 /* offset 64: 26-bit limb mask in each 64-bit lane; used with vpand */
+.long 2,2,2,3,2,0,2,1 /* offset 96: dword index vector for vpermd when spreading the key-power table on the AVX2 path */
+.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 /* offset 128: 16 dword indices; NOTE(review): presumably a permutation index for the AVX-512 path -- its use is outside the visible hunks */
-.globl poly1305_init
-.hidden poly1305_init
-.globl poly1305_blocks
-.hidden poly1305_blocks
-.globl poly1305_emit
-.hidden poly1305_emit
+.text
-.type poly1305_init,@function
.align 32
-poly1305_init:
+ENTRY(poly1305_init_x86_64)
xorq %rax,%rax
movq %rax,0(%rdi)
movq %rax,8(%rdi)
@@ -25,61 +30,30 @@ poly1305_init:
cmpq $0,%rsi
je .Lno_key
- leaq poly1305_blocks(%rip),%r10
- leaq poly1305_emit(%rip),%r11
- movq OPENSSL_ia32cap_P+4(%rip),%r9
- leaq poly1305_blocks_avx(%rip),%rax
- leaq poly1305_emit_avx(%rip),%rcx
- btq $28,%r9
- cmovcq %rax,%r10
- cmovcq %rcx,%r11
- leaq poly1305_blocks_avx2(%rip),%rax
- btq $37,%r9
- cmovcq %rax,%r10
- movq $2149646336,%rax
- shrq $32,%r9
- andq %rax,%r9
- cmpq %rax,%r9
- je .Linit_base2_44
movq $0x0ffffffc0fffffff,%rax
movq $0x0ffffffc0ffffffc,%rcx
andq 0(%rsi),%rax
andq 8(%rsi),%rcx
movq %rax,24(%rdi)
movq %rcx,32(%rdi)
- movq %r10,0(%rdx)
- movq %r11,8(%rdx)
movl $1,%eax
.Lno_key:
- .byte 0xf3,0xc3
-.size poly1305_init,.-poly1305_init
+ ret
+ENDPROC(poly1305_init_x86_64)
-.type poly1305_blocks,@function
.align 32
-poly1305_blocks:
-.cfi_startproc
+ENTRY(poly1305_blocks_x86_64)
.Lblocks:
shrq $4,%rdx
jz .Lno_data
pushq %rbx
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbp,-24
pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-32
pushq %r13
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r13,-40
pushq %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r14,-48
pushq %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r15,-56
+ pushq %rdi
+
.Lblocks_body:
movq %rdx,%r15
@@ -89,7 +63,7 @@ poly1305_blocks:
movq 0(%rdi),%r14
movq 8(%rdi),%rbx
- movq 16(%rdi),%rbp
+ movq 16(%rdi),%r10
movq %r13,%r12
shrq $2,%r13
@@ -99,14 +73,15 @@ poly1305_blocks:
.align 32
.Loop:
+
addq 0(%rsi),%r14
adcq 8(%rsi),%rbx
leaq 16(%rsi),%rsi
- adcq %rcx,%rbp
+ adcq %rcx,%r10
mulq %r14
movq %rax,%r9
movq %r11,%rax
- movq %rdx,%r10
+ movq %rdx,%rdi
mulq %r14
movq %rax,%r14
@@ -116,62 +91,55 @@ poly1305_blocks:
mulq %rbx
addq %rax,%r9
movq %r13,%rax
- adcq %rdx,%r10
+ adcq %rdx,%rdi
mulq %rbx
- movq %rbp,%rbx
+ movq %r10,%rbx
addq %rax,%r14
adcq %rdx,%r8
imulq %r13,%rbx
addq %rbx,%r9
movq %r8,%rbx
- adcq $0,%r10
+ adcq $0,%rdi
- imulq %r11,%rbp
+ imulq %r11,%r10
addq %r9,%rbx
movq $-4,%rax
- adcq %rbp,%r10
+ adcq %r10,%rdi
- andq %r10,%rax
- movq %r10,%rbp
- shrq $2,%r10
- andq $3,%rbp
- addq %r10,%rax
+ andq %rdi,%rax
+ movq %rdi,%r10
+ shrq $2,%rdi
+ andq $3,%r10
+ addq %rdi,%rax
addq %rax,%r14
adcq $0,%rbx
- adcq $0,%rbp
+ adcq $0,%r10
+
movq %r12,%rax
decq %r15
jnz .Loop
+ movq 0(%rsp),%rdi
+
movq %r14,0(%rdi)
movq %rbx,8(%rdi)
- movq %rbp,16(%rdi)
-
- movq 0(%rsp),%r15
-.cfi_restore %r15
- movq 8(%rsp),%r14
-.cfi_restore %r14
- movq 16(%rsp),%r13
-.cfi_restore %r13
- movq 24(%rsp),%r12
-.cfi_restore %r12
- movq 32(%rsp),%rbp
-.cfi_restore %rbp
+ movq %r10,16(%rdi)
+
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
movq 40(%rsp),%rbx
-.cfi_restore %rbx
leaq 48(%rsp),%rsp
-.cfi_adjust_cfa_offset -48
.Lno_data:
.Lblocks_epilogue:
- .byte 0xf3,0xc3
-.cfi_endproc
-.size poly1305_blocks,.-poly1305_blocks
+ ret
+ENDPROC(poly1305_blocks_x86_64)
-.type poly1305_emit,@function
.align 32
-poly1305_emit:
+ENTRY(poly1305_emit_x86_64)
.Lemit:
movq 0(%rdi),%r8
movq 8(%rdi),%r9
@@ -191,15 +159,14 @@ poly1305_emit:
movq %rax,0(%rsi)
movq %rcx,8(%rsi)
- .byte 0xf3,0xc3
-.size poly1305_emit,.-poly1305_emit
-.type __poly1305_block,@function
-.align 32
-__poly1305_block:
+ ret
+ENDPROC(poly1305_emit_x86_64)
+
+.macro __poly1305_block
mulq %r14
movq %rax,%r9
movq %r11,%rax
- movq %rdx,%r10
+ movq %rdx,%rdi
mulq %r14
movq %rax,%r14
@@ -209,45 +176,44 @@ __poly1305_block:
mulq %rbx
addq %rax,%r9
movq %r13,%rax
- adcq %rdx,%r10
+ adcq %rdx,%rdi
mulq %rbx
- movq %rbp,%rbx
+ movq %r10,%rbx
addq %rax,%r14
adcq %rdx,%r8
imulq %r13,%rbx
addq %rbx,%r9
movq %r8,%rbx
- adcq $0,%r10
+ adcq $0,%rdi
- imulq %r11,%rbp
+ imulq %r11,%r10
addq %r9,%rbx
movq $-4,%rax
- adcq %rbp,%r10
+ adcq %r10,%rdi
- andq %r10,%rax
- movq %r10,%rbp
- shrq $2,%r10
- andq $3,%rbp
- addq %r10,%rax
+ andq %rdi,%rax
+ movq %rdi,%r10
+ shrq $2,%rdi
+ andq $3,%r10
+ addq %rdi,%rax
addq %rax,%r14
adcq $0,%rbx
- adcq $0,%rbp
- .byte 0xf3,0xc3
-.size __poly1305_block,.-__poly1305_block
+ adcq $0,%r10
+.endm
-.type __poly1305_init_avx,@function
-.align 32
-__poly1305_init_avx:
+.macro __poly1305_init_avx
movq %r11,%r14
movq %r12,%rbx
- xorq %rbp,%rbp
+ xorq %r10,%r10
leaq 48+64(%rdi),%rdi
movq %r12,%rax
- call __poly1305_block
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
movl $0x3ffffff,%eax
movl $0x3ffffff,%edx
@@ -305,7 +271,7 @@ __poly1305_init_avx:
movl %edx,36(%rdi)
shrq $26,%r9
- movq %rbp,%rax
+ movq %r10,%rax
shlq $24,%rax
orq %rax,%r8
movl %r8d,48(%rdi)
@@ -316,7 +282,9 @@ __poly1305_init_avx:
movl %r9d,68(%rdi)
movq %r12,%rax
- call __poly1305_block
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
movl $0x3ffffff,%eax
movq %r14,%r8
@@ -348,7 +316,7 @@ __poly1305_init_avx:
shrq $26,%r8
movl %edx,44(%rdi)
- movq %rbp,%rax
+ movq %r10,%rax
shlq $24,%rax
orq %rax,%r8
movl %r8d,60(%rdi)
@@ -356,7 +324,9 @@ __poly1305_init_avx:
movl %r8d,76(%rdi)
movq %r12,%rax
- call __poly1305_block
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
movl $0x3ffffff,%eax
movq %r14,%r8
@@ -388,7 +358,7 @@ __poly1305_init_avx:
shrq $26,%r8
movl %edx,40(%rdi)
- movq %rbp,%rax
+ movq %r10,%rax
shlq $24,%rax
orq %rax,%r8
movl %r8d,56(%rdi)
@@ -396,13 +366,12 @@ __poly1305_init_avx:
movl %r8d,72(%rdi)
leaq -48-64(%rdi),%rdi
- .byte 0xf3,0xc3
-.size __poly1305_init_avx,.-__poly1305_init_avx
+.endm
-.type poly1305_blocks_avx,@function
+#ifdef CONFIG_AS_AVX
.align 32
-poly1305_blocks_avx:
-.cfi_startproc
+ENTRY(poly1305_blocks_avx)
+
movl 20(%rdi),%r8d
cmpq $128,%rdx
jae .Lblocks_avx
@@ -422,30 +391,19 @@ poly1305_blocks_avx:
jz .Leven_avx
pushq %rbx
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbp,-24
pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-32
pushq %r13
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r13,-40
pushq %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r14,-48
pushq %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r15,-56
+ pushq %rdi
+
.Lblocks_avx_body:
movq %rdx,%r15
movq 0(%rdi),%r8
movq 8(%rdi),%r9
- movl 16(%rdi),%ebp
+ movl 16(%rdi),%r10d
movq 24(%rdi),%r11
movq 32(%rdi),%r13
@@ -465,21 +423,21 @@ poly1305_blocks_avx:
addq %r12,%r14
adcq %r9,%rbx
- movq %rbp,%r8
+ movq %r10,%r8
shlq $40,%r8
- shrq $24,%rbp
+ shrq $24,%r10
addq %r8,%rbx
- adcq $0,%rbp
+ adcq $0,%r10
movq $-4,%r9
- movq %rbp,%r8
- andq %rbp,%r9
+ movq %r10,%r8
+ andq %r10,%r9
shrq $2,%r8
- andq $3,%rbp
+ andq $3,%r10
addq %r9,%r8
addq %r8,%r14
adcq $0,%rbx
- adcq $0,%rbp
+ adcq $0,%r10
movq %r13,%r12
movq %r13,%rax
@@ -489,9 +447,11 @@ poly1305_blocks_avx:
addq 0(%rsi),%r14
adcq 8(%rsi),%rbx
leaq 16(%rsi),%rsi
- adcq %rcx,%rbp
+ adcq %rcx,%r10
- call __poly1305_block
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
testq %rcx,%rcx
jz .Lstore_base2_64_avx
@@ -508,11 +468,11 @@ poly1305_blocks_avx:
andq $0x3ffffff,%rdx
shrq $14,%rbx
orq %r11,%r14
- shlq $24,%rbp
+ shlq $24,%r10
andq $0x3ffffff,%r14
shrq $40,%r12
andq $0x3ffffff,%rbx
- orq %r12,%rbp
+ orq %r12,%r10
subq $16,%r15
jz .Lstore_base2_26_avx
@@ -521,14 +481,14 @@ poly1305_blocks_avx:
vmovd %edx,%xmm1
vmovd %r14d,%xmm2
vmovd %ebx,%xmm3
- vmovd %ebp,%xmm4
+ vmovd %r10d,%xmm4
jmp .Lproceed_avx
.align 32
.Lstore_base2_64_avx:
movq %r14,0(%rdi)
movq %rbx,8(%rdi)
- movq %rbp,16(%rdi)
+ movq %r10,16(%rdi)
jmp .Ldone_avx
.align 16
@@ -537,49 +497,30 @@ poly1305_blocks_avx:
movl %edx,4(%rdi)
movl %r14d,8(%rdi)
movl %ebx,12(%rdi)
- movl %ebp,16(%rdi)
+ movl %r10d,16(%rdi)
.align 16
.Ldone_avx:
- movq 0(%rsp),%r15
-.cfi_restore %r15
- movq 8(%rsp),%r14
-.cfi_restore %r14
- movq 16(%rsp),%r13
-.cfi_restore %r13
- movq 24(%rsp),%r12
-.cfi_restore %r12
- movq 32(%rsp),%rbp
-.cfi_restore %rbp
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
movq 40(%rsp),%rbx
-.cfi_restore %rbx
leaq 48(%rsp),%rsp
-.cfi_adjust_cfa_offset -48
+
.Lno_data_avx:
.Lblocks_avx_epilogue:
- .byte 0xf3,0xc3
-.cfi_endproc
+ ret
.align 32
.Lbase2_64_avx:
-.cfi_startproc
+
pushq %rbx
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbp,-24
pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-32
pushq %r13
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r13,-40
pushq %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r14,-48
pushq %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r15,-56
+ pushq %rdi
+
.Lbase2_64_avx_body:
movq %rdx,%r15
@@ -589,7 +530,7 @@ poly1305_blocks_avx:
movq 0(%rdi),%r14
movq 8(%rdi),%rbx
- movl 16(%rdi),%ebp
+ movl 16(%rdi),%r10d
movq %r13,%r12
movq %r13,%rax
@@ -602,10 +543,12 @@ poly1305_blocks_avx:
addq 0(%rsi),%r14
adcq 8(%rsi),%rbx
leaq 16(%rsi),%rsi
- adcq %rcx,%rbp
+ adcq %rcx,%r10
subq $16,%r15
- call __poly1305_block
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
.Linit_avx:
@@ -620,46 +563,38 @@ poly1305_blocks_avx:
andq $0x3ffffff,%rdx
shrq $14,%rbx
orq %r8,%r14
- shlq $24,%rbp
+ shlq $24,%r10
andq $0x3ffffff,%r14
shrq $40,%r9
andq $0x3ffffff,%rbx
- orq %r9,%rbp
+ orq %r9,%r10
vmovd %eax,%xmm0
vmovd %edx,%xmm1
vmovd %r14d,%xmm2
vmovd %ebx,%xmm3
- vmovd %ebp,%xmm4
+ vmovd %r10d,%xmm4
movl $1,20(%rdi)
- call __poly1305_init_avx
+ __poly1305_init_avx
.Lproceed_avx:
movq %r15,%rdx
- movq 0(%rsp),%r15
-.cfi_restore %r15
- movq 8(%rsp),%r14
-.cfi_restore %r14
- movq 16(%rsp),%r13
-.cfi_restore %r13
- movq 24(%rsp),%r12
-.cfi_restore %r12
- movq 32(%rsp),%rbp
-.cfi_restore %rbp
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
movq 40(%rsp),%rbx
-.cfi_restore %rbx
leaq 48(%rsp),%rax
leaq 48(%rsp),%rsp
-.cfi_adjust_cfa_offset -48
+
.Lbase2_64_avx_epilogue:
jmp .Ldo_avx
-.cfi_endproc
+
.align 32
.Leven_avx:
-.cfi_startproc
vmovd 0(%rdi),%xmm0
vmovd 4(%rdi),%xmm1
vmovd 8(%rdi),%xmm2
@@ -667,8 +602,10 @@ poly1305_blocks_avx:
vmovd 16(%rdi),%xmm4
.Ldo_avx:
+ leaq 8(%rsp),%r10
+ andq $-32,%rsp
+ subq $8,%rsp
leaq -88(%rsp),%r11
-.cfi_def_cfa %r11,0x60
subq $0x178,%rsp
subq $64,%rdx
leaq -32(%rsi),%rax
@@ -678,8 +615,6 @@ poly1305_blocks_avx:
leaq 112(%rdi),%rdi
leaq .Lconst(%rip),%rcx
-
-
vmovdqu 32(%rsi),%xmm5
vmovdqu 48(%rsi),%xmm6
vmovdqa 64(%rcx),%xmm15
@@ -754,25 +689,6 @@ poly1305_blocks_avx:
.align 32
.Loop_avx:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
vpmuludq %xmm5,%xmm14,%xmm10
vpmuludq %xmm6,%xmm14,%xmm11
vmovdqa %xmm2,32(%r11)
@@ -866,15 +782,6 @@ poly1305_blocks_avx:
subq $64,%rdx
cmovcq %rax,%rsi
-
-
-
-
-
-
-
-
-
vpmuludq %xmm0,%xmm9,%xmm5
vpmuludq %xmm1,%xmm9,%xmm6
vpaddq %xmm5,%xmm10,%xmm10
@@ -957,10 +864,6 @@ poly1305_blocks_avx:
vpand %xmm15,%xmm8,%xmm8
vpor 32(%rcx),%xmm9,%xmm9
-
-
-
-
vpsrlq $26,%xmm3,%xmm13
vpand %xmm15,%xmm3,%xmm3
vpaddq %xmm13,%xmm4,%xmm4
@@ -995,9 +898,6 @@ poly1305_blocks_avx:
ja .Loop_avx
.Lskip_loop_avx:
-
-
-
vpshufd $0x10,%xmm14,%xmm14
addq $32,%rdx
jnz .Long_tail_avx
@@ -1015,12 +915,6 @@ poly1305_blocks_avx:
vmovdqa %xmm3,48(%r11)
vmovdqa %xmm4,64(%r11)
-
-
-
-
-
-
vpmuludq %xmm7,%xmm14,%xmm12
vpmuludq %xmm5,%xmm14,%xmm10
vpshufd $0x10,-48(%rdi),%xmm2
@@ -1107,9 +1001,6 @@ poly1305_blocks_avx:
vpaddq 48(%r11),%xmm3,%xmm3
vpaddq 64(%r11),%xmm4,%xmm4
-
-
-
vpmuludq %xmm0,%xmm9,%xmm5
vpaddq %xmm5,%xmm10,%xmm10
vpmuludq %xmm1,%xmm9,%xmm6
@@ -1175,8 +1066,6 @@ poly1305_blocks_avx:
.Lshort_tail_avx:
-
-
vpsrldq $8,%xmm14,%xmm9
vpsrldq $8,%xmm13,%xmm8
vpsrldq $8,%xmm11,%xmm6
@@ -1188,9 +1077,6 @@ poly1305_blocks_avx:
vpaddq %xmm6,%xmm11,%xmm11
vpaddq %xmm7,%xmm12,%xmm12
-
-
-
vpsrlq $26,%xmm13,%xmm3
vpand %xmm15,%xmm13,%xmm13
vpaddq %xmm3,%xmm14,%xmm14
@@ -1227,16 +1113,14 @@ poly1305_blocks_avx:
vmovd %xmm12,-104(%rdi)
vmovd %xmm13,-100(%rdi)
vmovd %xmm14,-96(%rdi)
- leaq 88(%r11),%rsp
-.cfi_def_cfa %rsp,8
+ leaq -8(%r10),%rsp
+
vzeroupper
- .byte 0xf3,0xc3
-.cfi_endproc
-.size poly1305_blocks_avx,.-poly1305_blocks_avx
+ ret
+ENDPROC(poly1305_blocks_avx)
-.type poly1305_emit_avx,@function
.align 32
-poly1305_emit_avx:
+ENTRY(poly1305_emit_avx)
cmpl $0,20(%rdi)
je .Lemit
@@ -1286,12 +1170,14 @@ poly1305_emit_avx:
movq %rax,0(%rsi)
movq %rcx,8(%rsi)
- .byte 0xf3,0xc3
-.size poly1305_emit_avx,.-poly1305_emit_avx
-.type poly1305_blocks_avx2,@function
+ ret
+ENDPROC(poly1305_emit_avx)
+#endif /* CONFIG_AS_AVX */
+
+#ifdef CONFIG_AS_AVX2
.align 32
-poly1305_blocks_avx2:
-.cfi_startproc
+ENTRY(poly1305_blocks_avx2)
+
movl 20(%rdi),%r8d
cmpq $128,%rdx
jae .Lblocks_avx2
@@ -1311,30 +1197,19 @@ poly1305_blocks_avx2:
jz .Leven_avx2
pushq %rbx
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbp,-24
pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-32
pushq %r13
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r13,-40
pushq %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r14,-48
pushq %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r15,-56
+ pushq %rdi
+
.Lblocks_avx2_body:
movq %rdx,%r15
movq 0(%rdi),%r8
movq 8(%rdi),%r9
- movl 16(%rdi),%ebp
+ movl 16(%rdi),%r10d
movq 24(%rdi),%r11
movq 32(%rdi),%r13
@@ -1354,21 +1229,21 @@ poly1305_blocks_avx2:
addq %r12,%r14
adcq %r9,%rbx
- movq %rbp,%r8
+ movq %r10,%r8
shlq $40,%r8
- shrq $24,%rbp
+ shrq $24,%r10
addq %r8,%rbx
- adcq $0,%rbp
+ adcq $0,%r10
movq $-4,%r9
- movq %rbp,%r8
- andq %rbp,%r9
+ movq %r10,%r8
+ andq %r10,%r9
shrq $2,%r8
- andq $3,%rbp
+ andq $3,%r10
addq %r9,%r8
addq %r8,%r14
adcq $0,%rbx
- adcq $0,%rbp
+ adcq $0,%r10
movq %r13,%r12
movq %r13,%rax
@@ -1379,10 +1254,12 @@ poly1305_blocks_avx2:
addq 0(%rsi),%r14
adcq 8(%rsi),%rbx
leaq 16(%rsi),%rsi
- adcq %rcx,%rbp
+ adcq %rcx,%r10
subq $16,%r15
- call __poly1305_block
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
movq %r12,%rax
testq $63,%r15
@@ -1403,11 +1280,11 @@ poly1305_blocks_avx2:
andq $0x3ffffff,%rdx
shrq $14,%rbx
orq %r11,%r14
- shlq $24,%rbp
+ shlq $24,%r10
andq $0x3ffffff,%r14
shrq $40,%r12
andq $0x3ffffff,%rbx
- orq %r12,%rbp
+ orq %r12,%r10
testq %r15,%r15
jz .Lstore_base2_26_avx2
@@ -1416,14 +1293,14 @@ poly1305_blocks_avx2:
vmovd %edx,%xmm1
vmovd %r14d,%xmm2
vmovd %ebx,%xmm3
- vmovd %ebp,%xmm4
+ vmovd %r10d,%xmm4
jmp .Lproceed_avx2
.align 32
.Lstore_base2_64_avx2:
movq %r14,0(%rdi)
movq %rbx,8(%rdi)
- movq %rbp,16(%rdi)
+ movq %r10,16(%rdi)
jmp .Ldone_avx2
.align 16
@@ -1432,49 +1309,32 @@ poly1305_blocks_avx2:
movl %edx,4(%rdi)
movl %r14d,8(%rdi)
movl %ebx,12(%rdi)
- movl %ebp,16(%rdi)
+ movl %r10d,16(%rdi)
.align 16
.Ldone_avx2:
- movq 0(%rsp),%r15
-.cfi_restore %r15
- movq 8(%rsp),%r14
-.cfi_restore %r14
- movq 16(%rsp),%r13
-.cfi_restore %r13
- movq 24(%rsp),%r12
-.cfi_restore %r12
- movq 32(%rsp),%rbp
-.cfi_restore %rbp
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
movq 40(%rsp),%rbx
-.cfi_restore %rbx
leaq 48(%rsp),%rsp
-.cfi_adjust_cfa_offset -48
+
.Lno_data_avx2:
.Lblocks_avx2_epilogue:
- .byte 0xf3,0xc3
-.cfi_endproc
+ ret
+
.align 32
.Lbase2_64_avx2:
-.cfi_startproc
+
+
pushq %rbx
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbx,-16
- pushq %rbp
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbp,-24
pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-32
pushq %r13
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r13,-40
pushq %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r14,-48
pushq %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r15,-56
+ pushq %rdi
+
.Lbase2_64_avx2_body:
movq %rdx,%r15
@@ -1484,7 +1344,7 @@ poly1305_blocks_avx2:
movq 0(%rdi),%r14
movq 8(%rdi),%rbx
- movl 16(%rdi),%ebp
+ movl 16(%rdi),%r10d
movq %r13,%r12
movq %r13,%rax
@@ -1498,10 +1358,12 @@ poly1305_blocks_avx2:
addq 0(%rsi),%r14
adcq 8(%rsi),%rbx
leaq 16(%rsi),%rsi
- adcq %rcx,%rbp
+ adcq %rcx,%r10
subq $16,%r15
- call __poly1305_block
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
movq %r12,%rax
testq $63,%r15
@@ -1520,49 +1382,39 @@ poly1305_blocks_avx2:
andq $0x3ffffff,%rdx
shrq $14,%rbx
orq %r8,%r14
- shlq $24,%rbp
+ shlq $24,%r10
andq $0x3ffffff,%r14
shrq $40,%r9
andq $0x3ffffff,%rbx
- orq %r9,%rbp
+ orq %r9,%r10
vmovd %eax,%xmm0
vmovd %edx,%xmm1
vmovd %r14d,%xmm2
vmovd %ebx,%xmm3
- vmovd %ebp,%xmm4
+ vmovd %r10d,%xmm4
movl $1,20(%rdi)
- call __poly1305_init_avx
+ __poly1305_init_avx
.Lproceed_avx2:
movq %r15,%rdx
- movl OPENSSL_ia32cap_P+8(%rip),%r10d
- movl $3221291008,%r11d
-
- movq 0(%rsp),%r15
-.cfi_restore %r15
- movq 8(%rsp),%r14
-.cfi_restore %r14
- movq 16(%rsp),%r13
-.cfi_restore %r13
- movq 24(%rsp),%r12
-.cfi_restore %r12
- movq 32(%rsp),%rbp
-.cfi_restore %rbp
+
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
movq 40(%rsp),%rbx
-.cfi_restore %rbx
leaq 48(%rsp),%rax
leaq 48(%rsp),%rsp
-.cfi_adjust_cfa_offset -48
+
.Lbase2_64_avx2_epilogue:
jmp .Ldo_avx2
-.cfi_endproc
+
.align 32
.Leven_avx2:
-.cfi_startproc
- movl OPENSSL_ia32cap_P+8(%rip),%r10d
+
vmovd 0(%rdi),%xmm0
vmovd 4(%rdi),%xmm1
vmovd 8(%rdi),%xmm2
@@ -1570,14 +1422,7 @@ poly1305_blocks_avx2:
vmovd 16(%rdi),%xmm4
.Ldo_avx2:
- cmpq $512,%rdx
- jb .Lskip_avx512
- andl %r11d,%r10d
- testl $65536,%r10d
- jnz .Lblocks_avx512
-.Lskip_avx512:
- leaq -8(%rsp),%r11
-.cfi_def_cfa %r11,16
+ leaq 8(%rsp),%r10
subq $0x128,%rsp
leaq .Lconst(%rip),%rcx
leaq 48+64(%rdi),%rdi
@@ -1647,13 +1492,6 @@ poly1305_blocks_avx2:
.align 32
.Loop_avx2:
-
-
-
-
-
-
-
vpaddq %ymm0,%ymm7,%ymm0
vmovdqa 0(%rsp),%ymm7
vpaddq %ymm1,%ymm8,%ymm1
@@ -1664,21 +1502,6 @@ poly1305_blocks_avx2:
vmovdqa 48(%rax),%ymm10
vmovdqa 112(%rax),%ymm5
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
vpmuludq %ymm2,%ymm7,%ymm13
vpmuludq %ymm2,%ymm8,%ymm14
vpmuludq %ymm2,%ymm9,%ymm15
@@ -1743,9 +1566,6 @@ poly1305_blocks_avx2:
vpaddq %ymm4,%ymm15,%ymm4
vpaddq %ymm0,%ymm11,%ymm0
-
-
-
vpsrlq $26,%ymm3,%ymm14
vpand %ymm5,%ymm3,%ymm3
vpaddq %ymm14,%ymm4,%ymm4
@@ -1798,12 +1618,6 @@ poly1305_blocks_avx2:
.byte 0x66,0x90
.Ltail_avx2:
-
-
-
-
-
-
vpaddq %ymm0,%ymm7,%ymm0
vmovdqu 4(%rsp),%ymm7
vpaddq %ymm1,%ymm8,%ymm1
@@ -1868,9 +1682,6 @@ poly1305_blocks_avx2:
vpaddq %ymm4,%ymm15,%ymm4
vpaddq %ymm0,%ymm11,%ymm0
-
-
-
vpsrldq $8,%ymm12,%ymm8
vpsrldq $8,%ymm2,%ymm9
vpsrldq $8,%ymm3,%ymm10
@@ -1893,9 +1704,6 @@ poly1305_blocks_avx2:
vpaddq %ymm8,%ymm12,%ymm12
vpaddq %ymm9,%ymm2,%ymm2
-
-
-
vpsrlq $26,%ymm3,%ymm14
vpand %ymm5,%ymm3,%ymm3
vpaddq %ymm14,%ymm4,%ymm4
@@ -1932,110 +1740,673 @@ poly1305_blocks_avx2:
vmovd %xmm2,-104(%rdi)
vmovd %xmm3,-100(%rdi)
vmovd %xmm4,-96(%rdi)
- leaq 8(%r11),%rsp
-.cfi_def_cfa %rsp,8
+ leaq -8(%r10),%rsp
+
vzeroupper
- .byte 0xf3,0xc3
-.cfi_endproc
-.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
-.type poly1305_blocks_avx512,@function
+ ret
+
+ENDPROC(poly1305_blocks_avx2)
+#endif /* CONFIG_AS_AVX2 */
+
+#ifdef CONFIG_AS_AVX512
.align 32
-poly1305_blocks_avx512:
-.cfi_startproc
-.Lblocks_avx512:
- movl $15,%eax
- kmovw %eax,%k2
- leaq -8(%rsp),%r11
-.cfi_def_cfa %r11,16
- subq $0x128,%rsp
- leaq .Lconst(%rip),%rcx
- leaq 48+64(%rdi),%rdi
- vmovdqa 96(%rcx),%ymm9
+ENTRY(poly1305_blocks_avx512)
+ movl 20(%rdi),%r8d
+ cmpq $128,%rdx
+ jae .Lblocks_avx2_512
+ testl %r8d,%r8d
+ jz .Lblocks
- vmovdqu -64(%rdi),%xmm11
- andq $-512,%rsp
- vmovdqu -48(%rdi),%xmm12
- movq $0x20,%rax
- vmovdqu -32(%rdi),%xmm7
- vmovdqu -16(%rdi),%xmm13
- vmovdqu 0(%rdi),%xmm8
- vmovdqu 16(%rdi),%xmm14
- vmovdqu 32(%rdi),%xmm10
- vmovdqu 48(%rdi),%xmm15
- vmovdqu 64(%rdi),%xmm6
- vpermd %zmm11,%zmm9,%zmm16
- vpbroadcastq 64(%rcx),%zmm5
- vpermd %zmm12,%zmm9,%zmm17
- vpermd %zmm7,%zmm9,%zmm21
- vpermd %zmm13,%zmm9,%zmm18
- vmovdqa64 %zmm16,0(%rsp){%k2}
- vpsrlq $32,%zmm16,%zmm7
- vpermd %zmm8,%zmm9,%zmm22
- vmovdqu64 %zmm17,0(%rsp,%rax,1){%k2}
- vpsrlq $32,%zmm17,%zmm8
- vpermd %zmm14,%zmm9,%zmm19
- vmovdqa64 %zmm21,64(%rsp){%k2}
- vpermd %zmm10,%zmm9,%zmm23
- vpermd %zmm15,%zmm9,%zmm20
- vmovdqu64 %zmm18,64(%rsp,%rax,1){%k2}
- vpermd %zmm6,%zmm9,%zmm24
- vmovdqa64 %zmm22,128(%rsp){%k2}
- vmovdqu64 %zmm19,128(%rsp,%rax,1){%k2}
- vmovdqa64 %zmm23,192(%rsp){%k2}
- vmovdqu64 %zmm20,192(%rsp,%rax,1){%k2}
- vmovdqa64 %zmm24,256(%rsp){%k2}
+.Lblocks_avx2_512:
+ andq $-16,%rdx
+ jz .Lno_data_avx2_512
+ vzeroupper
+ testl %r8d,%r8d
+ jz .Lbase2_64_avx2_512
+ testq $63,%rdx
+ jz .Leven_avx2_512
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rdi
+.Lblocks_avx2_body_512:
+ movq %rdx,%r15
+ movq 0(%rdi),%r8
+ movq 8(%rdi),%r9
+ movl 16(%rdi),%r10d
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r13
- vpmuludq %zmm7,%zmm16,%zmm11
- vpmuludq %zmm7,%zmm17,%zmm12
- vpmuludq %zmm7,%zmm18,%zmm13
- vpmuludq %zmm7,%zmm19,%zmm14
- vpmuludq %zmm7,%zmm20,%zmm15
- vpsrlq $32,%zmm18,%zmm9
+ movl %r8d,%r14d
+ andq $-2147483648,%r8
+ movq %r9,%r12
+ movl %r9d,%ebx
+ andq $-2147483648,%r9
- vpmuludq %zmm8,%zmm24,%zmm25
- vpmuludq %zmm8,%zmm16,%zmm26
- vpmuludq %zmm8,%zmm17,%zmm27
- vpmuludq %zmm8,%zmm18,%zmm28
- vpmuludq %zmm8,%zmm19,%zmm29
- vpsrlq $32,%zmm19,%zmm10
- vpaddq %zmm25,%zmm11,%zmm11
- vpaddq %zmm26,%zmm12,%zmm12
- vpaddq %zmm27,%zmm13,%zmm13
- vpaddq %zmm28,%zmm14,%zmm14
- vpaddq %zmm29,%zmm15,%zmm15
+ shrq $6,%r8
+ shlq $52,%r12
+ addq %r8,%r14
+ shrq $12,%rbx
+ shrq $18,%r9
+ addq %r12,%r14
+ adcq %r9,%rbx
- vpmuludq %zmm9,%zmm23,%zmm25
- vpmuludq %zmm9,%zmm24,%zmm26
- vpmuludq %zmm9,%zmm17,%zmm28
- vpmuludq %zmm9,%zmm18,%zmm29
- vpmuludq %zmm9,%zmm16,%zmm27
- vpsrlq $32,%zmm20,%zmm6
- vpaddq %zmm25,%zmm11,%zmm11
- vpaddq %zmm26,%zmm12,%zmm12
- vpaddq %zmm28,%zmm14,%zmm14
- vpaddq %zmm29,%zmm15,%zmm15
- vpaddq %zmm27,%zmm13,%zmm13
+ movq %r10,%r8
+ shlq $40,%r8
+ shrq $24,%r10
+ addq %r8,%rbx
+ adcq $0,%r10
- vpmuludq %zmm10,%zmm22,%zmm25
- vpmuludq %zmm10,%zmm16,%zmm28
- vpmuludq %zmm10,%zmm17,%zmm29
- vpmuludq %zmm10,%zmm23,%zmm26
- vpmuludq %zmm10,%zmm24,%zmm27
- vpaddq %zmm25,%zmm11,%zmm11
- vpaddq %zmm28,%zmm14,%zmm14
- vpaddq %zmm29,%zmm15,%zmm15
- vpaddq %zmm26,%zmm12,%zmm12
- vpaddq %zmm27,%zmm13,%zmm13
+ movq $-4,%r9
+ movq %r10,%r8
+ andq %r10,%r9
+ shrq $2,%r8
+ andq $3,%r10
+ addq %r9,%r8
+ addq %r8,%r14
+ adcq $0,%rbx
+ adcq $0,%r10
+
+ movq %r13,%r12
+ movq %r13,%rax
+ shrq $2,%r13
+ addq %r12,%r13
+
+.Lbase2_26_pre_avx2_512:
+ addq 0(%rsi),%r14
+ adcq 8(%rsi),%rbx
+ leaq 16(%rsi),%rsi
+ adcq %rcx,%r10
+ subq $16,%r15
+
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
+ movq %r12,%rax
+
+ testq $63,%r15
+ jnz .Lbase2_26_pre_avx2_512
+
+ testq %rcx,%rcx
+ jz .Lstore_base2_64_avx2_512
+
+
+ movq %r14,%rax
+ movq %r14,%rdx
+ shrq $52,%r14
+ movq %rbx,%r11
+ movq %rbx,%r12
+ shrq $26,%rdx
+ andq $0x3ffffff,%rax
+ shlq $12,%r11
+ andq $0x3ffffff,%rdx
+ shrq $14,%rbx
+ orq %r11,%r14
+ shlq $24,%r10
+ andq $0x3ffffff,%r14
+ shrq $40,%r12
+ andq $0x3ffffff,%rbx
+ orq %r12,%r10
+
+ testq %r15,%r15
+ jz .Lstore_base2_26_avx2_512
+
+ vmovd %eax,%xmm0
+ vmovd %edx,%xmm1
+ vmovd %r14d,%xmm2
+ vmovd %ebx,%xmm3
+ vmovd %r10d,%xmm4
+ jmp .Lproceed_avx2_512
+
+.align 32
+.Lstore_base2_64_avx2_512:
+ movq %r14,0(%rdi)
+ movq %rbx,8(%rdi)
+ movq %r10,16(%rdi)
+ jmp .Ldone_avx2_512
+
+.align 16
+.Lstore_base2_26_avx2_512:
+ movl %eax,0(%rdi)
+ movl %edx,4(%rdi)
+ movl %r14d,8(%rdi)
+ movl %ebx,12(%rdi)
+ movl %r10d,16(%rdi)
+.align 16
+.Ldone_avx2_512:
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%rbx
+ leaq 48(%rsp),%rsp
+
+.Lno_data_avx2_512:
+.Lblocks_avx2_epilogue_512:
+ ret
+
+
+.align 32
+.Lbase2_64_avx2_512:
+
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushq %rdi
+
+.Lbase2_64_avx2_body_512:
+
+ movq %rdx,%r15
+
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r13
+
+ movq 0(%rdi),%r14
+ movq 8(%rdi),%rbx
+ movl 16(%rdi),%r10d
+
+ movq %r13,%r12
+ movq %r13,%rax
+ shrq $2,%r13
+ addq %r12,%r13
+
+ testq $63,%rdx
+ jz .Linit_avx2_512
+
+.Lbase2_64_pre_avx2_512:
+ addq 0(%rsi),%r14
+ adcq 8(%rsi),%rbx
+ leaq 16(%rsi),%rsi
+ adcq %rcx,%r10
+ subq $16,%r15
+
+ movq %rdi,0(%rsp)
+ __poly1305_block
+ movq 0(%rsp),%rdi
+ movq %r12,%rax
+
+ testq $63,%r15
+ jnz .Lbase2_64_pre_avx2_512
+
+.Linit_avx2_512:
+
+ movq %r14,%rax
+ movq %r14,%rdx
+ shrq $52,%r14
+ movq %rbx,%r8
+ movq %rbx,%r9
+ shrq $26,%rdx
+ andq $0x3ffffff,%rax
+ shlq $12,%r8
+ andq $0x3ffffff,%rdx
+ shrq $14,%rbx
+ orq %r8,%r14
+ shlq $24,%r10
+ andq $0x3ffffff,%r14
+ shrq $40,%r9
+ andq $0x3ffffff,%rbx
+ orq %r9,%r10
+
+ vmovd %eax,%xmm0
+ vmovd %edx,%xmm1
+ vmovd %r14d,%xmm2
+ vmovd %ebx,%xmm3
+ vmovd %r10d,%xmm4
+ movl $1,20(%rdi)
+
+ __poly1305_init_avx
+
+.Lproceed_avx2_512:
+ movq %r15,%rdx
+
+ movq 8(%rsp),%r15
+ movq 16(%rsp),%r14
+ movq 24(%rsp),%r13
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%rbx
+ leaq 48(%rsp),%rax
+ leaq 48(%rsp),%rsp
+
+.Lbase2_64_avx2_epilogue_512:
+ jmp .Ldo_avx2_512
+
+
+.align 32
+.Leven_avx2_512:
+
+ vmovd 0(%rdi),%xmm0
+ vmovd 4(%rdi),%xmm1
+ vmovd 8(%rdi),%xmm2
+ vmovd 12(%rdi),%xmm3
+ vmovd 16(%rdi),%xmm4
+
+.Ldo_avx2_512:
+ cmpq $512,%rdx
+ jae .Lblocks_avx512
+.Lskip_avx512: /* AVX2 setup: copy the table next to the context onto an aligned stack area, then split the first 64 input bytes into limbs. */
+ leaq 8(%rsp),%r10 /* remember %rsp+8; the exit path undoes this with leaq -8(%r10),%rsp */
+
+ subq $0x128,%rsp /* reserve 0x128 bytes of scratch */
+ leaq .Lconst(%rip),%rcx /* %rcx = base of the .Lconst table */
+ leaq 48+64(%rdi),%rdi /* bias %rdi by 112 so the nine 16-byte loads below use offsets -64..64 */
+ vmovdqa 96(%rcx),%ymm7 /* vpermd index vector taken from .Lconst+96 */
+
+
+ vmovdqu -64(%rdi),%xmm9
+ andq $-512,%rsp /* round %rsp down to a 512-byte boundary */
+ vmovdqu -48(%rdi),%xmm10
+ vmovdqu -32(%rdi),%xmm6
+ vmovdqu -16(%rdi),%xmm11
+ vmovdqu 0(%rdi),%xmm12
+ vmovdqu 16(%rdi),%xmm13
+ leaq 144(%rsp),%rax /* %rax = %rsp+144; the slots below are addressed as N-144(%rax), i.e. N(%rsp) */
+ vmovdqu 32(%rdi),%xmm14
+ vpermd %ymm9,%ymm7,%ymm9 /* each loaded vector is rearranged with the %ymm7 indices before being stored */
+ vmovdqu 48(%rdi),%xmm15
+ vpermd %ymm10,%ymm7,%ymm10
+ vmovdqu 64(%rdi),%xmm5
+ vpermd %ymm6,%ymm7,%ymm6
+ vmovdqa %ymm9,0(%rsp)
+ vpermd %ymm11,%ymm7,%ymm11
+ vmovdqa %ymm10,32-144(%rax)
+ vpermd %ymm12,%ymm7,%ymm12
+ vmovdqa %ymm6,64-144(%rax)
+ vpermd %ymm13,%ymm7,%ymm13
+ vmovdqa %ymm11,96-144(%rax)
+ vpermd %ymm14,%ymm7,%ymm14
+ vmovdqa %ymm12,128-144(%rax)
+ vpermd %ymm15,%ymm7,%ymm15
+ vmovdqa %ymm13,160-144(%rax)
+ vpermd %ymm5,%ymm7,%ymm5
+ vmovdqa %ymm14,192-144(%rax)
+ vmovdqa %ymm15,224-144(%rax)
+ vmovdqa %ymm5,256-144(%rax)
+ vmovdqa 64(%rcx),%ymm5 /* mask from .Lconst+64, ANDed with input limbs 0-3 below */
+
+
+
+ vmovdqu 0(%rsi),%xmm7 /* fetch 64 input bytes: low lanes from offsets 0/16, high lanes from 32/48 */
+ vmovdqu 16(%rsi),%xmm8
+ vinserti128 $1,32(%rsi),%ymm7,%ymm7
+ vinserti128 $1,48(%rsi),%ymm8,%ymm8
+ leaq 64(%rsi),%rsi
+
+ vpsrldq $6,%ymm7,%ymm9 /* byte-shift by 6 so the later >>4 and >>30 line up on bits 52 and 78 */
+ vpsrldq $6,%ymm8,%ymm10
+ vpunpckhqdq %ymm8,%ymm7,%ymm6
+ vpunpcklqdq %ymm10,%ymm9,%ymm9
+ vpunpcklqdq %ymm8,%ymm7,%ymm7
+
+ vpsrlq $30,%ymm9,%ymm10 /* -> limb 3 */
+ vpsrlq $4,%ymm9,%ymm9 /* -> limb 2 */
+ vpsrlq $26,%ymm7,%ymm8 /* -> limb 1 */
+ vpsrlq $40,%ymm6,%ymm6 /* -> limb 4 (top 24 bits of each block) */
+ vpand %ymm5,%ymm9,%ymm9
+ vpand %ymm5,%ymm7,%ymm7 /* -> limb 0 */
+ vpand %ymm5,%ymm8,%ymm8
+ vpand %ymm5,%ymm10,%ymm10
+ vpor 32(%rcx),%ymm6,%ymm6 /* OR the constant at .Lconst+32 into limb 4 (presumably the pad bit - confirm) */
+
+ vpaddq %ymm2,%ymm9,%ymm2 /* fold input limb 2 into the accumulator now; limbs 0,1,3,4 are added on loop/tail entry */
+ subq $64,%rdx /* 64 bytes consumed */
+ jz .Ltail_avx2_512 /* nothing left: go straight to the final step */
+ jmp .Loop_avx2_512
+
+.align 32
+.Loop_avx2_512: /* Main AVX2 loop: one multiply-and-carry step per 64 input bytes, interleaved with loading and splitting the next 64 bytes. */
+
+ vpaddq %ymm0,%ymm7,%ymm0 /* add pending input limbs 0,1,3,4 while reloading multiplier vectors from the stack */
+ vmovdqa 0(%rsp),%ymm7
+ vpaddq %ymm1,%ymm8,%ymm1
+ vmovdqa 32(%rsp),%ymm8
+ vpaddq %ymm3,%ymm10,%ymm3
+ vmovdqa 96(%rsp),%ymm9
+ vpaddq %ymm4,%ymm6,%ymm4
+ vmovdqa 48(%rax),%ymm10
+ vmovdqa 112(%rax),%ymm5 /* %ymm5 is a multiplier here; the limb mask is reloaded further down */
+
+ vpmuludq %ymm2,%ymm7,%ymm13 /* partial products accumulate in %ymm11-%ymm15 */
+ vpmuludq %ymm2,%ymm8,%ymm14
+ vpmuludq %ymm2,%ymm9,%ymm15
+ vpmuludq %ymm2,%ymm10,%ymm11
+ vpmuludq %ymm2,%ymm5,%ymm12
+
+ vpmuludq %ymm0,%ymm8,%ymm6
+ vpmuludq %ymm1,%ymm8,%ymm2
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq 64(%rsp),%ymm4,%ymm2
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm11,%ymm11
+ vmovdqa -16(%rax),%ymm8
+
+ vpmuludq %ymm0,%ymm7,%ymm6
+ vpmuludq %ymm1,%ymm7,%ymm2
+ vpaddq %ymm6,%ymm11,%ymm11
+ vpaddq %ymm2,%ymm12,%ymm12
+ vpmuludq %ymm3,%ymm7,%ymm6
+ vpmuludq %ymm4,%ymm7,%ymm2
+ vmovdqu 0(%rsi),%xmm7 /* %ymm7 is free now: start loading the next 64 input bytes */
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm2,%ymm15,%ymm15
+ vinserti128 $1,32(%rsi),%ymm7,%ymm7
+
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq %ymm4,%ymm8,%ymm2
+ vmovdqu 16(%rsi),%xmm8
+ vpaddq %ymm6,%ymm11,%ymm11
+ vpaddq %ymm2,%ymm12,%ymm12
+ vmovdqa 16(%rax),%ymm2
+ vpmuludq %ymm1,%ymm9,%ymm6
+ vpmuludq %ymm0,%ymm9,%ymm9
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm9,%ymm13,%ymm13
+ vinserti128 $1,48(%rsi),%ymm8,%ymm8
+ leaq 64(%rsi),%rsi /* input pointer moves past the 64 bytes just loaded */
+
+ vpmuludq %ymm1,%ymm2,%ymm6
+ vpmuludq %ymm0,%ymm2,%ymm2
+ vpsrldq $6,%ymm7,%ymm9
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm14,%ymm14
+ vpmuludq %ymm3,%ymm10,%ymm6
+ vpmuludq %ymm4,%ymm10,%ymm2
+ vpsrldq $6,%ymm8,%ymm10
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+ vpunpckhqdq %ymm8,%ymm7,%ymm6
+
+ vpmuludq %ymm3,%ymm5,%ymm3
+ vpmuludq %ymm4,%ymm5,%ymm4
+ vpunpcklqdq %ymm8,%ymm7,%ymm7
+ vpaddq %ymm3,%ymm13,%ymm2
+ vpaddq %ymm4,%ymm14,%ymm3
+ vpunpcklqdq %ymm10,%ymm9,%ymm10
+ vpmuludq 80(%rax),%ymm0,%ymm4
+ vpmuludq %ymm1,%ymm5,%ymm0
+ vmovdqa 64(%rcx),%ymm5 /* reload the limb mask from .Lconst+64 */
+ vpaddq %ymm4,%ymm15,%ymm4
+ vpaddq %ymm0,%ymm11,%ymm0
+
+ vpsrlq $26,%ymm3,%ymm14 /* carry chain: bits above 26 of a limb move into the next limb */
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm12,%ymm1
+
+ vpsrlq $26,%ymm4,%ymm15
+ vpand %ymm5,%ymm4,%ymm4
+
+ vpsrlq $4,%ymm10,%ymm9 /* next input: limb 2 */
+
+ vpsrlq $26,%ymm1,%ymm12
+ vpand %ymm5,%ymm1,%ymm1
+ vpaddq %ymm12,%ymm2,%ymm2
+
+ vpaddq %ymm15,%ymm0,%ymm0 /* carry out of limb 4 re-enters limb 0 times 5: added once here ... */
+ vpsllq $2,%ymm15,%ymm15
+ vpaddq %ymm15,%ymm0,%ymm0 /* ... and once more shifted left by 2 */
+
+ vpand %ymm5,%ymm9,%ymm9
+ vpsrlq $26,%ymm7,%ymm8 /* next input: limb 1 */
+
+ vpsrlq $26,%ymm2,%ymm13
+ vpand %ymm5,%ymm2,%ymm2
+ vpaddq %ymm13,%ymm3,%ymm3
+
+ vpaddq %ymm9,%ymm2,%ymm2 /* fold next input limb 2 into the accumulator ahead of the next pass */
+ vpsrlq $30,%ymm10,%ymm10 /* next input: limb 3 */
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm1,%ymm1
+
+ vpsrlq $40,%ymm6,%ymm6 /* next input: limb 4 */
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vpand %ymm5,%ymm7,%ymm7 /* next input: limb 0 */
+ vpand %ymm5,%ymm8,%ymm8
+ vpand %ymm5,%ymm10,%ymm10
+ vpor 32(%rcx),%ymm6,%ymm6 /* OR the constant at .Lconst+32 into limb 4 (presumably the pad bit - confirm) */
+
+ subq $64,%rdx /* 64 more bytes consumed */
+ jnz .Loop_avx2_512
+
+.byte 0x66,0x90 /* two-byte nop emitted as raw bytes */
+.Ltail_avx2_512: /* Final AVX2 step: last multiply, sum across lanes, carry, store the five limbs to the context and return. */
+
+ vpaddq %ymm0,%ymm7,%ymm0 /* add the pending input limbs 0,1,3,4 */
+ vmovdqu 4(%rsp),%ymm7 /* same stack slots as the loop but 4 bytes further on, hence the unaligned loads */
+ vpaddq %ymm1,%ymm8,%ymm1
+ vmovdqu 36(%rsp),%ymm8
+ vpaddq %ymm3,%ymm10,%ymm3
+ vmovdqu 100(%rsp),%ymm9
+ vpaddq %ymm4,%ymm6,%ymm4
+ vmovdqu 52(%rax),%ymm10
+ vmovdqu 116(%rax),%ymm5
+
+ vpmuludq %ymm2,%ymm7,%ymm13 /* partial products accumulate in %ymm11-%ymm15 */
+ vpmuludq %ymm2,%ymm8,%ymm14
+ vpmuludq %ymm2,%ymm9,%ymm15
+ vpmuludq %ymm2,%ymm10,%ymm11
+ vpmuludq %ymm2,%ymm5,%ymm12
+
+ vpmuludq %ymm0,%ymm8,%ymm6
+ vpmuludq %ymm1,%ymm8,%ymm2
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq 68(%rsp),%ymm4,%ymm2
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm11,%ymm11
+
+ vpmuludq %ymm0,%ymm7,%ymm6
+ vpmuludq %ymm1,%ymm7,%ymm2
+ vpaddq %ymm6,%ymm11,%ymm11
+ vmovdqu -12(%rax),%ymm8
+ vpaddq %ymm2,%ymm12,%ymm12
+ vpmuludq %ymm3,%ymm7,%ymm6
+ vpmuludq %ymm4,%ymm7,%ymm2
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm2,%ymm15,%ymm15
+
+ vpmuludq %ymm3,%ymm8,%ymm6
+ vpmuludq %ymm4,%ymm8,%ymm2
+ vpaddq %ymm6,%ymm11,%ymm11
+ vpaddq %ymm2,%ymm12,%ymm12
+ vmovdqu 20(%rax),%ymm2
+ vpmuludq %ymm1,%ymm9,%ymm6
+ vpmuludq %ymm0,%ymm9,%ymm9
+ vpaddq %ymm6,%ymm14,%ymm14
+ vpaddq %ymm9,%ymm13,%ymm13
+
+ vpmuludq %ymm1,%ymm2,%ymm6
+ vpmuludq %ymm0,%ymm2,%ymm2
+ vpaddq %ymm6,%ymm15,%ymm15
+ vpaddq %ymm2,%ymm14,%ymm14
+ vpmuludq %ymm3,%ymm10,%ymm6
+ vpmuludq %ymm4,%ymm10,%ymm2
+ vpaddq %ymm6,%ymm12,%ymm12
+ vpaddq %ymm2,%ymm13,%ymm13
+
+ vpmuludq %ymm3,%ymm5,%ymm3
+ vpmuludq %ymm4,%ymm5,%ymm4
+ vpaddq %ymm3,%ymm13,%ymm2
+ vpaddq %ymm4,%ymm14,%ymm3
+ vpmuludq 84(%rax),%ymm0,%ymm4
+ vpmuludq %ymm1,%ymm5,%ymm0
+ vmovdqa 64(%rcx),%ymm5 /* reload the limb mask from .Lconst+64 */
+ vpaddq %ymm4,%ymm15,%ymm4
+ vpaddq %ymm0,%ymm11,%ymm0
+
+ vpsrldq $8,%ymm12,%ymm8 /* lane sum, step 1: add the upper qword of each 128-bit lane to the lower one */
+ vpsrldq $8,%ymm2,%ymm9
+ vpsrldq $8,%ymm3,%ymm10
+ vpsrldq $8,%ymm4,%ymm6
+ vpsrldq $8,%ymm0,%ymm7
+ vpaddq %ymm8,%ymm12,%ymm12
+ vpaddq %ymm9,%ymm2,%ymm2
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpaddq %ymm7,%ymm0,%ymm0
+
+ vpermq $0x2,%ymm3,%ymm10 /* lane sum, step 2: bring qword 2 down to qword 0 and add, so qword 0 holds the total */
+ vpermq $0x2,%ymm4,%ymm6
+ vpermq $0x2,%ymm0,%ymm7
+ vpermq $0x2,%ymm12,%ymm8
+ vpermq $0x2,%ymm2,%ymm9
+ vpaddq %ymm10,%ymm3,%ymm3
+ vpaddq %ymm6,%ymm4,%ymm4
+ vpaddq %ymm7,%ymm0,%ymm0
+ vpaddq %ymm8,%ymm12,%ymm12
+ vpaddq %ymm9,%ymm2,%ymm2
+
+ vpsrlq $26,%ymm3,%ymm14 /* carry chain: bits above 26 of a limb move into the next limb */
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm12,%ymm1
+
+ vpsrlq $26,%ymm4,%ymm15
+ vpand %ymm5,%ymm4,%ymm4
+
+ vpsrlq $26,%ymm1,%ymm12
+ vpand %ymm5,%ymm1,%ymm1
+ vpaddq %ymm12,%ymm2,%ymm2
+
+ vpaddq %ymm15,%ymm0,%ymm0 /* carry out of limb 4 re-enters limb 0 times 5: added once here ... */
+ vpsllq $2,%ymm15,%ymm15
+ vpaddq %ymm15,%ymm0,%ymm0 /* ... and once more shifted left by 2 */
+
+ vpsrlq $26,%ymm2,%ymm13
+ vpand %ymm5,%ymm2,%ymm2
+ vpaddq %ymm13,%ymm3,%ymm3
+
+ vpsrlq $26,%ymm0,%ymm11
+ vpand %ymm5,%ymm0,%ymm0
+ vpaddq %ymm11,%ymm1,%ymm1
+
+ vpsrlq $26,%ymm3,%ymm14
+ vpand %ymm5,%ymm3,%ymm3
+ vpaddq %ymm14,%ymm4,%ymm4
+
+ vmovd %xmm0,-112(%rdi) /* store limbs 0-4 as 32-bit words; with the +112 bias on %rdi these are context offsets 0..16 */
+ vmovd %xmm1,-108(%rdi)
+ vmovd %xmm2,-104(%rdi)
+ vmovd %xmm3,-100(%rdi)
+ vmovd %xmm4,-96(%rdi)
+ leaq -8(%r10),%rsp /* restore the stack pointer remembered in %r10 (saved as %rsp+8) */
+
+ vzeroupper /* zero the upper halves of the ymm registers before returning */
+ ret
+
+.Lblocks_avx512:
+
+ movl $15,%eax
+ kmovw %eax,%k2
+ leaq 8(%rsp),%r10
+
+ subq $0x128,%rsp
+ leaq .Lconst(%rip),%rcx
+ leaq 48+64(%rdi),%rdi
+ vmovdqa 96(%rcx),%ymm9
+
+ vmovdqu32 -64(%rdi),%zmm16{%k2}{z}
+ andq $-512,%rsp
+ vmovdqu32 -48(%rdi),%zmm17{%k2}{z}
+ movq $0x20,%rax
+ vmovdqu32 -32(%rdi),%zmm21{%k2}{z}
+ vmovdqu32 -16(%rdi),%zmm18{%k2}{z}
+ vmovdqu32 0(%rdi),%zmm22{%k2}{z}
+ vmovdqu32 16(%rdi),%zmm19{%k2}{z}
+ vmovdqu32 32(%rdi),%zmm23{%k2}{z}
+ vmovdqu32 48(%rdi),%zmm20{%k2}{z}
+ vmovdqu32 64(%rdi),%zmm24{%k2}{z}
+ vpermd %zmm16,%zmm9,%zmm16
+ vpbroadcastq 64(%rcx),%zmm5
+ vpermd %zmm17,%zmm9,%zmm17
+ vpermd %zmm21,%zmm9,%zmm21
+ vpermd %zmm18,%zmm9,%zmm18
+ vmovdqa64 %zmm16,0(%rsp){%k2}
+ vpsrlq $32,%zmm16,%zmm7
+ vpermd %zmm22,%zmm9,%zmm22
+ vmovdqu64 %zmm17,0(%rsp,%rax,1){%k2}
+ vpsrlq $32,%zmm17,%zmm8
+ vpermd %zmm19,%zmm9,%zmm19
+ vmovdqa64 %zmm21,64(%rsp){%k2}
+ vpermd %zmm23,%zmm9,%zmm23
+ vpermd %zmm20,%zmm9,%zmm20
+ vmovdqu64 %zmm18,64(%rsp,%rax,1){%k2}
+ vpermd %zmm24,%zmm9,%zmm24
+ vmovdqa64 %zmm22,128(%rsp){%k2}
+ vmovdqu64 %zmm19,128(%rsp,%rax,1){%k2}
+ vmovdqa64 %zmm23,192(%rsp){%k2}
+ vmovdqu64 %zmm20,192(%rsp,%rax,1){%k2}
+ vmovdqa64 %zmm24,256(%rsp){%k2}
+
+ vpmuludq %zmm7,%zmm16,%zmm11
+ vpmuludq %zmm7,%zmm17,%zmm12
+ vpmuludq %zmm7,%zmm18,%zmm13
+ vpmuludq %zmm7,%zmm19,%zmm14
+ vpmuludq %zmm7,%zmm20,%zmm15
+ vpsrlq $32,%zmm18,%zmm9
+
+ vpmuludq %zmm8,%zmm24,%zmm25
+ vpmuludq %zmm8,%zmm16,%zmm26
+ vpmuludq %zmm8,%zmm17,%zmm27
+ vpmuludq %zmm8,%zmm18,%zmm28
+ vpmuludq %zmm8,%zmm19,%zmm29
+ vpsrlq $32,%zmm19,%zmm10
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm27,%zmm13,%zmm13
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+
+ vpmuludq %zmm9,%zmm23,%zmm25
+ vpmuludq %zmm9,%zmm24,%zmm26
+ vpmuludq %zmm9,%zmm17,%zmm28
+ vpmuludq %zmm9,%zmm18,%zmm29
+ vpmuludq %zmm9,%zmm16,%zmm27
+ vpsrlq $32,%zmm20,%zmm6
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm27,%zmm13,%zmm13
+
+ vpmuludq %zmm10,%zmm22,%zmm25
+ vpmuludq %zmm10,%zmm16,%zmm28
+ vpmuludq %zmm10,%zmm17,%zmm29
+ vpmuludq %zmm10,%zmm23,%zmm26
+ vpmuludq %zmm10,%zmm24,%zmm27
+ vpaddq %zmm25,%zmm11,%zmm11
+ vpaddq %zmm28,%zmm14,%zmm14
+ vpaddq %zmm29,%zmm15,%zmm15
+ vpaddq %zmm26,%zmm12,%zmm12
+ vpaddq %zmm27,%zmm13,%zmm13
vpmuludq %zmm6,%zmm24,%zmm28
vpmuludq %zmm6,%zmm16,%zmm29
@@ -2048,15 +2419,10 @@ poly1305_blocks_avx512:
vpaddq %zmm26,%zmm12,%zmm12
vpaddq %zmm27,%zmm13,%zmm13
-
-
vmovdqu64 0(%rsi),%zmm10
vmovdqu64 64(%rsi),%zmm6
leaq 128(%rsi),%rsi
-
-
-
vpsrlq $26,%zmm14,%zmm28
vpandq %zmm5,%zmm14,%zmm14
vpaddq %zmm28,%zmm15,%zmm15
@@ -2088,18 +2454,9 @@ poly1305_blocks_avx512:
vpandq %zmm5,%zmm14,%zmm14
vpaddq %zmm28,%zmm15,%zmm15
-
-
-
-
vpunpcklqdq %zmm6,%zmm10,%zmm7
vpunpckhqdq %zmm6,%zmm10,%zmm6
-
-
-
-
-
vmovdqa32 128(%rcx),%zmm25
movl $0x7777,%eax
kmovw %eax,%k1
@@ -2136,9 +2493,6 @@ poly1305_blocks_avx512:
vpandq %zmm5,%zmm9,%zmm9
vpandq %zmm5,%zmm7,%zmm7
-
-
-
vpaddq %zmm2,%zmm9,%zmm2
subq $192,%rdx
jbe .Ltail_avx512
@@ -2147,33 +2501,6 @@ poly1305_blocks_avx512:
.align 32
.Loop_avx512:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
vpmuludq %zmm2,%zmm17,%zmm14
vpaddq %zmm0,%zmm7,%zmm0
vpmuludq %zmm2,%zmm18,%zmm15
@@ -2238,9 +2565,6 @@ poly1305_blocks_avx512:
vpaddq %zmm26,%zmm12,%zmm1
vpaddq %zmm27,%zmm13,%zmm2
-
-
-
vpsrlq $52,%zmm7,%zmm9
vpsllq $12,%zmm6,%zmm10
@@ -2288,18 +2612,11 @@ poly1305_blocks_avx512:
vpandq %zmm5,%zmm7,%zmm7
-
-
-
subq $128,%rdx
ja .Loop_avx512
.Ltail_avx512:
-
-
-
-
vpsrlq $32,%zmm16,%zmm16
vpsrlq $32,%zmm17,%zmm17
vpsrlq $32,%zmm18,%zmm18
@@ -2310,11 +2627,8 @@ poly1305_blocks_avx512:
vpsrlq $32,%zmm21,%zmm21
vpsrlq $32,%zmm22,%zmm22
-
-
leaq (%rsi,%rdx,1),%rsi
-
vpaddq %zmm0,%zmm7,%zmm0
vpmuludq %zmm2,%zmm17,%zmm14
@@ -2378,9 +2692,6 @@ poly1305_blocks_avx512:
vpaddq %zmm26,%zmm12,%zmm1
vpaddq %zmm27,%zmm13,%zmm2
-
-
-
movl $1,%eax
vpermq $0xb1,%zmm3,%zmm14
vpermq $0xb1,%zmm15,%zmm4
@@ -2416,8 +2727,6 @@ poly1305_blocks_avx512:
vpaddq %zmm12,%zmm1,%zmm1{%k3}{z}
vpaddq %zmm13,%zmm2,%zmm2{%k3}{z}
-
-
vpsrlq $26,%ymm3,%ymm14
vpand %ymm5,%ymm3,%ymm3
vpsrldq $6,%ymm7,%ymm9
@@ -2466,7 +2775,7 @@ poly1305_blocks_avx512:
leaq 144(%rsp),%rax
addq $64,%rdx
- jnz .Ltail_avx2
+ jnz .Ltail_avx2_512
vpsubq %ymm9,%ymm2,%ymm2
vmovd %xmm0,-112(%rdi)
@@ -2475,1091 +2784,9 @@ poly1305_blocks_avx512:
vmovd %xmm3,-100(%rdi)
vmovd %xmm4,-96(%rdi)
vzeroall
- leaq 8(%r11),%rsp
-.cfi_def_cfa %rsp,8
- .byte 0xf3,0xc3
-.cfi_endproc
-.size poly1305_blocks_avx512,.-poly1305_blocks_avx512
-.type poly1305_init_base2_44,@function
-.align 32
-poly1305_init_base2_44:
- xorq %rax,%rax
- movq %rax,0(%rdi)
- movq %rax,8(%rdi)
- movq %rax,16(%rdi)
-
-.Linit_base2_44:
- leaq poly1305_blocks_vpmadd52(%rip),%r10
- leaq poly1305_emit_base2_44(%rip),%r11
-
- movq $0x0ffffffc0fffffff,%rax
- movq $0x0ffffffc0ffffffc,%rcx
- andq 0(%rsi),%rax
- movq $0x00000fffffffffff,%r8
- andq 8(%rsi),%rcx
- movq $0x00000fffffffffff,%r9
- andq %rax,%r8
- shrdq $44,%rcx,%rax
- movq %r8,40(%rdi)
- andq %r9,%rax
- shrq $24,%rcx
- movq %rax,48(%rdi)
- leaq (%rax,%rax,4),%rax
- movq %rcx,56(%rdi)
- shlq $2,%rax
- leaq (%rcx,%rcx,4),%rcx
- shlq $2,%rcx
- movq %rax,24(%rdi)
- movq %rcx,32(%rdi)
- movq $-1,64(%rdi)
- movq %r10,0(%rdx)
- movq %r11,8(%rdx)
- movl $1,%eax
- .byte 0xf3,0xc3
-.size poly1305_init_base2_44,.-poly1305_init_base2_44
-.type poly1305_blocks_vpmadd52,@function
-.align 32
-poly1305_blocks_vpmadd52:
- shrq $4,%rdx
- jz .Lno_data_vpmadd52
-
- shlq $40,%rcx
- movq 64(%rdi),%r8
-
-
-
-
-
-
- movq $3,%rax
- movq $1,%r10
- cmpq $4,%rdx
- cmovaeq %r10,%rax
- testq %r8,%r8
- cmovnsq %r10,%rax
-
- andq %rdx,%rax
- jz .Lblocks_vpmadd52_4x
-
- subq %rax,%rdx
- movl $7,%r10d
- movl $1,%r11d
- kmovw %r10d,%k7
- leaq .L2_44_inp_permd(%rip),%r10
- kmovw %r11d,%k1
-
- vmovq %rcx,%xmm21
- vmovdqa64 0(%r10),%ymm19
- vmovdqa64 32(%r10),%ymm20
- vpermq $0xcf,%ymm21,%ymm21
- vmovdqa64 64(%r10),%ymm22
-
- vmovdqu64 0(%rdi),%ymm16{%k7}{z}
- vmovdqu64 40(%rdi),%ymm3{%k7}{z}
- vmovdqu64 32(%rdi),%ymm4{%k7}{z}
- vmovdqu64 24(%rdi),%ymm5{%k7}{z}
-
- vmovdqa64 96(%r10),%ymm23
- vmovdqa64 128(%r10),%ymm24
-
- jmp .Loop_vpmadd52
-
-.align 32
-.Loop_vpmadd52:
- vmovdqu32 0(%rsi),%xmm18
- leaq 16(%rsi),%rsi
-
- vpermd %ymm18,%ymm19,%ymm18
- vpsrlvq %ymm20,%ymm18,%ymm18
- vpandq %ymm22,%ymm18,%ymm18
- vporq %ymm21,%ymm18,%ymm18
-
- vpaddq %ymm18,%ymm16,%ymm16
-
- vpermq $0,%ymm16,%ymm0{%k7}{z}
- vpermq $85,%ymm16,%ymm1{%k7}{z}
- vpermq $170,%ymm16,%ymm2{%k7}{z}
-
- vpxord %ymm16,%ymm16,%ymm16
- vpxord %ymm17,%ymm17,%ymm17
-
- vpmadd52luq %ymm3,%ymm0,%ymm16
- vpmadd52huq %ymm3,%ymm0,%ymm17
-
- vpmadd52luq %ymm4,%ymm1,%ymm16
- vpmadd52huq %ymm4,%ymm1,%ymm17
-
- vpmadd52luq %ymm5,%ymm2,%ymm16
- vpmadd52huq %ymm5,%ymm2,%ymm17
-
- vpsrlvq %ymm23,%ymm16,%ymm18
- vpsllvq %ymm24,%ymm17,%ymm17
- vpandq %ymm22,%ymm16,%ymm16
-
- vpaddq %ymm18,%ymm17,%ymm17
-
- vpermq $147,%ymm17,%ymm17
-
- vpaddq %ymm17,%ymm16,%ymm16
-
- vpsrlvq %ymm23,%ymm16,%ymm18
- vpandq %ymm22,%ymm16,%ymm16
-
- vpermq $147,%ymm18,%ymm18
-
- vpaddq %ymm18,%ymm16,%ymm16
-
- vpermq $147,%ymm16,%ymm18{%k1}{z}
-
- vpaddq %ymm18,%ymm16,%ymm16
- vpsllq $2,%ymm18,%ymm18
-
- vpaddq %ymm18,%ymm16,%ymm16
-
- decq %rax
- jnz .Loop_vpmadd52
-
- vmovdqu64 %ymm16,0(%rdi){%k7}
-
- testq %rdx,%rdx
- jnz .Lblocks_vpmadd52_4x
-
-.Lno_data_vpmadd52:
- .byte 0xf3,0xc3
-.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
-.type poly1305_blocks_vpmadd52_4x,@function
-.align 32
-poly1305_blocks_vpmadd52_4x:
- shrq $4,%rdx
- jz .Lno_data_vpmadd52_4x
-
- shlq $40,%rcx
- movq 64(%rdi),%r8
-
-.Lblocks_vpmadd52_4x:
- vpbroadcastq %rcx,%ymm31
-
- vmovdqa64 .Lx_mask44(%rip),%ymm28
- movl $5,%eax
- vmovdqa64 .Lx_mask42(%rip),%ymm29
- kmovw %eax,%k1
-
- testq %r8,%r8
- js .Linit_vpmadd52
-
- vmovq 0(%rdi),%xmm0
- vmovq 8(%rdi),%xmm1
- vmovq 16(%rdi),%xmm2
-
- testq $3,%rdx
- jnz .Lblocks_vpmadd52_2x_do
-
-.Lblocks_vpmadd52_4x_do:
- vpbroadcastq 64(%rdi),%ymm3
- vpbroadcastq 96(%rdi),%ymm4
- vpbroadcastq 128(%rdi),%ymm5
- vpbroadcastq 160(%rdi),%ymm16
-
-.Lblocks_vpmadd52_4x_key_loaded:
- vpsllq $2,%ymm5,%ymm17
- vpaddq %ymm5,%ymm17,%ymm17
- vpsllq $2,%ymm17,%ymm17
-
- testq $7,%rdx
- jz .Lblocks_vpmadd52_8x
-
- vmovdqu64 0(%rsi),%ymm26
- vmovdqu64 32(%rsi),%ymm27
- leaq 64(%rsi),%rsi
-
- vpunpcklqdq %ymm27,%ymm26,%ymm25
- vpunpckhqdq %ymm27,%ymm26,%ymm27
-
-
-
- vpsrlq $24,%ymm27,%ymm26
- vporq %ymm31,%ymm26,%ymm26
- vpaddq %ymm26,%ymm2,%ymm2
- vpandq %ymm28,%ymm25,%ymm24
- vpsrlq $44,%ymm25,%ymm25
- vpsllq $20,%ymm27,%ymm27
- vporq %ymm27,%ymm25,%ymm25
- vpandq %ymm28,%ymm25,%ymm25
-
- subq $4,%rdx
- jz .Ltail_vpmadd52_4x
- jmp .Loop_vpmadd52_4x
- ud2
-
-.align 32
-.Linit_vpmadd52:
- vmovq 24(%rdi),%xmm16
- vmovq 56(%rdi),%xmm2
- vmovq 32(%rdi),%xmm17
- vmovq 40(%rdi),%xmm3
- vmovq 48(%rdi),%xmm4
-
- vmovdqa %ymm3,%ymm0
- vmovdqa %ymm4,%ymm1
- vmovdqa %ymm2,%ymm5
-
- movl $2,%eax
-
-.Lmul_init_vpmadd52:
- vpxorq %ymm18,%ymm18,%ymm18
- vpmadd52luq %ymm2,%ymm16,%ymm18
- vpxorq %ymm19,%ymm19,%ymm19
- vpmadd52huq %ymm2,%ymm16,%ymm19
- vpxorq %ymm20,%ymm20,%ymm20
- vpmadd52luq %ymm2,%ymm17,%ymm20
- vpxorq %ymm21,%ymm21,%ymm21
- vpmadd52huq %ymm2,%ymm17,%ymm21
- vpxorq %ymm22,%ymm22,%ymm22
- vpmadd52luq %ymm2,%ymm3,%ymm22
- vpxorq %ymm23,%ymm23,%ymm23
- vpmadd52huq %ymm2,%ymm3,%ymm23
-
- vpmadd52luq %ymm0,%ymm3,%ymm18
- vpmadd52huq %ymm0,%ymm3,%ymm19
- vpmadd52luq %ymm0,%ymm4,%ymm20
- vpmadd52huq %ymm0,%ymm4,%ymm21
- vpmadd52luq %ymm0,%ymm5,%ymm22
- vpmadd52huq %ymm0,%ymm5,%ymm23
-
- vpmadd52luq %ymm1,%ymm17,%ymm18
- vpmadd52huq %ymm1,%ymm17,%ymm19
- vpmadd52luq %ymm1,%ymm3,%ymm20
- vpmadd52huq %ymm1,%ymm3,%ymm21
- vpmadd52luq %ymm1,%ymm4,%ymm22
- vpmadd52huq %ymm1,%ymm4,%ymm23
-
-
-
- vpsrlq $44,%ymm18,%ymm30
- vpsllq $8,%ymm19,%ymm19
- vpandq %ymm28,%ymm18,%ymm0
- vpaddq %ymm30,%ymm19,%ymm19
-
- vpaddq %ymm19,%ymm20,%ymm20
-
- vpsrlq $44,%ymm20,%ymm30
- vpsllq $8,%ymm21,%ymm21
- vpandq %ymm28,%ymm20,%ymm1
- vpaddq %ymm30,%ymm21,%ymm21
-
- vpaddq %ymm21,%ymm22,%ymm22
-
- vpsrlq $42,%ymm22,%ymm30
- vpsllq $10,%ymm23,%ymm23
- vpandq %ymm29,%ymm22,%ymm2
- vpaddq %ymm30,%ymm23,%ymm23
-
- vpaddq %ymm23,%ymm0,%ymm0
- vpsllq $2,%ymm23,%ymm23
-
- vpaddq %ymm23,%ymm0,%ymm0
-
- vpsrlq $44,%ymm0,%ymm30
- vpandq %ymm28,%ymm0,%ymm0
-
- vpaddq %ymm30,%ymm1,%ymm1
-
- decl %eax
- jz .Ldone_init_vpmadd52
-
- vpunpcklqdq %ymm4,%ymm1,%ymm4
- vpbroadcastq %xmm1,%xmm1
- vpunpcklqdq %ymm5,%ymm2,%ymm5
- vpbroadcastq %xmm2,%xmm2
- vpunpcklqdq %ymm3,%ymm0,%ymm3
- vpbroadcastq %xmm0,%xmm0
-
- vpsllq $2,%ymm4,%ymm16
- vpsllq $2,%ymm5,%ymm17
- vpaddq %ymm4,%ymm16,%ymm16
- vpaddq %ymm5,%ymm17,%ymm17
- vpsllq $2,%ymm16,%ymm16
- vpsllq $2,%ymm17,%ymm17
-
- jmp .Lmul_init_vpmadd52
- ud2
-
-.align 32
-.Ldone_init_vpmadd52:
- vinserti128 $1,%xmm4,%ymm1,%ymm4
- vinserti128 $1,%xmm5,%ymm2,%ymm5
- vinserti128 $1,%xmm3,%ymm0,%ymm3
-
- vpermq $216,%ymm4,%ymm4
- vpermq $216,%ymm5,%ymm5
- vpermq $216,%ymm3,%ymm3
-
- vpsllq $2,%ymm4,%ymm16
- vpaddq %ymm4,%ymm16,%ymm16
- vpsllq $2,%ymm16,%ymm16
-
- vmovq 0(%rdi),%xmm0
- vmovq 8(%rdi),%xmm1
- vmovq 16(%rdi),%xmm2
-
- testq $3,%rdx
- jnz .Ldone_init_vpmadd52_2x
-
- vmovdqu64 %ymm3,64(%rdi)
- vpbroadcastq %xmm3,%ymm3
- vmovdqu64 %ymm4,96(%rdi)
- vpbroadcastq %xmm4,%ymm4
- vmovdqu64 %ymm5,128(%rdi)
- vpbroadcastq %xmm5,%ymm5
- vmovdqu64 %ymm16,160(%rdi)
- vpbroadcastq %xmm16,%ymm16
-
- jmp .Lblocks_vpmadd52_4x_key_loaded
- ud2
-
-.align 32
-.Ldone_init_vpmadd52_2x:
- vmovdqu64 %ymm3,64(%rdi)
- vpsrldq $8,%ymm3,%ymm3
- vmovdqu64 %ymm4,96(%rdi)
- vpsrldq $8,%ymm4,%ymm4
- vmovdqu64 %ymm5,128(%rdi)
- vpsrldq $8,%ymm5,%ymm5
- vmovdqu64 %ymm16,160(%rdi)
- vpsrldq $8,%ymm16,%ymm16
- jmp .Lblocks_vpmadd52_2x_key_loaded
- ud2
-
-.align 32
-.Lblocks_vpmadd52_2x_do:
- vmovdqu64 128+8(%rdi),%ymm5{%k1}{z}
- vmovdqu64 160+8(%rdi),%ymm16{%k1}{z}
- vmovdqu64 64+8(%rdi),%ymm3{%k1}{z}
- vmovdqu64 96+8(%rdi),%ymm4{%k1}{z}
-
-.Lblocks_vpmadd52_2x_key_loaded:
- vmovdqu64 0(%rsi),%ymm26
- vpxorq %ymm27,%ymm27,%ymm27
- leaq 32(%rsi),%rsi
-
- vpunpcklqdq %ymm27,%ymm26,%ymm25
- vpunpckhqdq %ymm27,%ymm26,%ymm27
-
-
-
- vpsrlq $24,%ymm27,%ymm26
- vporq %ymm31,%ymm26,%ymm26
- vpaddq %ymm26,%ymm2,%ymm2
- vpandq %ymm28,%ymm25,%ymm24
- vpsrlq $44,%ymm25,%ymm25
- vpsllq $20,%ymm27,%ymm27
- vporq %ymm27,%ymm25,%ymm25
- vpandq %ymm28,%ymm25,%ymm25
-
- jmp .Ltail_vpmadd52_2x
- ud2
-
-.align 32
-.Loop_vpmadd52_4x:
-
- vpaddq %ymm24,%ymm0,%ymm0
- vpaddq %ymm25,%ymm1,%ymm1
-
- vpxorq %ymm18,%ymm18,%ymm18
- vpmadd52luq %ymm2,%ymm16,%ymm18
- vpxorq %ymm19,%ymm19,%ymm19
- vpmadd52huq %ymm2,%ymm16,%ymm19
- vpxorq %ymm20,%ymm20,%ymm20
- vpmadd52luq %ymm2,%ymm17,%ymm20
- vpxorq %ymm21,%ymm21,%ymm21
- vpmadd52huq %ymm2,%ymm17,%ymm21
- vpxorq %ymm22,%ymm22,%ymm22
- vpmadd52luq %ymm2,%ymm3,%ymm22
- vpxorq %ymm23,%ymm23,%ymm23
- vpmadd52huq %ymm2,%ymm3,%ymm23
-
- vmovdqu64 0(%rsi),%ymm26
- vmovdqu64 32(%rsi),%ymm27
- leaq 64(%rsi),%rsi
- vpmadd52luq %ymm0,%ymm3,%ymm18
- vpmadd52huq %ymm0,%ymm3,%ymm19
- vpmadd52luq %ymm0,%ymm4,%ymm20
- vpmadd52huq %ymm0,%ymm4,%ymm21
- vpmadd52luq %ymm0,%ymm5,%ymm22
- vpmadd52huq %ymm0,%ymm5,%ymm23
-
- vpunpcklqdq %ymm27,%ymm26,%ymm25
- vpunpckhqdq %ymm27,%ymm26,%ymm27
- vpmadd52luq %ymm1,%ymm17,%ymm18
- vpmadd52huq %ymm1,%ymm17,%ymm19
- vpmadd52luq %ymm1,%ymm3,%ymm20
- vpmadd52huq %ymm1,%ymm3,%ymm21
- vpmadd52luq %ymm1,%ymm4,%ymm22
- vpmadd52huq %ymm1,%ymm4,%ymm23
-
-
-
- vpsrlq $44,%ymm18,%ymm30
- vpsllq $8,%ymm19,%ymm19
- vpandq %ymm28,%ymm18,%ymm0
- vpaddq %ymm30,%ymm19,%ymm19
-
- vpsrlq $24,%ymm27,%ymm26
- vporq %ymm31,%ymm26,%ymm26
- vpaddq %ymm19,%ymm20,%ymm20
-
- vpsrlq $44,%ymm20,%ymm30
- vpsllq $8,%ymm21,%ymm21
- vpandq %ymm28,%ymm20,%ymm1
- vpaddq %ymm30,%ymm21,%ymm21
-
- vpandq %ymm28,%ymm25,%ymm24
- vpsrlq $44,%ymm25,%ymm25
- vpsllq $20,%ymm27,%ymm27
- vpaddq %ymm21,%ymm22,%ymm22
-
- vpsrlq $42,%ymm22,%ymm30
- vpsllq $10,%ymm23,%ymm23
- vpandq %ymm29,%ymm22,%ymm2
- vpaddq %ymm30,%ymm23,%ymm23
-
- vpaddq %ymm26,%ymm2,%ymm2
- vpaddq %ymm23,%ymm0,%ymm0
- vpsllq $2,%ymm23,%ymm23
-
- vpaddq %ymm23,%ymm0,%ymm0
- vporq %ymm27,%ymm25,%ymm25
- vpandq %ymm28,%ymm25,%ymm25
-
- vpsrlq $44,%ymm0,%ymm30
- vpandq %ymm28,%ymm0,%ymm0
-
- vpaddq %ymm30,%ymm1,%ymm1
-
- subq $4,%rdx
- jnz .Loop_vpmadd52_4x
-
-.Ltail_vpmadd52_4x:
- vmovdqu64 128(%rdi),%ymm5
- vmovdqu64 160(%rdi),%ymm16
- vmovdqu64 64(%rdi),%ymm3
- vmovdqu64 96(%rdi),%ymm4
-
-.Ltail_vpmadd52_2x:
- vpsllq $2,%ymm5,%ymm17
- vpaddq %ymm5,%ymm17,%ymm17
- vpsllq $2,%ymm17,%ymm17
-
-
- vpaddq %ymm24,%ymm0,%ymm0
- vpaddq %ymm25,%ymm1,%ymm1
-
- vpxorq %ymm18,%ymm18,%ymm18
- vpmadd52luq %ymm2,%ymm16,%ymm18
- vpxorq %ymm19,%ymm19,%ymm19
- vpmadd52huq %ymm2,%ymm16,%ymm19
- vpxorq %ymm20,%ymm20,%ymm20
- vpmadd52luq %ymm2,%ymm17,%ymm20
- vpxorq %ymm21,%ymm21,%ymm21
- vpmadd52huq %ymm2,%ymm17,%ymm21
- vpxorq %ymm22,%ymm22,%ymm22
- vpmadd52luq %ymm2,%ymm3,%ymm22
- vpxorq %ymm23,%ymm23,%ymm23
- vpmadd52huq %ymm2,%ymm3,%ymm23
-
- vpmadd52luq %ymm0,%ymm3,%ymm18
- vpmadd52huq %ymm0,%ymm3,%ymm19
- vpmadd52luq %ymm0,%ymm4,%ymm20
- vpmadd52huq %ymm0,%ymm4,%ymm21
- vpmadd52luq %ymm0,%ymm5,%ymm22
- vpmadd52huq %ymm0,%ymm5,%ymm23
-
- vpmadd52luq %ymm1,%ymm17,%ymm18
- vpmadd52huq %ymm1,%ymm17,%ymm19
- vpmadd52luq %ymm1,%ymm3,%ymm20
- vpmadd52huq %ymm1,%ymm3,%ymm21
- vpmadd52luq %ymm1,%ymm4,%ymm22
- vpmadd52huq %ymm1,%ymm4,%ymm23
-
-
-
-
- movl $1,%eax
- kmovw %eax,%k1
- vpsrldq $8,%ymm18,%ymm24
- vpsrldq $8,%ymm19,%ymm0
- vpsrldq $8,%ymm20,%ymm25
- vpsrldq $8,%ymm21,%ymm1
- vpaddq %ymm24,%ymm18,%ymm18
- vpaddq %ymm0,%ymm19,%ymm19
- vpsrldq $8,%ymm22,%ymm26
- vpsrldq $8,%ymm23,%ymm2
- vpaddq %ymm25,%ymm20,%ymm20
- vpaddq %ymm1,%ymm21,%ymm21
- vpermq $0x2,%ymm18,%ymm24
- vpermq $0x2,%ymm19,%ymm0
- vpaddq %ymm26,%ymm22,%ymm22
- vpaddq %ymm2,%ymm23,%ymm23
-
- vpermq $0x2,%ymm20,%ymm25
- vpermq $0x2,%ymm21,%ymm1
- vpaddq %ymm24,%ymm18,%ymm18{%k1}{z}
- vpaddq %ymm0,%ymm19,%ymm19{%k1}{z}
- vpermq $0x2,%ymm22,%ymm26
- vpermq $0x2,%ymm23,%ymm2
- vpaddq %ymm25,%ymm20,%ymm20{%k1}{z}
- vpaddq %ymm1,%ymm21,%ymm21{%k1}{z}
- vpaddq %ymm26,%ymm22,%ymm22{%k1}{z}
- vpaddq %ymm2,%ymm23,%ymm23{%k1}{z}
-
-
-
- vpsrlq $44,%ymm18,%ymm30
- vpsllq $8,%ymm19,%ymm19
- vpandq %ymm28,%ymm18,%ymm0
- vpaddq %ymm30,%ymm19,%ymm19
-
- vpaddq %ymm19,%ymm20,%ymm20
-
- vpsrlq $44,%ymm20,%ymm30
- vpsllq $8,%ymm21,%ymm21
- vpandq %ymm28,%ymm20,%ymm1
- vpaddq %ymm30,%ymm21,%ymm21
-
- vpaddq %ymm21,%ymm22,%ymm22
-
- vpsrlq $42,%ymm22,%ymm30
- vpsllq $10,%ymm23,%ymm23
- vpandq %ymm29,%ymm22,%ymm2
- vpaddq %ymm30,%ymm23,%ymm23
-
- vpaddq %ymm23,%ymm0,%ymm0
- vpsllq $2,%ymm23,%ymm23
-
- vpaddq %ymm23,%ymm0,%ymm0
-
- vpsrlq $44,%ymm0,%ymm30
- vpandq %ymm28,%ymm0,%ymm0
-
- vpaddq %ymm30,%ymm1,%ymm1
-
-
- subq $2,%rdx
- ja .Lblocks_vpmadd52_4x_do
-
- vmovq %xmm0,0(%rdi)
- vmovq %xmm1,8(%rdi)
- vmovq %xmm2,16(%rdi)
- vzeroall
-
-.Lno_data_vpmadd52_4x:
- .byte 0xf3,0xc3
-.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
-.type poly1305_blocks_vpmadd52_8x,@function
-.align 32
-poly1305_blocks_vpmadd52_8x:
- shrq $4,%rdx
- jz .Lno_data_vpmadd52_8x
-
- shlq $40,%rcx
- movq 64(%rdi),%r8
-
- vmovdqa64 .Lx_mask44(%rip),%ymm28
- vmovdqa64 .Lx_mask42(%rip),%ymm29
-
- testq %r8,%r8
- js .Linit_vpmadd52
-
- vmovq 0(%rdi),%xmm0
- vmovq 8(%rdi),%xmm1
- vmovq 16(%rdi),%xmm2
-
-.Lblocks_vpmadd52_8x:
-
-
-
- vmovdqu64 128(%rdi),%ymm5
- vmovdqu64 160(%rdi),%ymm16
- vmovdqu64 64(%rdi),%ymm3
- vmovdqu64 96(%rdi),%ymm4
-
- vpsllq $2,%ymm5,%ymm17
- vpaddq %ymm5,%ymm17,%ymm17
- vpsllq $2,%ymm17,%ymm17
-
- vpbroadcastq %xmm5,%ymm8
- vpbroadcastq %xmm3,%ymm6
- vpbroadcastq %xmm4,%ymm7
-
- vpxorq %ymm18,%ymm18,%ymm18
- vpmadd52luq %ymm8,%ymm16,%ymm18
- vpxorq %ymm19,%ymm19,%ymm19
- vpmadd52huq %ymm8,%ymm16,%ymm19
- vpxorq %ymm20,%ymm20,%ymm20
- vpmadd52luq %ymm8,%ymm17,%ymm20
- vpxorq %ymm21,%ymm21,%ymm21
- vpmadd52huq %ymm8,%ymm17,%ymm21
- vpxorq %ymm22,%ymm22,%ymm22
- vpmadd52luq %ymm8,%ymm3,%ymm22
- vpxorq %ymm23,%ymm23,%ymm23
- vpmadd52huq %ymm8,%ymm3,%ymm23
-
- vpmadd52luq %ymm6,%ymm3,%ymm18
- vpmadd52huq %ymm6,%ymm3,%ymm19
- vpmadd52luq %ymm6,%ymm4,%ymm20
- vpmadd52huq %ymm6,%ymm4,%ymm21
- vpmadd52luq %ymm6,%ymm5,%ymm22
- vpmadd52huq %ymm6,%ymm5,%ymm23
-
- vpmadd52luq %ymm7,%ymm17,%ymm18
- vpmadd52huq %ymm7,%ymm17,%ymm19
- vpmadd52luq %ymm7,%ymm3,%ymm20
- vpmadd52huq %ymm7,%ymm3,%ymm21
- vpmadd52luq %ymm7,%ymm4,%ymm22
- vpmadd52huq %ymm7,%ymm4,%ymm23
-
-
-
- vpsrlq $44,%ymm18,%ymm30
- vpsllq $8,%ymm19,%ymm19
- vpandq %ymm28,%ymm18,%ymm6
- vpaddq %ymm30,%ymm19,%ymm19
-
- vpaddq %ymm19,%ymm20,%ymm20
-
- vpsrlq $44,%ymm20,%ymm30
- vpsllq $8,%ymm21,%ymm21
- vpandq %ymm28,%ymm20,%ymm7
- vpaddq %ymm30,%ymm21,%ymm21
-
- vpaddq %ymm21,%ymm22,%ymm22
-
- vpsrlq $42,%ymm22,%ymm30
- vpsllq $10,%ymm23,%ymm23
- vpandq %ymm29,%ymm22,%ymm8
- vpaddq %ymm30,%ymm23,%ymm23
-
- vpaddq %ymm23,%ymm6,%ymm6
- vpsllq $2,%ymm23,%ymm23
-
- vpaddq %ymm23,%ymm6,%ymm6
-
- vpsrlq $44,%ymm6,%ymm30
- vpandq %ymm28,%ymm6,%ymm6
-
- vpaddq %ymm30,%ymm7,%ymm7
-
-
-
-
-
- vpunpcklqdq %ymm5,%ymm8,%ymm26
- vpunpckhqdq %ymm5,%ymm8,%ymm5
- vpunpcklqdq %ymm3,%ymm6,%ymm24
- vpunpckhqdq %ymm3,%ymm6,%ymm3
- vpunpcklqdq %ymm4,%ymm7,%ymm25
- vpunpckhqdq %ymm4,%ymm7,%ymm4
- vshufi64x2 $0x44,%zmm5,%zmm26,%zmm8
- vshufi64x2 $0x44,%zmm3,%zmm24,%zmm6
- vshufi64x2 $0x44,%zmm4,%zmm25,%zmm7
-
- vmovdqu64 0(%rsi),%zmm26
- vmovdqu64 64(%rsi),%zmm27
- leaq 128(%rsi),%rsi
-
- vpsllq $2,%zmm8,%zmm10
- vpsllq $2,%zmm7,%zmm9
- vpaddq %zmm8,%zmm10,%zmm10
- vpaddq %zmm7,%zmm9,%zmm9
- vpsllq $2,%zmm10,%zmm10
- vpsllq $2,%zmm9,%zmm9
-
- vpbroadcastq %rcx,%zmm31
- vpbroadcastq %xmm28,%zmm28
- vpbroadcastq %xmm29,%zmm29
-
- vpbroadcastq %xmm9,%zmm16
- vpbroadcastq %xmm10,%zmm17
- vpbroadcastq %xmm6,%zmm3
- vpbroadcastq %xmm7,%zmm4
- vpbroadcastq %xmm8,%zmm5
-
- vpunpcklqdq %zmm27,%zmm26,%zmm25
- vpunpckhqdq %zmm27,%zmm26,%zmm27
-
-
-
- vpsrlq $24,%zmm27,%zmm26
- vporq %zmm31,%zmm26,%zmm26
- vpaddq %zmm26,%zmm2,%zmm2
- vpandq %zmm28,%zmm25,%zmm24
- vpsrlq $44,%zmm25,%zmm25
- vpsllq $20,%zmm27,%zmm27
- vporq %zmm27,%zmm25,%zmm25
- vpandq %zmm28,%zmm25,%zmm25
-
- subq $8,%rdx
- jz .Ltail_vpmadd52_8x
- jmp .Loop_vpmadd52_8x
-
-.align 32
-.Loop_vpmadd52_8x:
-
- vpaddq %zmm24,%zmm0,%zmm0
- vpaddq %zmm25,%zmm1,%zmm1
-
- vpxorq %zmm18,%zmm18,%zmm18
- vpmadd52luq %zmm2,%zmm16,%zmm18
- vpxorq %zmm19,%zmm19,%zmm19
- vpmadd52huq %zmm2,%zmm16,%zmm19
- vpxorq %zmm20,%zmm20,%zmm20
- vpmadd52luq %zmm2,%zmm17,%zmm20
- vpxorq %zmm21,%zmm21,%zmm21
- vpmadd52huq %zmm2,%zmm17,%zmm21
- vpxorq %zmm22,%zmm22,%zmm22
- vpmadd52luq %zmm2,%zmm3,%zmm22
- vpxorq %zmm23,%zmm23,%zmm23
- vpmadd52huq %zmm2,%zmm3,%zmm23
-
- vmovdqu64 0(%rsi),%zmm26
- vmovdqu64 64(%rsi),%zmm27
- leaq 128(%rsi),%rsi
- vpmadd52luq %zmm0,%zmm3,%zmm18
- vpmadd52huq %zmm0,%zmm3,%zmm19
- vpmadd52luq %zmm0,%zmm4,%zmm20
- vpmadd52huq %zmm0,%zmm4,%zmm21
- vpmadd52luq %zmm0,%zmm5,%zmm22
- vpmadd52huq %zmm0,%zmm5,%zmm23
-
- vpunpcklqdq %zmm27,%zmm26,%zmm25
- vpunpckhqdq %zmm27,%zmm26,%zmm27
- vpmadd52luq %zmm1,%zmm17,%zmm18
- vpmadd52huq %zmm1,%zmm17,%zmm19
- vpmadd52luq %zmm1,%zmm3,%zmm20
- vpmadd52huq %zmm1,%zmm3,%zmm21
- vpmadd52luq %zmm1,%zmm4,%zmm22
- vpmadd52huq %zmm1,%zmm4,%zmm23
-
-
-
- vpsrlq $44,%zmm18,%zmm30
- vpsllq $8,%zmm19,%zmm19
- vpandq %zmm28,%zmm18,%zmm0
- vpaddq %zmm30,%zmm19,%zmm19
-
- vpsrlq $24,%zmm27,%zmm26
- vporq %zmm31,%zmm26,%zmm26
- vpaddq %zmm19,%zmm20,%zmm20
-
- vpsrlq $44,%zmm20,%zmm30
- vpsllq $8,%zmm21,%zmm21
- vpandq %zmm28,%zmm20,%zmm1
- vpaddq %zmm30,%zmm21,%zmm21
-
- vpandq %zmm28,%zmm25,%zmm24
- vpsrlq $44,%zmm25,%zmm25
- vpsllq $20,%zmm27,%zmm27
- vpaddq %zmm21,%zmm22,%zmm22
-
- vpsrlq $42,%zmm22,%zmm30
- vpsllq $10,%zmm23,%zmm23
- vpandq %zmm29,%zmm22,%zmm2
- vpaddq %zmm30,%zmm23,%zmm23
-
- vpaddq %zmm26,%zmm2,%zmm2
- vpaddq %zmm23,%zmm0,%zmm0
- vpsllq $2,%zmm23,%zmm23
-
- vpaddq %zmm23,%zmm0,%zmm0
- vporq %zmm27,%zmm25,%zmm25
- vpandq %zmm28,%zmm25,%zmm25
-
- vpsrlq $44,%zmm0,%zmm30
- vpandq %zmm28,%zmm0,%zmm0
-
- vpaddq %zmm30,%zmm1,%zmm1
-
- subq $8,%rdx
- jnz .Loop_vpmadd52_8x
-
-.Ltail_vpmadd52_8x:
-
- vpaddq %zmm24,%zmm0,%zmm0
- vpaddq %zmm25,%zmm1,%zmm1
-
- vpxorq %zmm18,%zmm18,%zmm18
- vpmadd52luq %zmm2,%zmm9,%zmm18
- vpxorq %zmm19,%zmm19,%zmm19
- vpmadd52huq %zmm2,%zmm9,%zmm19
- vpxorq %zmm20,%zmm20,%zmm20
- vpmadd52luq %zmm2,%zmm10,%zmm20
- vpxorq %zmm21,%zmm21,%zmm21
- vpmadd52huq %zmm2,%zmm10,%zmm21
- vpxorq %zmm22,%zmm22,%zmm22
- vpmadd52luq %zmm2,%zmm6,%zmm22
- vpxorq %zmm23,%zmm23,%zmm23
- vpmadd52huq %zmm2,%zmm6,%zmm23
-
- vpmadd52luq %zmm0,%zmm6,%zmm18
- vpmadd52huq %zmm0,%zmm6,%zmm19
- vpmadd52luq %zmm0,%zmm7,%zmm20
- vpmadd52huq %zmm0,%zmm7,%zmm21
- vpmadd52luq %zmm0,%zmm8,%zmm22
- vpmadd52huq %zmm0,%zmm8,%zmm23
-
- vpmadd52luq %zmm1,%zmm10,%zmm18
- vpmadd52huq %zmm1,%zmm10,%zmm19
- vpmadd52luq %zmm1,%zmm6,%zmm20
- vpmadd52huq %zmm1,%zmm6,%zmm21
- vpmadd52luq %zmm1,%zmm7,%zmm22
- vpmadd52huq %zmm1,%zmm7,%zmm23
-
-
-
-
- movl $1,%eax
- kmovw %eax,%k1
- vpsrldq $8,%zmm18,%zmm24
- vpsrldq $8,%zmm19,%zmm0
- vpsrldq $8,%zmm20,%zmm25
- vpsrldq $8,%zmm21,%zmm1
- vpaddq %zmm24,%zmm18,%zmm18
- vpaddq %zmm0,%zmm19,%zmm19
- vpsrldq $8,%zmm22,%zmm26
- vpsrldq $8,%zmm23,%zmm2
- vpaddq %zmm25,%zmm20,%zmm20
- vpaddq %zmm1,%zmm21,%zmm21
- vpermq $0x2,%zmm18,%zmm24
- vpermq $0x2,%zmm19,%zmm0
- vpaddq %zmm26,%zmm22,%zmm22
- vpaddq %zmm2,%zmm23,%zmm23
-
- vpermq $0x2,%zmm20,%zmm25
- vpermq $0x2,%zmm21,%zmm1
- vpaddq %zmm24,%zmm18,%zmm18
- vpaddq %zmm0,%zmm19,%zmm19
- vpermq $0x2,%zmm22,%zmm26
- vpermq $0x2,%zmm23,%zmm2
- vpaddq %zmm25,%zmm20,%zmm20
- vpaddq %zmm1,%zmm21,%zmm21
- vextracti64x4 $1,%zmm18,%ymm24
- vextracti64x4 $1,%zmm19,%ymm0
- vpaddq %zmm26,%zmm22,%zmm22
- vpaddq %zmm2,%zmm23,%zmm23
-
- vextracti64x4 $1,%zmm20,%ymm25
- vextracti64x4 $1,%zmm21,%ymm1
- vextracti64x4 $1,%zmm22,%ymm26
- vextracti64x4 $1,%zmm23,%ymm2
- vpaddq %ymm24,%ymm18,%ymm18{%k1}{z}
- vpaddq %ymm0,%ymm19,%ymm19{%k1}{z}
- vpaddq %ymm25,%ymm20,%ymm20{%k1}{z}
- vpaddq %ymm1,%ymm21,%ymm21{%k1}{z}
- vpaddq %ymm26,%ymm22,%ymm22{%k1}{z}
- vpaddq %ymm2,%ymm23,%ymm23{%k1}{z}
-
-
-
- vpsrlq $44,%ymm18,%ymm30
- vpsllq $8,%ymm19,%ymm19
- vpandq %ymm28,%ymm18,%ymm0
- vpaddq %ymm30,%ymm19,%ymm19
-
- vpaddq %ymm19,%ymm20,%ymm20
-
- vpsrlq $44,%ymm20,%ymm30
- vpsllq $8,%ymm21,%ymm21
- vpandq %ymm28,%ymm20,%ymm1
- vpaddq %ymm30,%ymm21,%ymm21
-
- vpaddq %ymm21,%ymm22,%ymm22
-
- vpsrlq $42,%ymm22,%ymm30
- vpsllq $10,%ymm23,%ymm23
- vpandq %ymm29,%ymm22,%ymm2
- vpaddq %ymm30,%ymm23,%ymm23
-
- vpaddq %ymm23,%ymm0,%ymm0
- vpsllq $2,%ymm23,%ymm23
-
- vpaddq %ymm23,%ymm0,%ymm0
-
- vpsrlq $44,%ymm0,%ymm30
- vpandq %ymm28,%ymm0,%ymm0
-
- vpaddq %ymm30,%ymm1,%ymm1
-
-
-
- vmovq %xmm0,0(%rdi)
- vmovq %xmm1,8(%rdi)
- vmovq %xmm2,16(%rdi)
- vzeroall
-
-.Lno_data_vpmadd52_8x:
- .byte 0xf3,0xc3
-.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
-.type poly1305_emit_base2_44,@function
-.align 32
-poly1305_emit_base2_44:
- movq 0(%rdi),%r8
- movq 8(%rdi),%r9
- movq 16(%rdi),%r10
-
- movq %r9,%rax
- shrq $20,%r9
- shlq $44,%rax
- movq %r10,%rcx
- shrq $40,%r10
- shlq $24,%rcx
-
- addq %rax,%r8
- adcq %rcx,%r9
- adcq $0,%r10
-
- movq %r8,%rax
- addq $5,%r8
- movq %r9,%rcx
- adcq $0,%r9
- adcq $0,%r10
- shrq $2,%r10
- cmovnzq %r8,%rax
- cmovnzq %r9,%rcx
-
- addq 0(%rdx),%rax
- adcq 8(%rdx),%rcx
- movq %rax,0(%rsi)
- movq %rcx,8(%rsi)
-
- .byte 0xf3,0xc3
-.size poly1305_emit_base2_44,.-poly1305_emit_base2_44
-.align 64
-.Lconst:
-.Lmask24:
-.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
-.L129:
-.long 16777216,0,16777216,0,16777216,0,16777216,0
-.Lmask26:
-.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
-.Lpermd_avx2:
-.long 2,2,2,3,2,0,2,1
-.Lpermd_avx512:
-.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
+ leaq -8(%r10),%rsp
-.L2_44_inp_permd:
-.long 0,1,1,2,2,3,7,7
-.L2_44_inp_shift:
-.quad 0,12,24,64
-.L2_44_mask:
-.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
-.L2_44_shift_rgt:
-.quad 44,44,42,64
-.L2_44_shift_lft:
-.quad 8,8,10,64
+ ret
-.align 64
-.Lx_mask44:
-.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
-.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
-.Lx_mask42:
-.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
-.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
-.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
-.align 16
-.globl xor128_encrypt_n_pad
-.type xor128_encrypt_n_pad,@function
-.align 16
-xor128_encrypt_n_pad:
- subq %rdx,%rsi
- subq %rdx,%rdi
- movq %rcx,%r10
- shrq $4,%rcx
- jz .Ltail_enc
- nop
-.Loop_enc_xmm:
- movdqu (%rsi,%rdx,1),%xmm0
- pxor (%rdx),%xmm0
- movdqu %xmm0,(%rdi,%rdx,1)
- movdqa %xmm0,(%rdx)
- leaq 16(%rdx),%rdx
- decq %rcx
- jnz .Loop_enc_xmm
-
- andq $15,%r10
- jz .Ldone_enc
-
-.Ltail_enc:
- movq $16,%rcx
- subq %r10,%rcx
- xorl %eax,%eax
-.Loop_enc_byte:
- movb (%rsi,%rdx,1),%al
- xorb (%rdx),%al
- movb %al,(%rdi,%rdx,1)
- movb %al,(%rdx)
- leaq 1(%rdx),%rdx
- decq %r10
- jnz .Loop_enc_byte
-
- xorl %eax,%eax
-.Loop_enc_pad:
- movb %al,(%rdx)
- leaq 1(%rdx),%rdx
- decq %rcx
- jnz .Loop_enc_pad
-
-.Ldone_enc:
- movq %rdx,%rax
- .byte 0xf3,0xc3
-.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
-
-.globl xor128_decrypt_n_pad
-.type xor128_decrypt_n_pad,@function
-.align 16
-xor128_decrypt_n_pad:
- subq %rdx,%rsi
- subq %rdx,%rdi
- movq %rcx,%r10
- shrq $4,%rcx
- jz .Ltail_dec
- nop
-.Loop_dec_xmm:
- movdqu (%rsi,%rdx,1),%xmm0
- movdqa (%rdx),%xmm1
- pxor %xmm0,%xmm1
- movdqu %xmm1,(%rdi,%rdx,1)
- movdqa %xmm0,(%rdx)
- leaq 16(%rdx),%rdx
- decq %rcx
- jnz .Loop_dec_xmm
-
- pxor %xmm1,%xmm1
- andq $15,%r10
- jz .Ldone_dec
-
-.Ltail_dec:
- movq $16,%rcx
- subq %r10,%rcx
- xorl %eax,%eax
- xorq %r11,%r11
-.Loop_dec_byte:
- movb (%rsi,%rdx,1),%r11b
- movb (%rdx),%al
- xorb %r11b,%al
- movb %al,(%rdi,%rdx,1)
- movb %r11b,(%rdx)
- leaq 1(%rdx),%rdx
- decq %r10
- jnz .Loop_dec_byte
-
- xorl %eax,%eax
-.Loop_dec_pad:
- movb %al,(%rdx)
- leaq 1(%rdx),%rdx
- decq %rcx
- jnz .Loop_dec_pad
-
-.Ldone_dec:
- movq %rdx,%rax
- .byte 0xf3,0xc3
-.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
+ENDPROC(poly1305_blocks_avx512)
+#endif /* CONFIG_AS_AVX512 */
@@ -16,6 +16,9 @@
#include <linux/module.h>
#include <linux/init.h>
+#if defined(CONFIG_ZINC_ARCH_X86_64)
+#include "poly1305-x86_64-glue.c"
+#else
static inline bool poly1305_init_arch(void *ctx,
const u8 key[POLY1305_KEY_SIZE])
{
@@ -37,6 +40,7 @@ static bool *const poly1305_nobs[] __initconst = { };
static void __init poly1305_fpu_init(void)
{
}
+#endif
#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__)
#include "poly1305-donna64.c"