diff mbox series

[net-next,v7,12/28] zinc: import Andy Polyakov's Poly1305 x86_64 implementation

Message ID 20181006025709.4019-13-Jason@zx2c4.com (mailing list archive)
State New, archived
Headers show
Series None | expand

Commit Message

Jason A. Donenfeld Oct. 6, 2018, 2:56 a.m. UTC
These x86_64 vectorized implementations come from Andy Polyakov's
implementation, and are included here in raw form without modification,
so that subsequent commits that fix these up for the kernel can see how
it has changed.

While this is CRYPTOGAMS code, the originating code for this happens to
be the same as OpenSSL's commit 4dfe4310c31c4483705991d9a798ce9be1ed1c68

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Based-on-code-from: Andy Polyakov <appro@openssl.org>
Cc: Andy Polyakov <appro@openssl.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: x86@kernel.org
Cc: Samuel Neves <sneves@dei.uc.pt>
Cc: Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: kernel-hardening@lists.openwall.com
Cc: linux-crypto@vger.kernel.org
---
 .../poly1305/poly1305-x86_64-cryptogams.S     | 3565 +++++++++++++++++
 1 file changed, 3565 insertions(+)
 create mode 100644 lib/zinc/poly1305/poly1305-x86_64-cryptogams.S
diff mbox series

Patch

diff --git a/lib/zinc/poly1305/poly1305-x86_64-cryptogams.S b/lib/zinc/poly1305/poly1305-x86_64-cryptogams.S
new file mode 100644
index 000000000000..ed634757354b
--- /dev/null
+++ b/lib/zinc/poly1305/poly1305-x86_64-cryptogams.S
@@ -0,0 +1,3565 @@ 
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/*
+ * Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+ */
+
+.text	
+
+
+
+.globl	poly1305_init
+.hidden	poly1305_init
+.globl	poly1305_blocks
+.hidden	poly1305_blocks
+.globl	poly1305_emit
+.hidden	poly1305_emit
+
+.type	poly1305_init,@function
+.align	32
+poly1305_init:
+	xorq	%rax,%rax
+	movq	%rax,0(%rdi)
+	movq	%rax,8(%rdi)
+	movq	%rax,16(%rdi)
+
+	cmpq	$0,%rsi
+	je	.Lno_key
+
+	leaq	poly1305_blocks(%rip),%r10
+	leaq	poly1305_emit(%rip),%r11
+	movq	OPENSSL_ia32cap_P+4(%rip),%r9
+	leaq	poly1305_blocks_avx(%rip),%rax
+	leaq	poly1305_emit_avx(%rip),%rcx
+	btq	$28,%r9
+	cmovcq	%rax,%r10
+	cmovcq	%rcx,%r11
+	leaq	poly1305_blocks_avx2(%rip),%rax
+	btq	$37,%r9
+	cmovcq	%rax,%r10
+	movq	$2149646336,%rax
+	shrq	$32,%r9
+	andq	%rax,%r9
+	cmpq	%rax,%r9
+	je	.Linit_base2_44
+	movq	$0x0ffffffc0fffffff,%rax
+	movq	$0x0ffffffc0ffffffc,%rcx
+	andq	0(%rsi),%rax
+	andq	8(%rsi),%rcx
+	movq	%rax,24(%rdi)
+	movq	%rcx,32(%rdi)
+	movq	%r10,0(%rdx)
+	movq	%r11,8(%rdx)
+	movl	$1,%eax
+.Lno_key:
+	.byte	0xf3,0xc3
+.size	poly1305_init,.-poly1305_init
+
+.type	poly1305_blocks,@function
+.align	32
+poly1305_blocks:
+.cfi_startproc	
+.Lblocks:
+	shrq	$4,%rdx
+	jz	.Lno_data
+
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+.Lblocks_body:
+
+	movq	%rdx,%r15
+
+	movq	24(%rdi),%r11
+	movq	32(%rdi),%r13
+
+	movq	0(%rdi),%r14
+	movq	8(%rdi),%rbx
+	movq	16(%rdi),%rbp
+
+	movq	%r13,%r12
+	shrq	$2,%r13
+	movq	%r12,%rax
+	addq	%r12,%r13
+	jmp	.Loop
+
+.align	32
+.Loop:
+	addq	0(%rsi),%r14
+	adcq	8(%rsi),%rbx
+	leaq	16(%rsi),%rsi
+	adcq	%rcx,%rbp
+	mulq	%r14
+	movq	%rax,%r9
+	movq	%r11,%rax
+	movq	%rdx,%r10
+
+	mulq	%r14
+	movq	%rax,%r14
+	movq	%r11,%rax
+	movq	%rdx,%r8
+
+	mulq	%rbx
+	addq	%rax,%r9
+	movq	%r13,%rax
+	adcq	%rdx,%r10
+
+	mulq	%rbx
+	movq	%rbp,%rbx
+	addq	%rax,%r14
+	adcq	%rdx,%r8
+
+	imulq	%r13,%rbx
+	addq	%rbx,%r9
+	movq	%r8,%rbx
+	adcq	$0,%r10
+
+	imulq	%r11,%rbp
+	addq	%r9,%rbx
+	movq	$-4,%rax
+	adcq	%rbp,%r10
+
+	andq	%r10,%rax
+	movq	%r10,%rbp
+	shrq	$2,%r10
+	andq	$3,%rbp
+	addq	%r10,%rax
+	addq	%rax,%r14
+	adcq	$0,%rbx
+	adcq	$0,%rbp
+	movq	%r12,%rax
+	decq	%r15
+	jnz	.Loop
+
+	movq	%r14,0(%rdi)
+	movq	%rbx,8(%rdi)
+	movq	%rbp,16(%rdi)
+
+	movq	0(%rsp),%r15
+.cfi_restore	%r15
+	movq	8(%rsp),%r14
+.cfi_restore	%r14
+	movq	16(%rsp),%r13
+.cfi_restore	%r13
+	movq	24(%rsp),%r12
+.cfi_restore	%r12
+	movq	32(%rsp),%rbp
+.cfi_restore	%rbp
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	leaq	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lno_data:
+.Lblocks_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	poly1305_blocks,.-poly1305_blocks
+
+.type	poly1305_emit,@function
+.align	32
+poly1305_emit:
+.Lemit:
+	movq	0(%rdi),%r8
+	movq	8(%rdi),%r9
+	movq	16(%rdi),%r10
+
+	movq	%r8,%rax
+	addq	$5,%r8
+	movq	%r9,%rcx
+	adcq	$0,%r9
+	adcq	$0,%r10
+	shrq	$2,%r10
+	cmovnzq	%r8,%rax
+	cmovnzq	%r9,%rcx
+
+	addq	0(%rdx),%rax
+	adcq	8(%rdx),%rcx
+	movq	%rax,0(%rsi)
+	movq	%rcx,8(%rsi)
+
+	.byte	0xf3,0xc3
+.size	poly1305_emit,.-poly1305_emit
+.type	__poly1305_block,@function
+.align	32
+__poly1305_block:
+	mulq	%r14
+	movq	%rax,%r9
+	movq	%r11,%rax
+	movq	%rdx,%r10
+
+	mulq	%r14
+	movq	%rax,%r14
+	movq	%r11,%rax
+	movq	%rdx,%r8
+
+	mulq	%rbx
+	addq	%rax,%r9
+	movq	%r13,%rax
+	adcq	%rdx,%r10
+
+	mulq	%rbx
+	movq	%rbp,%rbx
+	addq	%rax,%r14
+	adcq	%rdx,%r8
+
+	imulq	%r13,%rbx
+	addq	%rbx,%r9
+	movq	%r8,%rbx
+	adcq	$0,%r10
+
+	imulq	%r11,%rbp
+	addq	%r9,%rbx
+	movq	$-4,%rax
+	adcq	%rbp,%r10
+
+	andq	%r10,%rax
+	movq	%r10,%rbp
+	shrq	$2,%r10
+	andq	$3,%rbp
+	addq	%r10,%rax
+	addq	%rax,%r14
+	adcq	$0,%rbx
+	adcq	$0,%rbp
+	.byte	0xf3,0xc3
+.size	__poly1305_block,.-__poly1305_block
+
+.type	__poly1305_init_avx,@function
+.align	32
+__poly1305_init_avx:
+	movq	%r11,%r14
+	movq	%r12,%rbx
+	xorq	%rbp,%rbp
+
+	leaq	48+64(%rdi),%rdi
+
+	movq	%r12,%rax
+	call	__poly1305_block
+
+	movl	$0x3ffffff,%eax
+	movl	$0x3ffffff,%edx
+	movq	%r14,%r8
+	andl	%r14d,%eax
+	movq	%r11,%r9
+	andl	%r11d,%edx
+	movl	%eax,-64(%rdi)
+	shrq	$26,%r8
+	movl	%edx,-60(%rdi)
+	shrq	$26,%r9
+
+	movl	$0x3ffffff,%eax
+	movl	$0x3ffffff,%edx
+	andl	%r8d,%eax
+	andl	%r9d,%edx
+	movl	%eax,-48(%rdi)
+	leal	(%rax,%rax,4),%eax
+	movl	%edx,-44(%rdi)
+	leal	(%rdx,%rdx,4),%edx
+	movl	%eax,-32(%rdi)
+	shrq	$26,%r8
+	movl	%edx,-28(%rdi)
+	shrq	$26,%r9
+
+	movq	%rbx,%rax
+	movq	%r12,%rdx
+	shlq	$12,%rax
+	shlq	$12,%rdx
+	orq	%r8,%rax
+	orq	%r9,%rdx
+	andl	$0x3ffffff,%eax
+	andl	$0x3ffffff,%edx
+	movl	%eax,-16(%rdi)
+	leal	(%rax,%rax,4),%eax
+	movl	%edx,-12(%rdi)
+	leal	(%rdx,%rdx,4),%edx
+	movl	%eax,0(%rdi)
+	movq	%rbx,%r8
+	movl	%edx,4(%rdi)
+	movq	%r12,%r9
+
+	movl	$0x3ffffff,%eax
+	movl	$0x3ffffff,%edx
+	shrq	$14,%r8
+	shrq	$14,%r9
+	andl	%r8d,%eax
+	andl	%r9d,%edx
+	movl	%eax,16(%rdi)
+	leal	(%rax,%rax,4),%eax
+	movl	%edx,20(%rdi)
+	leal	(%rdx,%rdx,4),%edx
+	movl	%eax,32(%rdi)
+	shrq	$26,%r8
+	movl	%edx,36(%rdi)
+	shrq	$26,%r9
+
+	movq	%rbp,%rax
+	shlq	$24,%rax
+	orq	%rax,%r8
+	movl	%r8d,48(%rdi)
+	leaq	(%r8,%r8,4),%r8
+	movl	%r9d,52(%rdi)
+	leaq	(%r9,%r9,4),%r9
+	movl	%r8d,64(%rdi)
+	movl	%r9d,68(%rdi)
+
+	movq	%r12,%rax
+	call	__poly1305_block
+
+	movl	$0x3ffffff,%eax
+	movq	%r14,%r8
+	andl	%r14d,%eax
+	shrq	$26,%r8
+	movl	%eax,-52(%rdi)
+
+	movl	$0x3ffffff,%edx
+	andl	%r8d,%edx
+	movl	%edx,-36(%rdi)
+	leal	(%rdx,%rdx,4),%edx
+	shrq	$26,%r8
+	movl	%edx,-20(%rdi)
+
+	movq	%rbx,%rax
+	shlq	$12,%rax
+	orq	%r8,%rax
+	andl	$0x3ffffff,%eax
+	movl	%eax,-4(%rdi)
+	leal	(%rax,%rax,4),%eax
+	movq	%rbx,%r8
+	movl	%eax,12(%rdi)
+
+	movl	$0x3ffffff,%edx
+	shrq	$14,%r8
+	andl	%r8d,%edx
+	movl	%edx,28(%rdi)
+	leal	(%rdx,%rdx,4),%edx
+	shrq	$26,%r8
+	movl	%edx,44(%rdi)
+
+	movq	%rbp,%rax
+	shlq	$24,%rax
+	orq	%rax,%r8
+	movl	%r8d,60(%rdi)
+	leaq	(%r8,%r8,4),%r8
+	movl	%r8d,76(%rdi)
+
+	movq	%r12,%rax
+	call	__poly1305_block
+
+	movl	$0x3ffffff,%eax
+	movq	%r14,%r8
+	andl	%r14d,%eax
+	shrq	$26,%r8
+	movl	%eax,-56(%rdi)
+
+	movl	$0x3ffffff,%edx
+	andl	%r8d,%edx
+	movl	%edx,-40(%rdi)
+	leal	(%rdx,%rdx,4),%edx
+	shrq	$26,%r8
+	movl	%edx,-24(%rdi)
+
+	movq	%rbx,%rax
+	shlq	$12,%rax
+	orq	%r8,%rax
+	andl	$0x3ffffff,%eax
+	movl	%eax,-8(%rdi)
+	leal	(%rax,%rax,4),%eax
+	movq	%rbx,%r8
+	movl	%eax,8(%rdi)
+
+	movl	$0x3ffffff,%edx
+	shrq	$14,%r8
+	andl	%r8d,%edx
+	movl	%edx,24(%rdi)
+	leal	(%rdx,%rdx,4),%edx
+	shrq	$26,%r8
+	movl	%edx,40(%rdi)
+
+	movq	%rbp,%rax
+	shlq	$24,%rax
+	orq	%rax,%r8
+	movl	%r8d,56(%rdi)
+	leaq	(%r8,%r8,4),%r8
+	movl	%r8d,72(%rdi)
+
+	leaq	-48-64(%rdi),%rdi
+	.byte	0xf3,0xc3
+.size	__poly1305_init_avx,.-__poly1305_init_avx
+
+.type	poly1305_blocks_avx,@function
+.align	32
+poly1305_blocks_avx:
+.cfi_startproc	
+	movl	20(%rdi),%r8d
+	cmpq	$128,%rdx
+	jae	.Lblocks_avx
+	testl	%r8d,%r8d
+	jz	.Lblocks
+
+.Lblocks_avx:
+	andq	$-16,%rdx
+	jz	.Lno_data_avx
+
+	vzeroupper
+
+	testl	%r8d,%r8d
+	jz	.Lbase2_64_avx
+
+	testq	$31,%rdx
+	jz	.Leven_avx
+
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+.Lblocks_avx_body:
+
+	movq	%rdx,%r15
+
+	movq	0(%rdi),%r8
+	movq	8(%rdi),%r9
+	movl	16(%rdi),%ebp
+
+	movq	24(%rdi),%r11
+	movq	32(%rdi),%r13
+
+
+	movl	%r8d,%r14d
+	andq	$-2147483648,%r8
+	movq	%r9,%r12
+	movl	%r9d,%ebx
+	andq	$-2147483648,%r9
+
+	shrq	$6,%r8
+	shlq	$52,%r12
+	addq	%r8,%r14
+	shrq	$12,%rbx
+	shrq	$18,%r9
+	addq	%r12,%r14
+	adcq	%r9,%rbx
+
+	movq	%rbp,%r8
+	shlq	$40,%r8
+	shrq	$24,%rbp
+	addq	%r8,%rbx
+	adcq	$0,%rbp
+
+	movq	$-4,%r9
+	movq	%rbp,%r8
+	andq	%rbp,%r9
+	shrq	$2,%r8
+	andq	$3,%rbp
+	addq	%r9,%r8
+	addq	%r8,%r14
+	adcq	$0,%rbx
+	adcq	$0,%rbp
+
+	movq	%r13,%r12
+	movq	%r13,%rax
+	shrq	$2,%r13
+	addq	%r12,%r13
+
+	addq	0(%rsi),%r14
+	adcq	8(%rsi),%rbx
+	leaq	16(%rsi),%rsi
+	adcq	%rcx,%rbp
+
+	call	__poly1305_block
+
+	testq	%rcx,%rcx
+	jz	.Lstore_base2_64_avx
+
+
+	movq	%r14,%rax
+	movq	%r14,%rdx
+	shrq	$52,%r14
+	movq	%rbx,%r11
+	movq	%rbx,%r12
+	shrq	$26,%rdx
+	andq	$0x3ffffff,%rax
+	shlq	$12,%r11
+	andq	$0x3ffffff,%rdx
+	shrq	$14,%rbx
+	orq	%r11,%r14
+	shlq	$24,%rbp
+	andq	$0x3ffffff,%r14
+	shrq	$40,%r12
+	andq	$0x3ffffff,%rbx
+	orq	%r12,%rbp
+
+	subq	$16,%r15
+	jz	.Lstore_base2_26_avx
+
+	vmovd	%eax,%xmm0
+	vmovd	%edx,%xmm1
+	vmovd	%r14d,%xmm2
+	vmovd	%ebx,%xmm3
+	vmovd	%ebp,%xmm4
+	jmp	.Lproceed_avx
+
+.align	32
+.Lstore_base2_64_avx:
+	movq	%r14,0(%rdi)
+	movq	%rbx,8(%rdi)
+	movq	%rbp,16(%rdi)
+	jmp	.Ldone_avx
+
+.align	16
+.Lstore_base2_26_avx:
+	movl	%eax,0(%rdi)
+	movl	%edx,4(%rdi)
+	movl	%r14d,8(%rdi)
+	movl	%ebx,12(%rdi)
+	movl	%ebp,16(%rdi)
+.align	16
+.Ldone_avx:
+	movq	0(%rsp),%r15
+.cfi_restore	%r15
+	movq	8(%rsp),%r14
+.cfi_restore	%r14
+	movq	16(%rsp),%r13
+.cfi_restore	%r13
+	movq	24(%rsp),%r12
+.cfi_restore	%r12
+	movq	32(%rsp),%rbp
+.cfi_restore	%rbp
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	leaq	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lno_data_avx:
+.Lblocks_avx_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+
+.align	32
+.Lbase2_64_avx:
+.cfi_startproc	
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+.Lbase2_64_avx_body:
+
+	movq	%rdx,%r15
+
+	movq	24(%rdi),%r11
+	movq	32(%rdi),%r13
+
+	movq	0(%rdi),%r14
+	movq	8(%rdi),%rbx
+	movl	16(%rdi),%ebp
+
+	movq	%r13,%r12
+	movq	%r13,%rax
+	shrq	$2,%r13
+	addq	%r12,%r13
+
+	testq	$31,%rdx
+	jz	.Linit_avx
+
+	addq	0(%rsi),%r14
+	adcq	8(%rsi),%rbx
+	leaq	16(%rsi),%rsi
+	adcq	%rcx,%rbp
+	subq	$16,%r15
+
+	call	__poly1305_block
+
+.Linit_avx:
+
+	movq	%r14,%rax
+	movq	%r14,%rdx
+	shrq	$52,%r14
+	movq	%rbx,%r8
+	movq	%rbx,%r9
+	shrq	$26,%rdx
+	andq	$0x3ffffff,%rax
+	shlq	$12,%r8
+	andq	$0x3ffffff,%rdx
+	shrq	$14,%rbx
+	orq	%r8,%r14
+	shlq	$24,%rbp
+	andq	$0x3ffffff,%r14
+	shrq	$40,%r9
+	andq	$0x3ffffff,%rbx
+	orq	%r9,%rbp
+
+	vmovd	%eax,%xmm0
+	vmovd	%edx,%xmm1
+	vmovd	%r14d,%xmm2
+	vmovd	%ebx,%xmm3
+	vmovd	%ebp,%xmm4
+	movl	$1,20(%rdi)
+
+	call	__poly1305_init_avx
+
+.Lproceed_avx:
+	movq	%r15,%rdx
+
+	movq	0(%rsp),%r15
+.cfi_restore	%r15
+	movq	8(%rsp),%r14
+.cfi_restore	%r14
+	movq	16(%rsp),%r13
+.cfi_restore	%r13
+	movq	24(%rsp),%r12
+.cfi_restore	%r12
+	movq	32(%rsp),%rbp
+.cfi_restore	%rbp
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	leaq	48(%rsp),%rax
+	leaq	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lbase2_64_avx_epilogue:
+	jmp	.Ldo_avx
+.cfi_endproc	
+
+.align	32
+.Leven_avx:
+.cfi_startproc	
+	vmovd	0(%rdi),%xmm0
+	vmovd	4(%rdi),%xmm1
+	vmovd	8(%rdi),%xmm2
+	vmovd	12(%rdi),%xmm3
+	vmovd	16(%rdi),%xmm4
+
+.Ldo_avx:
+	leaq	-88(%rsp),%r11
+.cfi_def_cfa	%r11,0x60
+	subq	$0x178,%rsp
+	subq	$64,%rdx
+	leaq	-32(%rsi),%rax
+	cmovcq	%rax,%rsi
+
+	vmovdqu	48(%rdi),%xmm14
+	leaq	112(%rdi),%rdi
+	leaq	.Lconst(%rip),%rcx
+
+
+
+	vmovdqu	32(%rsi),%xmm5
+	vmovdqu	48(%rsi),%xmm6
+	vmovdqa	64(%rcx),%xmm15
+
+	vpsrldq	$6,%xmm5,%xmm7
+	vpsrldq	$6,%xmm6,%xmm8
+	vpunpckhqdq	%xmm6,%xmm5,%xmm9
+	vpunpcklqdq	%xmm6,%xmm5,%xmm5
+	vpunpcklqdq	%xmm8,%xmm7,%xmm8
+
+	vpsrlq	$40,%xmm9,%xmm9
+	vpsrlq	$26,%xmm5,%xmm6
+	vpand	%xmm15,%xmm5,%xmm5
+	vpsrlq	$4,%xmm8,%xmm7
+	vpand	%xmm15,%xmm6,%xmm6
+	vpsrlq	$30,%xmm8,%xmm8
+	vpand	%xmm15,%xmm7,%xmm7
+	vpand	%xmm15,%xmm8,%xmm8
+	vpor	32(%rcx),%xmm9,%xmm9
+
+	jbe	.Lskip_loop_avx
+
+
+	vmovdqu	-48(%rdi),%xmm11
+	vmovdqu	-32(%rdi),%xmm12
+	vpshufd	$0xEE,%xmm14,%xmm13
+	vpshufd	$0x44,%xmm14,%xmm10
+	vmovdqa	%xmm13,-144(%r11)
+	vmovdqa	%xmm10,0(%rsp)
+	vpshufd	$0xEE,%xmm11,%xmm14
+	vmovdqu	-16(%rdi),%xmm10
+	vpshufd	$0x44,%xmm11,%xmm11
+	vmovdqa	%xmm14,-128(%r11)
+	vmovdqa	%xmm11,16(%rsp)
+	vpshufd	$0xEE,%xmm12,%xmm13
+	vmovdqu	0(%rdi),%xmm11
+	vpshufd	$0x44,%xmm12,%xmm12
+	vmovdqa	%xmm13,-112(%r11)
+	vmovdqa	%xmm12,32(%rsp)
+	vpshufd	$0xEE,%xmm10,%xmm14
+	vmovdqu	16(%rdi),%xmm12
+	vpshufd	$0x44,%xmm10,%xmm10
+	vmovdqa	%xmm14,-96(%r11)
+	vmovdqa	%xmm10,48(%rsp)
+	vpshufd	$0xEE,%xmm11,%xmm13
+	vmovdqu	32(%rdi),%xmm10
+	vpshufd	$0x44,%xmm11,%xmm11
+	vmovdqa	%xmm13,-80(%r11)
+	vmovdqa	%xmm11,64(%rsp)
+	vpshufd	$0xEE,%xmm12,%xmm14
+	vmovdqu	48(%rdi),%xmm11
+	vpshufd	$0x44,%xmm12,%xmm12
+	vmovdqa	%xmm14,-64(%r11)
+	vmovdqa	%xmm12,80(%rsp)
+	vpshufd	$0xEE,%xmm10,%xmm13
+	vmovdqu	64(%rdi),%xmm12
+	vpshufd	$0x44,%xmm10,%xmm10
+	vmovdqa	%xmm13,-48(%r11)
+	vmovdqa	%xmm10,96(%rsp)
+	vpshufd	$0xEE,%xmm11,%xmm14
+	vpshufd	$0x44,%xmm11,%xmm11
+	vmovdqa	%xmm14,-32(%r11)
+	vmovdqa	%xmm11,112(%rsp)
+	vpshufd	$0xEE,%xmm12,%xmm13
+	vmovdqa	0(%rsp),%xmm14
+	vpshufd	$0x44,%xmm12,%xmm12
+	vmovdqa	%xmm13,-16(%r11)
+	vmovdqa	%xmm12,128(%rsp)
+
+	jmp	.Loop_avx
+
+.align	32
+.Loop_avx:
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	vpmuludq	%xmm5,%xmm14,%xmm10
+	vpmuludq	%xmm6,%xmm14,%xmm11
+	vmovdqa	%xmm2,32(%r11)
+	vpmuludq	%xmm7,%xmm14,%xmm12
+	vmovdqa	16(%rsp),%xmm2
+	vpmuludq	%xmm8,%xmm14,%xmm13
+	vpmuludq	%xmm9,%xmm14,%xmm14
+
+	vmovdqa	%xmm0,0(%r11)
+	vpmuludq	32(%rsp),%xmm9,%xmm0
+	vmovdqa	%xmm1,16(%r11)
+	vpmuludq	%xmm8,%xmm2,%xmm1
+	vpaddq	%xmm0,%xmm10,%xmm10
+	vpaddq	%xmm1,%xmm14,%xmm14
+	vmovdqa	%xmm3,48(%r11)
+	vpmuludq	%xmm7,%xmm2,%xmm0
+	vpmuludq	%xmm6,%xmm2,%xmm1
+	vpaddq	%xmm0,%xmm13,%xmm13
+	vmovdqa	48(%rsp),%xmm3
+	vpaddq	%xmm1,%xmm12,%xmm12
+	vmovdqa	%xmm4,64(%r11)
+	vpmuludq	%xmm5,%xmm2,%xmm2
+	vpmuludq	%xmm7,%xmm3,%xmm0
+	vpaddq	%xmm2,%xmm11,%xmm11
+
+	vmovdqa	64(%rsp),%xmm4
+	vpaddq	%xmm0,%xmm14,%xmm14
+	vpmuludq	%xmm6,%xmm3,%xmm1
+	vpmuludq	%xmm5,%xmm3,%xmm3
+	vpaddq	%xmm1,%xmm13,%xmm13
+	vmovdqa	80(%rsp),%xmm2
+	vpaddq	%xmm3,%xmm12,%xmm12
+	vpmuludq	%xmm9,%xmm4,%xmm0
+	vpmuludq	%xmm8,%xmm4,%xmm4
+	vpaddq	%xmm0,%xmm11,%xmm11
+	vmovdqa	96(%rsp),%xmm3
+	vpaddq	%xmm4,%xmm10,%xmm10
+
+	vmovdqa	128(%rsp),%xmm4
+	vpmuludq	%xmm6,%xmm2,%xmm1
+	vpmuludq	%xmm5,%xmm2,%xmm2
+	vpaddq	%xmm1,%xmm14,%xmm14
+	vpaddq	%xmm2,%xmm13,%xmm13
+	vpmuludq	%xmm9,%xmm3,%xmm0
+	vpmuludq	%xmm8,%xmm3,%xmm1
+	vpaddq	%xmm0,%xmm12,%xmm12
+	vmovdqu	0(%rsi),%xmm0
+	vpaddq	%xmm1,%xmm11,%xmm11
+	vpmuludq	%xmm7,%xmm3,%xmm3
+	vpmuludq	%xmm7,%xmm4,%xmm7
+	vpaddq	%xmm3,%xmm10,%xmm10
+
+	vmovdqu	16(%rsi),%xmm1
+	vpaddq	%xmm7,%xmm11,%xmm11
+	vpmuludq	%xmm8,%xmm4,%xmm8
+	vpmuludq	%xmm9,%xmm4,%xmm9
+	vpsrldq	$6,%xmm0,%xmm2
+	vpaddq	%xmm8,%xmm12,%xmm12
+	vpaddq	%xmm9,%xmm13,%xmm13
+	vpsrldq	$6,%xmm1,%xmm3
+	vpmuludq	112(%rsp),%xmm5,%xmm9
+	vpmuludq	%xmm6,%xmm4,%xmm5
+	vpunpckhqdq	%xmm1,%xmm0,%xmm4
+	vpaddq	%xmm9,%xmm14,%xmm14
+	vmovdqa	-144(%r11),%xmm9
+	vpaddq	%xmm5,%xmm10,%xmm10
+
+	vpunpcklqdq	%xmm1,%xmm0,%xmm0
+	vpunpcklqdq	%xmm3,%xmm2,%xmm3
+
+
+	vpsrldq	$5,%xmm4,%xmm4
+	vpsrlq	$26,%xmm0,%xmm1
+	vpand	%xmm15,%xmm0,%xmm0
+	vpsrlq	$4,%xmm3,%xmm2
+	vpand	%xmm15,%xmm1,%xmm1
+	vpand	0(%rcx),%xmm4,%xmm4
+	vpsrlq	$30,%xmm3,%xmm3
+	vpand	%xmm15,%xmm2,%xmm2
+	vpand	%xmm15,%xmm3,%xmm3
+	vpor	32(%rcx),%xmm4,%xmm4
+
+	vpaddq	0(%r11),%xmm0,%xmm0
+	vpaddq	16(%r11),%xmm1,%xmm1
+	vpaddq	32(%r11),%xmm2,%xmm2
+	vpaddq	48(%r11),%xmm3,%xmm3
+	vpaddq	64(%r11),%xmm4,%xmm4
+
+	leaq	32(%rsi),%rax
+	leaq	64(%rsi),%rsi
+	subq	$64,%rdx
+	cmovcq	%rax,%rsi
+
+
+
+
+
+
+
+
+
+
+	vpmuludq	%xmm0,%xmm9,%xmm5
+	vpmuludq	%xmm1,%xmm9,%xmm6
+	vpaddq	%xmm5,%xmm10,%xmm10
+	vpaddq	%xmm6,%xmm11,%xmm11
+	vmovdqa	-128(%r11),%xmm7
+	vpmuludq	%xmm2,%xmm9,%xmm5
+	vpmuludq	%xmm3,%xmm9,%xmm6
+	vpaddq	%xmm5,%xmm12,%xmm12
+	vpaddq	%xmm6,%xmm13,%xmm13
+	vpmuludq	%xmm4,%xmm9,%xmm9
+	vpmuludq	-112(%r11),%xmm4,%xmm5
+	vpaddq	%xmm9,%xmm14,%xmm14
+
+	vpaddq	%xmm5,%xmm10,%xmm10
+	vpmuludq	%xmm2,%xmm7,%xmm6
+	vpmuludq	%xmm3,%xmm7,%xmm5
+	vpaddq	%xmm6,%xmm13,%xmm13
+	vmovdqa	-96(%r11),%xmm8
+	vpaddq	%xmm5,%xmm14,%xmm14
+	vpmuludq	%xmm1,%xmm7,%xmm6
+	vpmuludq	%xmm0,%xmm7,%xmm7
+	vpaddq	%xmm6,%xmm12,%xmm12
+	vpaddq	%xmm7,%xmm11,%xmm11
+
+	vmovdqa	-80(%r11),%xmm9
+	vpmuludq	%xmm2,%xmm8,%xmm5
+	vpmuludq	%xmm1,%xmm8,%xmm6
+	vpaddq	%xmm5,%xmm14,%xmm14
+	vpaddq	%xmm6,%xmm13,%xmm13
+	vmovdqa	-64(%r11),%xmm7
+	vpmuludq	%xmm0,%xmm8,%xmm8
+	vpmuludq	%xmm4,%xmm9,%xmm5
+	vpaddq	%xmm8,%xmm12,%xmm12
+	vpaddq	%xmm5,%xmm11,%xmm11
+	vmovdqa	-48(%r11),%xmm8
+	vpmuludq	%xmm3,%xmm9,%xmm9
+	vpmuludq	%xmm1,%xmm7,%xmm6
+	vpaddq	%xmm9,%xmm10,%xmm10
+
+	vmovdqa	-16(%r11),%xmm9
+	vpaddq	%xmm6,%xmm14,%xmm14
+	vpmuludq	%xmm0,%xmm7,%xmm7
+	vpmuludq	%xmm4,%xmm8,%xmm5
+	vpaddq	%xmm7,%xmm13,%xmm13
+	vpaddq	%xmm5,%xmm12,%xmm12
+	vmovdqu	32(%rsi),%xmm5
+	vpmuludq	%xmm3,%xmm8,%xmm7
+	vpmuludq	%xmm2,%xmm8,%xmm8
+	vpaddq	%xmm7,%xmm11,%xmm11
+	vmovdqu	48(%rsi),%xmm6
+	vpaddq	%xmm8,%xmm10,%xmm10
+
+	vpmuludq	%xmm2,%xmm9,%xmm2
+	vpmuludq	%xmm3,%xmm9,%xmm3
+	vpsrldq	$6,%xmm5,%xmm7
+	vpaddq	%xmm2,%xmm11,%xmm11
+	vpmuludq	%xmm4,%xmm9,%xmm4
+	vpsrldq	$6,%xmm6,%xmm8
+	vpaddq	%xmm3,%xmm12,%xmm2
+	vpaddq	%xmm4,%xmm13,%xmm3
+	vpmuludq	-32(%r11),%xmm0,%xmm4
+	vpmuludq	%xmm1,%xmm9,%xmm0
+	vpunpckhqdq	%xmm6,%xmm5,%xmm9
+	vpaddq	%xmm4,%xmm14,%xmm4
+	vpaddq	%xmm0,%xmm10,%xmm0
+
+	vpunpcklqdq	%xmm6,%xmm5,%xmm5
+	vpunpcklqdq	%xmm8,%xmm7,%xmm8
+
+
+	vpsrldq	$5,%xmm9,%xmm9
+	vpsrlq	$26,%xmm5,%xmm6
+	vmovdqa	0(%rsp),%xmm14
+	vpand	%xmm15,%xmm5,%xmm5
+	vpsrlq	$4,%xmm8,%xmm7
+	vpand	%xmm15,%xmm6,%xmm6
+	vpand	0(%rcx),%xmm9,%xmm9
+	vpsrlq	$30,%xmm8,%xmm8
+	vpand	%xmm15,%xmm7,%xmm7
+	vpand	%xmm15,%xmm8,%xmm8
+	vpor	32(%rcx),%xmm9,%xmm9
+
+
+
+
+
+	vpsrlq	$26,%xmm3,%xmm13
+	vpand	%xmm15,%xmm3,%xmm3
+	vpaddq	%xmm13,%xmm4,%xmm4
+
+	vpsrlq	$26,%xmm0,%xmm10
+	vpand	%xmm15,%xmm0,%xmm0
+	vpaddq	%xmm10,%xmm11,%xmm1
+
+	vpsrlq	$26,%xmm4,%xmm10
+	vpand	%xmm15,%xmm4,%xmm4
+
+	vpsrlq	$26,%xmm1,%xmm11
+	vpand	%xmm15,%xmm1,%xmm1
+	vpaddq	%xmm11,%xmm2,%xmm2
+
+	vpaddq	%xmm10,%xmm0,%xmm0
+	vpsllq	$2,%xmm10,%xmm10
+	vpaddq	%xmm10,%xmm0,%xmm0
+
+	vpsrlq	$26,%xmm2,%xmm12
+	vpand	%xmm15,%xmm2,%xmm2
+	vpaddq	%xmm12,%xmm3,%xmm3
+
+	vpsrlq	$26,%xmm0,%xmm10
+	vpand	%xmm15,%xmm0,%xmm0
+	vpaddq	%xmm10,%xmm1,%xmm1
+
+	vpsrlq	$26,%xmm3,%xmm13
+	vpand	%xmm15,%xmm3,%xmm3
+	vpaddq	%xmm13,%xmm4,%xmm4
+
+	ja	.Loop_avx
+
+.Lskip_loop_avx:
+
+
+
+	vpshufd	$0x10,%xmm14,%xmm14
+	addq	$32,%rdx
+	jnz	.Long_tail_avx
+
+	vpaddq	%xmm2,%xmm7,%xmm7
+	vpaddq	%xmm0,%xmm5,%xmm5
+	vpaddq	%xmm1,%xmm6,%xmm6
+	vpaddq	%xmm3,%xmm8,%xmm8
+	vpaddq	%xmm4,%xmm9,%xmm9
+
+.Long_tail_avx:
+	vmovdqa	%xmm2,32(%r11)
+	vmovdqa	%xmm0,0(%r11)
+	vmovdqa	%xmm1,16(%r11)
+	vmovdqa	%xmm3,48(%r11)
+	vmovdqa	%xmm4,64(%r11)
+
+
+
+
+
+
+
+	vpmuludq	%xmm7,%xmm14,%xmm12
+	vpmuludq	%xmm5,%xmm14,%xmm10
+	vpshufd	$0x10,-48(%rdi),%xmm2
+	vpmuludq	%xmm6,%xmm14,%xmm11
+	vpmuludq	%xmm8,%xmm14,%xmm13
+	vpmuludq	%xmm9,%xmm14,%xmm14
+
+	vpmuludq	%xmm8,%xmm2,%xmm0
+	vpaddq	%xmm0,%xmm14,%xmm14
+	vpshufd	$0x10,-32(%rdi),%xmm3
+	vpmuludq	%xmm7,%xmm2,%xmm1
+	vpaddq	%xmm1,%xmm13,%xmm13
+	vpshufd	$0x10,-16(%rdi),%xmm4
+	vpmuludq	%xmm6,%xmm2,%xmm0
+	vpaddq	%xmm0,%xmm12,%xmm12
+	vpmuludq	%xmm5,%xmm2,%xmm2
+	vpaddq	%xmm2,%xmm11,%xmm11
+	vpmuludq	%xmm9,%xmm3,%xmm3
+	vpaddq	%xmm3,%xmm10,%xmm10
+
+	vpshufd	$0x10,0(%rdi),%xmm2
+	vpmuludq	%xmm7,%xmm4,%xmm1
+	vpaddq	%xmm1,%xmm14,%xmm14
+	vpmuludq	%xmm6,%xmm4,%xmm0
+	vpaddq	%xmm0,%xmm13,%xmm13
+	vpshufd	$0x10,16(%rdi),%xmm3
+	vpmuludq	%xmm5,%xmm4,%xmm4
+	vpaddq	%xmm4,%xmm12,%xmm12
+	vpmuludq	%xmm9,%xmm2,%xmm1
+	vpaddq	%xmm1,%xmm11,%xmm11
+	vpshufd	$0x10,32(%rdi),%xmm4
+	vpmuludq	%xmm8,%xmm2,%xmm2
+	vpaddq	%xmm2,%xmm10,%xmm10
+
+	vpmuludq	%xmm6,%xmm3,%xmm0
+	vpaddq	%xmm0,%xmm14,%xmm14
+	vpmuludq	%xmm5,%xmm3,%xmm3
+	vpaddq	%xmm3,%xmm13,%xmm13
+	vpshufd	$0x10,48(%rdi),%xmm2
+	vpmuludq	%xmm9,%xmm4,%xmm1
+	vpaddq	%xmm1,%xmm12,%xmm12
+	vpshufd	$0x10,64(%rdi),%xmm3
+	vpmuludq	%xmm8,%xmm4,%xmm0
+	vpaddq	%xmm0,%xmm11,%xmm11
+	vpmuludq	%xmm7,%xmm4,%xmm4
+	vpaddq	%xmm4,%xmm10,%xmm10
+
+	vpmuludq	%xmm5,%xmm2,%xmm2
+	vpaddq	%xmm2,%xmm14,%xmm14
+	vpmuludq	%xmm9,%xmm3,%xmm1
+	vpaddq	%xmm1,%xmm13,%xmm13
+	vpmuludq	%xmm8,%xmm3,%xmm0
+	vpaddq	%xmm0,%xmm12,%xmm12
+	vpmuludq	%xmm7,%xmm3,%xmm1
+	vpaddq	%xmm1,%xmm11,%xmm11
+	vpmuludq	%xmm6,%xmm3,%xmm3
+	vpaddq	%xmm3,%xmm10,%xmm10
+
+	jz	.Lshort_tail_avx
+
+	vmovdqu	0(%rsi),%xmm0
+	vmovdqu	16(%rsi),%xmm1
+
+	vpsrldq	$6,%xmm0,%xmm2
+	vpsrldq	$6,%xmm1,%xmm3
+	vpunpckhqdq	%xmm1,%xmm0,%xmm4
+	vpunpcklqdq	%xmm1,%xmm0,%xmm0
+	vpunpcklqdq	%xmm3,%xmm2,%xmm3
+
+	vpsrlq	$40,%xmm4,%xmm4
+	vpsrlq	$26,%xmm0,%xmm1
+	vpand	%xmm15,%xmm0,%xmm0
+	vpsrlq	$4,%xmm3,%xmm2
+	vpand	%xmm15,%xmm1,%xmm1
+	vpsrlq	$30,%xmm3,%xmm3
+	vpand	%xmm15,%xmm2,%xmm2
+	vpand	%xmm15,%xmm3,%xmm3
+	vpor	32(%rcx),%xmm4,%xmm4
+
+	vpshufd	$0x32,-64(%rdi),%xmm9
+	vpaddq	0(%r11),%xmm0,%xmm0
+	vpaddq	16(%r11),%xmm1,%xmm1
+	vpaddq	32(%r11),%xmm2,%xmm2
+	vpaddq	48(%r11),%xmm3,%xmm3
+	vpaddq	64(%r11),%xmm4,%xmm4
+
+
+
+
+	vpmuludq	%xmm0,%xmm9,%xmm5
+	vpaddq	%xmm5,%xmm10,%xmm10
+	vpmuludq	%xmm1,%xmm9,%xmm6
+	vpaddq	%xmm6,%xmm11,%xmm11
+	vpmuludq	%xmm2,%xmm9,%xmm5
+	vpaddq	%xmm5,%xmm12,%xmm12
+	vpshufd	$0x32,-48(%rdi),%xmm7
+	vpmuludq	%xmm3,%xmm9,%xmm6
+	vpaddq	%xmm6,%xmm13,%xmm13
+	vpmuludq	%xmm4,%xmm9,%xmm9
+	vpaddq	%xmm9,%xmm14,%xmm14
+
+	vpmuludq	%xmm3,%xmm7,%xmm5
+	vpaddq	%xmm5,%xmm14,%xmm14
+	vpshufd	$0x32,-32(%rdi),%xmm8
+	vpmuludq	%xmm2,%xmm7,%xmm6
+	vpaddq	%xmm6,%xmm13,%xmm13
+	vpshufd	$0x32,-16(%rdi),%xmm9
+	vpmuludq	%xmm1,%xmm7,%xmm5
+	vpaddq	%xmm5,%xmm12,%xmm12
+	vpmuludq	%xmm0,%xmm7,%xmm7
+	vpaddq	%xmm7,%xmm11,%xmm11
+	vpmuludq	%xmm4,%xmm8,%xmm8
+	vpaddq	%xmm8,%xmm10,%xmm10
+
+	vpshufd	$0x32,0(%rdi),%xmm7
+	vpmuludq	%xmm2,%xmm9,%xmm6
+	vpaddq	%xmm6,%xmm14,%xmm14
+	vpmuludq	%xmm1,%xmm9,%xmm5
+	vpaddq	%xmm5,%xmm13,%xmm13
+	vpshufd	$0x32,16(%rdi),%xmm8
+	vpmuludq	%xmm0,%xmm9,%xmm9
+	vpaddq	%xmm9,%xmm12,%xmm12
+	vpmuludq	%xmm4,%xmm7,%xmm6
+	vpaddq	%xmm6,%xmm11,%xmm11
+	vpshufd	$0x32,32(%rdi),%xmm9
+	vpmuludq	%xmm3,%xmm7,%xmm7
+	vpaddq	%xmm7,%xmm10,%xmm10
+
+	vpmuludq	%xmm1,%xmm8,%xmm5
+	vpaddq	%xmm5,%xmm14,%xmm14
+	vpmuludq	%xmm0,%xmm8,%xmm8
+	vpaddq	%xmm8,%xmm13,%xmm13
+	vpshufd	$0x32,48(%rdi),%xmm7
+	vpmuludq	%xmm4,%xmm9,%xmm6
+	vpaddq	%xmm6,%xmm12,%xmm12
+	vpshufd	$0x32,64(%rdi),%xmm8
+	vpmuludq	%xmm3,%xmm9,%xmm5
+	vpaddq	%xmm5,%xmm11,%xmm11
+	vpmuludq	%xmm2,%xmm9,%xmm9
+	vpaddq	%xmm9,%xmm10,%xmm10
+
+	vpmuludq	%xmm0,%xmm7,%xmm7
+	vpaddq	%xmm7,%xmm14,%xmm14
+	vpmuludq	%xmm4,%xmm8,%xmm6
+	vpaddq	%xmm6,%xmm13,%xmm13
+	vpmuludq	%xmm3,%xmm8,%xmm5
+	vpaddq	%xmm5,%xmm12,%xmm12
+	vpmuludq	%xmm2,%xmm8,%xmm6
+	vpaddq	%xmm6,%xmm11,%xmm11
+	vpmuludq	%xmm1,%xmm8,%xmm8
+	vpaddq	%xmm8,%xmm10,%xmm10
+
+.Lshort_tail_avx:
+
+
+
+	vpsrldq	$8,%xmm14,%xmm9
+	vpsrldq	$8,%xmm13,%xmm8
+	vpsrldq	$8,%xmm11,%xmm6
+	vpsrldq	$8,%xmm10,%xmm5
+	vpsrldq	$8,%xmm12,%xmm7
+	vpaddq	%xmm8,%xmm13,%xmm13
+	vpaddq	%xmm9,%xmm14,%xmm14
+	vpaddq	%xmm5,%xmm10,%xmm10
+	vpaddq	%xmm6,%xmm11,%xmm11
+	vpaddq	%xmm7,%xmm12,%xmm12
+
+
+
+
+	vpsrlq	$26,%xmm13,%xmm3
+	vpand	%xmm15,%xmm13,%xmm13
+	vpaddq	%xmm3,%xmm14,%xmm14
+
+	vpsrlq	$26,%xmm10,%xmm0
+	vpand	%xmm15,%xmm10,%xmm10
+	vpaddq	%xmm0,%xmm11,%xmm11
+
+	vpsrlq	$26,%xmm14,%xmm4
+	vpand	%xmm15,%xmm14,%xmm14
+
+	vpsrlq	$26,%xmm11,%xmm1
+	vpand	%xmm15,%xmm11,%xmm11
+	vpaddq	%xmm1,%xmm12,%xmm12
+
+	vpaddq	%xmm4,%xmm10,%xmm10
+	vpsllq	$2,%xmm4,%xmm4
+	vpaddq	%xmm4,%xmm10,%xmm10
+
+	vpsrlq	$26,%xmm12,%xmm2
+	vpand	%xmm15,%xmm12,%xmm12
+	vpaddq	%xmm2,%xmm13,%xmm13
+
+	vpsrlq	$26,%xmm10,%xmm0
+	vpand	%xmm15,%xmm10,%xmm10
+	vpaddq	%xmm0,%xmm11,%xmm11
+
+	vpsrlq	$26,%xmm13,%xmm3
+	vpand	%xmm15,%xmm13,%xmm13
+	vpaddq	%xmm3,%xmm14,%xmm14
+
+	vmovd	%xmm10,-112(%rdi)
+	vmovd	%xmm11,-108(%rdi)
+	vmovd	%xmm12,-104(%rdi)
+	vmovd	%xmm13,-100(%rdi)
+	vmovd	%xmm14,-96(%rdi)
+	leaq	88(%r11),%rsp
+.cfi_def_cfa	%rsp,8
+	vzeroupper
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	poly1305_blocks_avx,.-poly1305_blocks_avx
+
+.type	poly1305_emit_avx,@function
+.align	32
+poly1305_emit_avx:
+	cmpl	$0,20(%rdi)
+	je	.Lemit
+
+	movl	0(%rdi),%eax
+	movl	4(%rdi),%ecx
+	movl	8(%rdi),%r8d
+	movl	12(%rdi),%r11d
+	movl	16(%rdi),%r10d
+
+	shlq	$26,%rcx
+	movq	%r8,%r9
+	shlq	$52,%r8
+	addq	%rcx,%rax
+	shrq	$12,%r9
+	addq	%rax,%r8
+	adcq	$0,%r9
+
+	shlq	$14,%r11
+	movq	%r10,%rax
+	shrq	$24,%r10
+	addq	%r11,%r9
+	shlq	$40,%rax
+	addq	%rax,%r9
+	adcq	$0,%r10
+
+	movq	%r10,%rax
+	movq	%r10,%rcx
+	andq	$3,%r10
+	shrq	$2,%rax
+	andq	$-4,%rcx
+	addq	%rcx,%rax
+	addq	%rax,%r8
+	adcq	$0,%r9
+	adcq	$0,%r10
+
+	movq	%r8,%rax
+	addq	$5,%r8
+	movq	%r9,%rcx
+	adcq	$0,%r9
+	adcq	$0,%r10
+	shrq	$2,%r10
+	cmovnzq	%r8,%rax
+	cmovnzq	%r9,%rcx
+
+	addq	0(%rdx),%rax
+	adcq	8(%rdx),%rcx
+	movq	%rax,0(%rsi)
+	movq	%rcx,8(%rsi)
+
+	.byte	0xf3,0xc3
+.size	poly1305_emit_avx,.-poly1305_emit_avx
+.type	poly1305_blocks_avx2,@function
+.align	32
+poly1305_blocks_avx2:
+.cfi_startproc	
+	movl	20(%rdi),%r8d
+	cmpq	$128,%rdx
+	jae	.Lblocks_avx2
+	testl	%r8d,%r8d
+	jz	.Lblocks
+
+.Lblocks_avx2:
+	andq	$-16,%rdx
+	jz	.Lno_data_avx2
+
+	vzeroupper
+
+	testl	%r8d,%r8d
+	jz	.Lbase2_64_avx2
+
+	testq	$63,%rdx
+	jz	.Leven_avx2
+
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+.Lblocks_avx2_body:
+
+	movq	%rdx,%r15
+
+	movq	0(%rdi),%r8
+	movq	8(%rdi),%r9
+	movl	16(%rdi),%ebp
+
+	movq	24(%rdi),%r11
+	movq	32(%rdi),%r13
+
+
+	movl	%r8d,%r14d
+	andq	$-2147483648,%r8
+	movq	%r9,%r12
+	movl	%r9d,%ebx
+	andq	$-2147483648,%r9
+
+	shrq	$6,%r8
+	shlq	$52,%r12
+	addq	%r8,%r14
+	shrq	$12,%rbx
+	shrq	$18,%r9
+	addq	%r12,%r14
+	adcq	%r9,%rbx
+
+	movq	%rbp,%r8
+	shlq	$40,%r8
+	shrq	$24,%rbp
+	addq	%r8,%rbx
+	adcq	$0,%rbp
+
+	movq	$-4,%r9
+	movq	%rbp,%r8
+	andq	%rbp,%r9
+	shrq	$2,%r8
+	andq	$3,%rbp
+	addq	%r9,%r8
+	addq	%r8,%r14
+	adcq	$0,%rbx
+	adcq	$0,%rbp
+
+	movq	%r13,%r12
+	movq	%r13,%rax
+	shrq	$2,%r13
+	addq	%r12,%r13
+
+.Lbase2_26_pre_avx2:
+	addq	0(%rsi),%r14
+	adcq	8(%rsi),%rbx
+	leaq	16(%rsi),%rsi
+	adcq	%rcx,%rbp
+	subq	$16,%r15
+
+	call	__poly1305_block
+	movq	%r12,%rax
+
+	testq	$63,%r15
+	jnz	.Lbase2_26_pre_avx2
+
+	testq	%rcx,%rcx
+	jz	.Lstore_base2_64_avx2
+
+
+	movq	%r14,%rax
+	movq	%r14,%rdx
+	shrq	$52,%r14
+	movq	%rbx,%r11
+	movq	%rbx,%r12
+	shrq	$26,%rdx
+	andq	$0x3ffffff,%rax
+	shlq	$12,%r11
+	andq	$0x3ffffff,%rdx
+	shrq	$14,%rbx
+	orq	%r11,%r14
+	shlq	$24,%rbp
+	andq	$0x3ffffff,%r14
+	shrq	$40,%r12
+	andq	$0x3ffffff,%rbx
+	orq	%r12,%rbp
+
+	testq	%r15,%r15
+	jz	.Lstore_base2_26_avx2
+
+	vmovd	%eax,%xmm0
+	vmovd	%edx,%xmm1
+	vmovd	%r14d,%xmm2
+	vmovd	%ebx,%xmm3
+	vmovd	%ebp,%xmm4
+	jmp	.Lproceed_avx2
+
+.align	32
+.Lstore_base2_64_avx2:
+	movq	%r14,0(%rdi)
+	movq	%rbx,8(%rdi)
+	movq	%rbp,16(%rdi)
+	jmp	.Ldone_avx2
+
+.align	16
+.Lstore_base2_26_avx2:
+	movl	%eax,0(%rdi)
+	movl	%edx,4(%rdi)
+	movl	%r14d,8(%rdi)
+	movl	%ebx,12(%rdi)
+	movl	%ebp,16(%rdi)
+.align	16
+.Ldone_avx2:
+	movq	0(%rsp),%r15
+.cfi_restore	%r15
+	movq	8(%rsp),%r14
+.cfi_restore	%r14
+	movq	16(%rsp),%r13
+.cfi_restore	%r13
+	movq	24(%rsp),%r12
+.cfi_restore	%r12
+	movq	32(%rsp),%rbp
+.cfi_restore	%rbp
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	leaq	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lno_data_avx2:
+.Lblocks_avx2_epilogue:
+	.byte	0xf3,0xc3
+.cfi_endproc	
+
+.align	32
+.Lbase2_64_avx2:
+.cfi_startproc	
+	pushq	%rbx
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbx,-16
+	pushq	%rbp
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%rbp,-24
+	pushq	%r12
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r12,-32
+	pushq	%r13
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r13,-40
+	pushq	%r14
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r14,-48
+	pushq	%r15
+.cfi_adjust_cfa_offset	8
+.cfi_offset	%r15,-56
+.Lbase2_64_avx2_body:
+
+	movq	%rdx,%r15
+
+	movq	24(%rdi),%r11
+	movq	32(%rdi),%r13
+
+	movq	0(%rdi),%r14
+	movq	8(%rdi),%rbx
+	movl	16(%rdi),%ebp
+
+	movq	%r13,%r12
+	movq	%r13,%rax
+	shrq	$2,%r13
+	addq	%r12,%r13
+
+	testq	$63,%rdx
+	jz	.Linit_avx2
+
+.Lbase2_64_pre_avx2:
+	addq	0(%rsi),%r14
+	adcq	8(%rsi),%rbx
+	leaq	16(%rsi),%rsi
+	adcq	%rcx,%rbp
+	subq	$16,%r15
+
+	call	__poly1305_block
+	movq	%r12,%rax
+
+	testq	$63,%r15
+	jnz	.Lbase2_64_pre_avx2
+
+.Linit_avx2:
+
+	movq	%r14,%rax
+	movq	%r14,%rdx
+	shrq	$52,%r14
+	movq	%rbx,%r8
+	movq	%rbx,%r9
+	shrq	$26,%rdx
+	andq	$0x3ffffff,%rax
+	shlq	$12,%r8
+	andq	$0x3ffffff,%rdx
+	shrq	$14,%rbx
+	orq	%r8,%r14
+	shlq	$24,%rbp
+	andq	$0x3ffffff,%r14
+	shrq	$40,%r9
+	andq	$0x3ffffff,%rbx
+	orq	%r9,%rbp
+
+	vmovd	%eax,%xmm0
+	vmovd	%edx,%xmm1
+	vmovd	%r14d,%xmm2
+	vmovd	%ebx,%xmm3
+	vmovd	%ebp,%xmm4
+	movl	$1,20(%rdi)
+
+	call	__poly1305_init_avx
+
+.Lproceed_avx2:
+	movq	%r15,%rdx
+	movl	OPENSSL_ia32cap_P+8(%rip),%r10d
+	movl	$3221291008,%r11d
+
+	movq	0(%rsp),%r15
+.cfi_restore	%r15
+	movq	8(%rsp),%r14
+.cfi_restore	%r14
+	movq	16(%rsp),%r13
+.cfi_restore	%r13
+	movq	24(%rsp),%r12
+.cfi_restore	%r12
+	movq	32(%rsp),%rbp
+.cfi_restore	%rbp
+	movq	40(%rsp),%rbx
+.cfi_restore	%rbx
+	leaq	48(%rsp),%rax
+	leaq	48(%rsp),%rsp
+.cfi_adjust_cfa_offset	-48
+.Lbase2_64_avx2_epilogue:
+	jmp	.Ldo_avx2
+.cfi_endproc	
+
+.align	32
+.Leven_avx2:
+.cfi_startproc	
+	movl	OPENSSL_ia32cap_P+8(%rip),%r10d
+	vmovd	0(%rdi),%xmm0
+	vmovd	4(%rdi),%xmm1
+	vmovd	8(%rdi),%xmm2
+	vmovd	12(%rdi),%xmm3
+	vmovd	16(%rdi),%xmm4
+
+.Ldo_avx2:
+	cmpq	$512,%rdx
+	jb	.Lskip_avx512
+	andl	%r11d,%r10d
+	testl	$65536,%r10d
+	jnz	.Lblocks_avx512
+.Lskip_avx512:
+	leaq	-8(%rsp),%r11
+.cfi_def_cfa	%r11,16
+	subq	$0x128,%rsp
+	leaq	.Lconst(%rip),%rcx
+	leaq	48+64(%rdi),%rdi
+	vmovdqa	96(%rcx),%ymm7
+
+
+	vmovdqu	-64(%rdi),%xmm9
+	andq	$-512,%rsp
+	vmovdqu	-48(%rdi),%xmm10
+	vmovdqu	-32(%rdi),%xmm6
+	vmovdqu	-16(%rdi),%xmm11
+	vmovdqu	0(%rdi),%xmm12
+	vmovdqu	16(%rdi),%xmm13
+	leaq	144(%rsp),%rax
+	vmovdqu	32(%rdi),%xmm14
+	vpermd	%ymm9,%ymm7,%ymm9
+	vmovdqu	48(%rdi),%xmm15
+	vpermd	%ymm10,%ymm7,%ymm10
+	vmovdqu	64(%rdi),%xmm5
+	vpermd	%ymm6,%ymm7,%ymm6
+	vmovdqa	%ymm9,0(%rsp)
+	vpermd	%ymm11,%ymm7,%ymm11
+	vmovdqa	%ymm10,32-144(%rax)
+	vpermd	%ymm12,%ymm7,%ymm12
+	vmovdqa	%ymm6,64-144(%rax)
+	vpermd	%ymm13,%ymm7,%ymm13
+	vmovdqa	%ymm11,96-144(%rax)
+	vpermd	%ymm14,%ymm7,%ymm14
+	vmovdqa	%ymm12,128-144(%rax)
+	vpermd	%ymm15,%ymm7,%ymm15
+	vmovdqa	%ymm13,160-144(%rax)
+	vpermd	%ymm5,%ymm7,%ymm5
+	vmovdqa	%ymm14,192-144(%rax)
+	vmovdqa	%ymm15,224-144(%rax)
+	vmovdqa	%ymm5,256-144(%rax)
+	vmovdqa	64(%rcx),%ymm5
+
+
+
+	vmovdqu	0(%rsi),%xmm7
+	vmovdqu	16(%rsi),%xmm8
+	vinserti128	$1,32(%rsi),%ymm7,%ymm7
+	vinserti128	$1,48(%rsi),%ymm8,%ymm8
+	leaq	64(%rsi),%rsi
+
+	vpsrldq	$6,%ymm7,%ymm9
+	vpsrldq	$6,%ymm8,%ymm10
+	vpunpckhqdq	%ymm8,%ymm7,%ymm6
+	vpunpcklqdq	%ymm10,%ymm9,%ymm9
+	vpunpcklqdq	%ymm8,%ymm7,%ymm7
+
+	vpsrlq	$30,%ymm9,%ymm10
+	vpsrlq	$4,%ymm9,%ymm9
+	vpsrlq	$26,%ymm7,%ymm8
+	vpsrlq	$40,%ymm6,%ymm6
+	vpand	%ymm5,%ymm9,%ymm9
+	vpand	%ymm5,%ymm7,%ymm7
+	vpand	%ymm5,%ymm8,%ymm8
+	vpand	%ymm5,%ymm10,%ymm10
+	vpor	32(%rcx),%ymm6,%ymm6
+
+	vpaddq	%ymm2,%ymm9,%ymm2
+	subq	$64,%rdx
+	jz	.Ltail_avx2
+	jmp	.Loop_avx2
+
+.align	32
+.Loop_avx2:
+
+
+
+
+
+
+
+
+	vpaddq	%ymm0,%ymm7,%ymm0
+	vmovdqa	0(%rsp),%ymm7
+	vpaddq	%ymm1,%ymm8,%ymm1
+	vmovdqa	32(%rsp),%ymm8
+	vpaddq	%ymm3,%ymm10,%ymm3
+	vmovdqa	96(%rsp),%ymm9
+	vpaddq	%ymm4,%ymm6,%ymm4
+	vmovdqa	48(%rax),%ymm10
+	vmovdqa	112(%rax),%ymm5
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	vpmuludq	%ymm2,%ymm7,%ymm13
+	vpmuludq	%ymm2,%ymm8,%ymm14
+	vpmuludq	%ymm2,%ymm9,%ymm15
+	vpmuludq	%ymm2,%ymm10,%ymm11
+	vpmuludq	%ymm2,%ymm5,%ymm12
+
+	vpmuludq	%ymm0,%ymm8,%ymm6
+	vpmuludq	%ymm1,%ymm8,%ymm2
+	vpaddq	%ymm6,%ymm12,%ymm12
+	vpaddq	%ymm2,%ymm13,%ymm13
+	vpmuludq	%ymm3,%ymm8,%ymm6
+	vpmuludq	64(%rsp),%ymm4,%ymm2
+	vpaddq	%ymm6,%ymm15,%ymm15
+	vpaddq	%ymm2,%ymm11,%ymm11
+	vmovdqa	-16(%rax),%ymm8
+
+	vpmuludq	%ymm0,%ymm7,%ymm6
+	vpmuludq	%ymm1,%ymm7,%ymm2
+	vpaddq	%ymm6,%ymm11,%ymm11
+	vpaddq	%ymm2,%ymm12,%ymm12
+	vpmuludq	%ymm3,%ymm7,%ymm6
+	vpmuludq	%ymm4,%ymm7,%ymm2
+	vmovdqu	0(%rsi),%xmm7
+	vpaddq	%ymm6,%ymm14,%ymm14
+	vpaddq	%ymm2,%ymm15,%ymm15
+	vinserti128	$1,32(%rsi),%ymm7,%ymm7
+
+	vpmuludq	%ymm3,%ymm8,%ymm6
+	vpmuludq	%ymm4,%ymm8,%ymm2
+	vmovdqu	16(%rsi),%xmm8
+	vpaddq	%ymm6,%ymm11,%ymm11
+	vpaddq	%ymm2,%ymm12,%ymm12
+	vmovdqa	16(%rax),%ymm2
+	vpmuludq	%ymm1,%ymm9,%ymm6
+	vpmuludq	%ymm0,%ymm9,%ymm9
+	vpaddq	%ymm6,%ymm14,%ymm14
+	vpaddq	%ymm9,%ymm13,%ymm13
+	vinserti128	$1,48(%rsi),%ymm8,%ymm8
+	leaq	64(%rsi),%rsi
+
+	vpmuludq	%ymm1,%ymm2,%ymm6
+	vpmuludq	%ymm0,%ymm2,%ymm2
+	vpsrldq	$6,%ymm7,%ymm9
+	vpaddq	%ymm6,%ymm15,%ymm15
+	vpaddq	%ymm2,%ymm14,%ymm14
+	vpmuludq	%ymm3,%ymm10,%ymm6
+	vpmuludq	%ymm4,%ymm10,%ymm2
+	vpsrldq	$6,%ymm8,%ymm10
+	vpaddq	%ymm6,%ymm12,%ymm12
+	vpaddq	%ymm2,%ymm13,%ymm13
+	vpunpckhqdq	%ymm8,%ymm7,%ymm6
+
+	vpmuludq	%ymm3,%ymm5,%ymm3
+	vpmuludq	%ymm4,%ymm5,%ymm4
+	vpunpcklqdq	%ymm8,%ymm7,%ymm7
+	vpaddq	%ymm3,%ymm13,%ymm2
+	vpaddq	%ymm4,%ymm14,%ymm3
+	vpunpcklqdq	%ymm10,%ymm9,%ymm10
+	vpmuludq	80(%rax),%ymm0,%ymm4
+	vpmuludq	%ymm1,%ymm5,%ymm0
+	vmovdqa	64(%rcx),%ymm5
+	vpaddq	%ymm4,%ymm15,%ymm4
+	vpaddq	%ymm0,%ymm11,%ymm0
+
+
+
+
+	vpsrlq	$26,%ymm3,%ymm14
+	vpand	%ymm5,%ymm3,%ymm3
+	vpaddq	%ymm14,%ymm4,%ymm4
+
+	vpsrlq	$26,%ymm0,%ymm11
+	vpand	%ymm5,%ymm0,%ymm0
+	vpaddq	%ymm11,%ymm12,%ymm1
+
+	vpsrlq	$26,%ymm4,%ymm15
+	vpand	%ymm5,%ymm4,%ymm4
+
+	vpsrlq	$4,%ymm10,%ymm9
+
+	vpsrlq	$26,%ymm1,%ymm12
+	vpand	%ymm5,%ymm1,%ymm1
+	vpaddq	%ymm12,%ymm2,%ymm2
+
+	vpaddq	%ymm15,%ymm0,%ymm0
+	vpsllq	$2,%ymm15,%ymm15
+	vpaddq	%ymm15,%ymm0,%ymm0
+
+	vpand	%ymm5,%ymm9,%ymm9
+	vpsrlq	$26,%ymm7,%ymm8
+
+	vpsrlq	$26,%ymm2,%ymm13
+	vpand	%ymm5,%ymm2,%ymm2
+	vpaddq	%ymm13,%ymm3,%ymm3
+
+	vpaddq	%ymm9,%ymm2,%ymm2
+	vpsrlq	$30,%ymm10,%ymm10
+
+	vpsrlq	$26,%ymm0,%ymm11
+	vpand	%ymm5,%ymm0,%ymm0
+	vpaddq	%ymm11,%ymm1,%ymm1
+
+	vpsrlq	$40,%ymm6,%ymm6
+
+	vpsrlq	$26,%ymm3,%ymm14
+	vpand	%ymm5,%ymm3,%ymm3
+	vpaddq	%ymm14,%ymm4,%ymm4
+
+	vpand	%ymm5,%ymm7,%ymm7
+	vpand	%ymm5,%ymm8,%ymm8
+	vpand	%ymm5,%ymm10,%ymm10
+	vpor	32(%rcx),%ymm6,%ymm6
+
+	subq	$64,%rdx
+	jnz	.Loop_avx2
+
+.byte	0x66,0x90
+.Ltail_avx2:
+
+
+
+
+
+
+
+	vpaddq	%ymm0,%ymm7,%ymm0
+	vmovdqu	4(%rsp),%ymm7
+	vpaddq	%ymm1,%ymm8,%ymm1
+	vmovdqu	36(%rsp),%ymm8
+	vpaddq	%ymm3,%ymm10,%ymm3
+	vmovdqu	100(%rsp),%ymm9
+	vpaddq	%ymm4,%ymm6,%ymm4
+	vmovdqu	52(%rax),%ymm10
+	vmovdqu	116(%rax),%ymm5
+
+	vpmuludq	%ymm2,%ymm7,%ymm13
+	vpmuludq	%ymm2,%ymm8,%ymm14
+	vpmuludq	%ymm2,%ymm9,%ymm15
+	vpmuludq	%ymm2,%ymm10,%ymm11
+	vpmuludq	%ymm2,%ymm5,%ymm12
+
+	vpmuludq	%ymm0,%ymm8,%ymm6
+	vpmuludq	%ymm1,%ymm8,%ymm2
+	vpaddq	%ymm6,%ymm12,%ymm12
+	vpaddq	%ymm2,%ymm13,%ymm13
+	vpmuludq	%ymm3,%ymm8,%ymm6
+	vpmuludq	68(%rsp),%ymm4,%ymm2
+	vpaddq	%ymm6,%ymm15,%ymm15
+	vpaddq	%ymm2,%ymm11,%ymm11
+
+	vpmuludq	%ymm0,%ymm7,%ymm6
+	vpmuludq	%ymm1,%ymm7,%ymm2
+	vpaddq	%ymm6,%ymm11,%ymm11
+	vmovdqu	-12(%rax),%ymm8
+	vpaddq	%ymm2,%ymm12,%ymm12
+	vpmuludq	%ymm3,%ymm7,%ymm6
+	vpmuludq	%ymm4,%ymm7,%ymm2
+	vpaddq	%ymm6,%ymm14,%ymm14
+	vpaddq	%ymm2,%ymm15,%ymm15
+
+	vpmuludq	%ymm3,%ymm8,%ymm6
+	vpmuludq	%ymm4,%ymm8,%ymm2
+	vpaddq	%ymm6,%ymm11,%ymm11
+	vpaddq	%ymm2,%ymm12,%ymm12
+	vmovdqu	20(%rax),%ymm2
+	vpmuludq	%ymm1,%ymm9,%ymm6
+	vpmuludq	%ymm0,%ymm9,%ymm9
+	vpaddq	%ymm6,%ymm14,%ymm14
+	vpaddq	%ymm9,%ymm13,%ymm13
+
+	vpmuludq	%ymm1,%ymm2,%ymm6
+	vpmuludq	%ymm0,%ymm2,%ymm2
+	vpaddq	%ymm6,%ymm15,%ymm15
+	vpaddq	%ymm2,%ymm14,%ymm14
+	vpmuludq	%ymm3,%ymm10,%ymm6
+	vpmuludq	%ymm4,%ymm10,%ymm2
+	vpaddq	%ymm6,%ymm12,%ymm12
+	vpaddq	%ymm2,%ymm13,%ymm13
+
+	vpmuludq	%ymm3,%ymm5,%ymm3
+	vpmuludq	%ymm4,%ymm5,%ymm4
+	vpaddq	%ymm3,%ymm13,%ymm2
+	vpaddq	%ymm4,%ymm14,%ymm3
+	vpmuludq	84(%rax),%ymm0,%ymm4
+	vpmuludq	%ymm1,%ymm5,%ymm0
+	vmovdqa	64(%rcx),%ymm5
+	vpaddq	%ymm4,%ymm15,%ymm4
+	vpaddq	%ymm0,%ymm11,%ymm0
+
+
+
+
+	vpsrldq	$8,%ymm12,%ymm8
+	vpsrldq	$8,%ymm2,%ymm9
+	vpsrldq	$8,%ymm3,%ymm10
+	vpsrldq	$8,%ymm4,%ymm6
+	vpsrldq	$8,%ymm0,%ymm7
+	vpaddq	%ymm8,%ymm12,%ymm12
+	vpaddq	%ymm9,%ymm2,%ymm2
+	vpaddq	%ymm10,%ymm3,%ymm3
+	vpaddq	%ymm6,%ymm4,%ymm4
+	vpaddq	%ymm7,%ymm0,%ymm0
+
+	vpermq	$0x2,%ymm3,%ymm10
+	vpermq	$0x2,%ymm4,%ymm6
+	vpermq	$0x2,%ymm0,%ymm7
+	vpermq	$0x2,%ymm12,%ymm8
+	vpermq	$0x2,%ymm2,%ymm9
+	vpaddq	%ymm10,%ymm3,%ymm3
+	vpaddq	%ymm6,%ymm4,%ymm4
+	vpaddq	%ymm7,%ymm0,%ymm0
+	vpaddq	%ymm8,%ymm12,%ymm12
+	vpaddq	%ymm9,%ymm2,%ymm2
+
+
+
+
+	vpsrlq	$26,%ymm3,%ymm14
+	vpand	%ymm5,%ymm3,%ymm3
+	vpaddq	%ymm14,%ymm4,%ymm4
+
+	vpsrlq	$26,%ymm0,%ymm11
+	vpand	%ymm5,%ymm0,%ymm0
+	vpaddq	%ymm11,%ymm12,%ymm1
+
+	vpsrlq	$26,%ymm4,%ymm15
+	vpand	%ymm5,%ymm4,%ymm4
+
+	vpsrlq	$26,%ymm1,%ymm12
+	vpand	%ymm5,%ymm1,%ymm1
+	vpaddq	%ymm12,%ymm2,%ymm2
+
+	vpaddq	%ymm15,%ymm0,%ymm0
+	vpsllq	$2,%ymm15,%ymm15
+	vpaddq	%ymm15,%ymm0,%ymm0
+
+	vpsrlq	$26,%ymm2,%ymm13
+	vpand	%ymm5,%ymm2,%ymm2
+	vpaddq	%ymm13,%ymm3,%ymm3
+
+	vpsrlq	$26,%ymm0,%ymm11
+	vpand	%ymm5,%ymm0,%ymm0
+	vpaddq	%ymm11,%ymm1,%ymm1
+
+	vpsrlq	$26,%ymm3,%ymm14
+	vpand	%ymm5,%ymm3,%ymm3
+	vpaddq	%ymm14,%ymm4,%ymm4
+
+	vmovd	%xmm0,-112(%rdi)
+	vmovd	%xmm1,-108(%rdi)
+	vmovd	%xmm2,-104(%rdi)
+	vmovd	%xmm3,-100(%rdi)
+	vmovd	%xmm4,-96(%rdi)
+	leaq	8(%r11),%rsp
+.cfi_def_cfa	%rsp,8
+	vzeroupper
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	poly1305_blocks_avx2,.-poly1305_blocks_avx2
+.type	poly1305_blocks_avx512,@function
+.align	32
+poly1305_blocks_avx512:
+.cfi_startproc	
+.Lblocks_avx512:
+	movl	$15,%eax
+	kmovw	%eax,%k2
+	leaq	-8(%rsp),%r11
+.cfi_def_cfa	%r11,16
+	subq	$0x128,%rsp
+	leaq	.Lconst(%rip),%rcx
+	leaq	48+64(%rdi),%rdi
+	vmovdqa	96(%rcx),%ymm9
+
+
+	vmovdqu	-64(%rdi),%xmm11
+	andq	$-512,%rsp
+	vmovdqu	-48(%rdi),%xmm12
+	movq	$0x20,%rax
+	vmovdqu	-32(%rdi),%xmm7
+	vmovdqu	-16(%rdi),%xmm13
+	vmovdqu	0(%rdi),%xmm8
+	vmovdqu	16(%rdi),%xmm14
+	vmovdqu	32(%rdi),%xmm10
+	vmovdqu	48(%rdi),%xmm15
+	vmovdqu	64(%rdi),%xmm6
+	vpermd	%zmm11,%zmm9,%zmm16
+	vpbroadcastq	64(%rcx),%zmm5
+	vpermd	%zmm12,%zmm9,%zmm17
+	vpermd	%zmm7,%zmm9,%zmm21
+	vpermd	%zmm13,%zmm9,%zmm18
+	vmovdqa64	%zmm16,0(%rsp){%k2}
+	vpsrlq	$32,%zmm16,%zmm7
+	vpermd	%zmm8,%zmm9,%zmm22
+	vmovdqu64	%zmm17,0(%rsp,%rax,1){%k2}
+	vpsrlq	$32,%zmm17,%zmm8
+	vpermd	%zmm14,%zmm9,%zmm19
+	vmovdqa64	%zmm21,64(%rsp){%k2}
+	vpermd	%zmm10,%zmm9,%zmm23
+	vpermd	%zmm15,%zmm9,%zmm20
+	vmovdqu64	%zmm18,64(%rsp,%rax,1){%k2}
+	vpermd	%zmm6,%zmm9,%zmm24
+	vmovdqa64	%zmm22,128(%rsp){%k2}
+	vmovdqu64	%zmm19,128(%rsp,%rax,1){%k2}
+	vmovdqa64	%zmm23,192(%rsp){%k2}
+	vmovdqu64	%zmm20,192(%rsp,%rax,1){%k2}
+	vmovdqa64	%zmm24,256(%rsp){%k2}
+
+
+
+
+
+
+
+
+
+
+	vpmuludq	%zmm7,%zmm16,%zmm11
+	vpmuludq	%zmm7,%zmm17,%zmm12
+	vpmuludq	%zmm7,%zmm18,%zmm13
+	vpmuludq	%zmm7,%zmm19,%zmm14
+	vpmuludq	%zmm7,%zmm20,%zmm15
+	vpsrlq	$32,%zmm18,%zmm9
+
+	vpmuludq	%zmm8,%zmm24,%zmm25
+	vpmuludq	%zmm8,%zmm16,%zmm26
+	vpmuludq	%zmm8,%zmm17,%zmm27
+	vpmuludq	%zmm8,%zmm18,%zmm28
+	vpmuludq	%zmm8,%zmm19,%zmm29
+	vpsrlq	$32,%zmm19,%zmm10
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm26,%zmm12,%zmm12
+	vpaddq	%zmm27,%zmm13,%zmm13
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+
+	vpmuludq	%zmm9,%zmm23,%zmm25
+	vpmuludq	%zmm9,%zmm24,%zmm26
+	vpmuludq	%zmm9,%zmm17,%zmm28
+	vpmuludq	%zmm9,%zmm18,%zmm29
+	vpmuludq	%zmm9,%zmm16,%zmm27
+	vpsrlq	$32,%zmm20,%zmm6
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm26,%zmm12,%zmm12
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm27,%zmm13,%zmm13
+
+	vpmuludq	%zmm10,%zmm22,%zmm25
+	vpmuludq	%zmm10,%zmm16,%zmm28
+	vpmuludq	%zmm10,%zmm17,%zmm29
+	vpmuludq	%zmm10,%zmm23,%zmm26
+	vpmuludq	%zmm10,%zmm24,%zmm27
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm26,%zmm12,%zmm12
+	vpaddq	%zmm27,%zmm13,%zmm13
+
+	vpmuludq	%zmm6,%zmm24,%zmm28
+	vpmuludq	%zmm6,%zmm16,%zmm29
+	vpmuludq	%zmm6,%zmm21,%zmm25
+	vpmuludq	%zmm6,%zmm22,%zmm26
+	vpmuludq	%zmm6,%zmm23,%zmm27
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm26,%zmm12,%zmm12
+	vpaddq	%zmm27,%zmm13,%zmm13
+
+
+
+	vmovdqu64	0(%rsi),%zmm10
+	vmovdqu64	64(%rsi),%zmm6
+	leaq	128(%rsi),%rsi
+
+
+
+
+	vpsrlq	$26,%zmm14,%zmm28
+	vpandq	%zmm5,%zmm14,%zmm14
+	vpaddq	%zmm28,%zmm15,%zmm15
+
+	vpsrlq	$26,%zmm11,%zmm25
+	vpandq	%zmm5,%zmm11,%zmm11
+	vpaddq	%zmm25,%zmm12,%zmm12
+
+	vpsrlq	$26,%zmm15,%zmm29
+	vpandq	%zmm5,%zmm15,%zmm15
+
+	vpsrlq	$26,%zmm12,%zmm26
+	vpandq	%zmm5,%zmm12,%zmm12
+	vpaddq	%zmm26,%zmm13,%zmm13
+
+	vpaddq	%zmm29,%zmm11,%zmm11
+	vpsllq	$2,%zmm29,%zmm29
+	vpaddq	%zmm29,%zmm11,%zmm11
+
+	vpsrlq	$26,%zmm13,%zmm27
+	vpandq	%zmm5,%zmm13,%zmm13
+	vpaddq	%zmm27,%zmm14,%zmm14
+
+	vpsrlq	$26,%zmm11,%zmm25
+	vpandq	%zmm5,%zmm11,%zmm11
+	vpaddq	%zmm25,%zmm12,%zmm12
+
+	vpsrlq	$26,%zmm14,%zmm28
+	vpandq	%zmm5,%zmm14,%zmm14
+	vpaddq	%zmm28,%zmm15,%zmm15
+
+
+
+
+
+	vpunpcklqdq	%zmm6,%zmm10,%zmm7
+	vpunpckhqdq	%zmm6,%zmm10,%zmm6
+
+
+
+
+
+
+	vmovdqa32	128(%rcx),%zmm25
+	movl	$0x7777,%eax
+	kmovw	%eax,%k1
+
+	vpermd	%zmm16,%zmm25,%zmm16
+	vpermd	%zmm17,%zmm25,%zmm17
+	vpermd	%zmm18,%zmm25,%zmm18
+	vpermd	%zmm19,%zmm25,%zmm19
+	vpermd	%zmm20,%zmm25,%zmm20
+
+	vpermd	%zmm11,%zmm25,%zmm16{%k1}
+	vpermd	%zmm12,%zmm25,%zmm17{%k1}
+	vpermd	%zmm13,%zmm25,%zmm18{%k1}
+	vpermd	%zmm14,%zmm25,%zmm19{%k1}
+	vpermd	%zmm15,%zmm25,%zmm20{%k1}
+
+	vpslld	$2,%zmm17,%zmm21
+	vpslld	$2,%zmm18,%zmm22
+	vpslld	$2,%zmm19,%zmm23
+	vpslld	$2,%zmm20,%zmm24
+	vpaddd	%zmm17,%zmm21,%zmm21
+	vpaddd	%zmm18,%zmm22,%zmm22
+	vpaddd	%zmm19,%zmm23,%zmm23
+	vpaddd	%zmm20,%zmm24,%zmm24
+
+	vpbroadcastq	32(%rcx),%zmm30
+
+	vpsrlq	$52,%zmm7,%zmm9
+	vpsllq	$12,%zmm6,%zmm10
+	vporq	%zmm10,%zmm9,%zmm9
+	vpsrlq	$26,%zmm7,%zmm8
+	vpsrlq	$14,%zmm6,%zmm10
+	vpsrlq	$40,%zmm6,%zmm6
+	vpandq	%zmm5,%zmm9,%zmm9
+	vpandq	%zmm5,%zmm7,%zmm7
+
+
+
+
+	vpaddq	%zmm2,%zmm9,%zmm2
+	subq	$192,%rdx
+	jbe	.Ltail_avx512
+	jmp	.Loop_avx512
+
+.align	32
+.Loop_avx512:
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+	vpmuludq	%zmm2,%zmm17,%zmm14
+	vpaddq	%zmm0,%zmm7,%zmm0
+	vpmuludq	%zmm2,%zmm18,%zmm15
+	vpandq	%zmm5,%zmm8,%zmm8
+	vpmuludq	%zmm2,%zmm23,%zmm11
+	vpandq	%zmm5,%zmm10,%zmm10
+	vpmuludq	%zmm2,%zmm24,%zmm12
+	vporq	%zmm30,%zmm6,%zmm6
+	vpmuludq	%zmm2,%zmm16,%zmm13
+	vpaddq	%zmm1,%zmm8,%zmm1
+	vpaddq	%zmm3,%zmm10,%zmm3
+	vpaddq	%zmm4,%zmm6,%zmm4
+
+	vmovdqu64	0(%rsi),%zmm10
+	vmovdqu64	64(%rsi),%zmm6
+	leaq	128(%rsi),%rsi
+	vpmuludq	%zmm0,%zmm19,%zmm28
+	vpmuludq	%zmm0,%zmm20,%zmm29
+	vpmuludq	%zmm0,%zmm16,%zmm25
+	vpmuludq	%zmm0,%zmm17,%zmm26
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm26,%zmm12,%zmm12
+
+	vpmuludq	%zmm1,%zmm18,%zmm28
+	vpmuludq	%zmm1,%zmm19,%zmm29
+	vpmuludq	%zmm1,%zmm24,%zmm25
+	vpmuludq	%zmm0,%zmm18,%zmm27
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm27,%zmm13,%zmm13
+
+	vpunpcklqdq	%zmm6,%zmm10,%zmm7
+	vpunpckhqdq	%zmm6,%zmm10,%zmm6
+
+	vpmuludq	%zmm3,%zmm16,%zmm28
+	vpmuludq	%zmm3,%zmm17,%zmm29
+	vpmuludq	%zmm1,%zmm16,%zmm26
+	vpmuludq	%zmm1,%zmm17,%zmm27
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm26,%zmm12,%zmm12
+	vpaddq	%zmm27,%zmm13,%zmm13
+
+	vpmuludq	%zmm4,%zmm24,%zmm28
+	vpmuludq	%zmm4,%zmm16,%zmm29
+	vpmuludq	%zmm3,%zmm22,%zmm25
+	vpmuludq	%zmm3,%zmm23,%zmm26
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpmuludq	%zmm3,%zmm24,%zmm27
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm26,%zmm12,%zmm12
+	vpaddq	%zmm27,%zmm13,%zmm13
+
+	vpmuludq	%zmm4,%zmm21,%zmm25
+	vpmuludq	%zmm4,%zmm22,%zmm26
+	vpmuludq	%zmm4,%zmm23,%zmm27
+	vpaddq	%zmm25,%zmm11,%zmm0
+	vpaddq	%zmm26,%zmm12,%zmm1
+	vpaddq	%zmm27,%zmm13,%zmm2
+
+
+
+
+	vpsrlq	$52,%zmm7,%zmm9
+	vpsllq	$12,%zmm6,%zmm10
+
+	vpsrlq	$26,%zmm14,%zmm3
+	vpandq	%zmm5,%zmm14,%zmm14
+	vpaddq	%zmm3,%zmm15,%zmm4
+
+	vporq	%zmm10,%zmm9,%zmm9
+
+	vpsrlq	$26,%zmm0,%zmm11
+	vpandq	%zmm5,%zmm0,%zmm0
+	vpaddq	%zmm11,%zmm1,%zmm1
+
+	vpandq	%zmm5,%zmm9,%zmm9
+
+	vpsrlq	$26,%zmm4,%zmm15
+	vpandq	%zmm5,%zmm4,%zmm4
+
+	vpsrlq	$26,%zmm1,%zmm12
+	vpandq	%zmm5,%zmm1,%zmm1
+	vpaddq	%zmm12,%zmm2,%zmm2
+
+	vpaddq	%zmm15,%zmm0,%zmm0
+	vpsllq	$2,%zmm15,%zmm15
+	vpaddq	%zmm15,%zmm0,%zmm0
+
+	vpaddq	%zmm9,%zmm2,%zmm2
+	vpsrlq	$26,%zmm7,%zmm8
+
+	vpsrlq	$26,%zmm2,%zmm13
+	vpandq	%zmm5,%zmm2,%zmm2
+	vpaddq	%zmm13,%zmm14,%zmm3
+
+	vpsrlq	$14,%zmm6,%zmm10
+
+	vpsrlq	$26,%zmm0,%zmm11
+	vpandq	%zmm5,%zmm0,%zmm0
+	vpaddq	%zmm11,%zmm1,%zmm1
+
+	vpsrlq	$40,%zmm6,%zmm6
+
+	vpsrlq	$26,%zmm3,%zmm14
+	vpandq	%zmm5,%zmm3,%zmm3
+	vpaddq	%zmm14,%zmm4,%zmm4
+
+	vpandq	%zmm5,%zmm7,%zmm7
+
+
+
+
+	subq	$128,%rdx
+	ja	.Loop_avx512
+
+.Ltail_avx512:
+
+
+
+
+
+	vpsrlq	$32,%zmm16,%zmm16
+	vpsrlq	$32,%zmm17,%zmm17
+	vpsrlq	$32,%zmm18,%zmm18
+	vpsrlq	$32,%zmm23,%zmm23
+	vpsrlq	$32,%zmm24,%zmm24
+	vpsrlq	$32,%zmm19,%zmm19
+	vpsrlq	$32,%zmm20,%zmm20
+	vpsrlq	$32,%zmm21,%zmm21
+	vpsrlq	$32,%zmm22,%zmm22
+
+
+
+	leaq	(%rsi,%rdx,1),%rsi
+
+
+	vpaddq	%zmm0,%zmm7,%zmm0
+
+	vpmuludq	%zmm2,%zmm17,%zmm14
+	vpmuludq	%zmm2,%zmm18,%zmm15
+	vpmuludq	%zmm2,%zmm23,%zmm11
+	vpandq	%zmm5,%zmm8,%zmm8
+	vpmuludq	%zmm2,%zmm24,%zmm12
+	vpandq	%zmm5,%zmm10,%zmm10
+	vpmuludq	%zmm2,%zmm16,%zmm13
+	vporq	%zmm30,%zmm6,%zmm6
+	vpaddq	%zmm1,%zmm8,%zmm1
+	vpaddq	%zmm3,%zmm10,%zmm3
+	vpaddq	%zmm4,%zmm6,%zmm4
+
+	vmovdqu	0(%rsi),%xmm7
+	vpmuludq	%zmm0,%zmm19,%zmm28
+	vpmuludq	%zmm0,%zmm20,%zmm29
+	vpmuludq	%zmm0,%zmm16,%zmm25
+	vpmuludq	%zmm0,%zmm17,%zmm26
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm26,%zmm12,%zmm12
+
+	vmovdqu	16(%rsi),%xmm8
+	vpmuludq	%zmm1,%zmm18,%zmm28
+	vpmuludq	%zmm1,%zmm19,%zmm29
+	vpmuludq	%zmm1,%zmm24,%zmm25
+	vpmuludq	%zmm0,%zmm18,%zmm27
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm27,%zmm13,%zmm13
+
+	vinserti128	$1,32(%rsi),%ymm7,%ymm7
+	vpmuludq	%zmm3,%zmm16,%zmm28
+	vpmuludq	%zmm3,%zmm17,%zmm29
+	vpmuludq	%zmm1,%zmm16,%zmm26
+	vpmuludq	%zmm1,%zmm17,%zmm27
+	vpaddq	%zmm28,%zmm14,%zmm14
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm26,%zmm12,%zmm12
+	vpaddq	%zmm27,%zmm13,%zmm13
+
+	vinserti128	$1,48(%rsi),%ymm8,%ymm8
+	vpmuludq	%zmm4,%zmm24,%zmm28
+	vpmuludq	%zmm4,%zmm16,%zmm29
+	vpmuludq	%zmm3,%zmm22,%zmm25
+	vpmuludq	%zmm3,%zmm23,%zmm26
+	vpmuludq	%zmm3,%zmm24,%zmm27
+	vpaddq	%zmm28,%zmm14,%zmm3
+	vpaddq	%zmm29,%zmm15,%zmm15
+	vpaddq	%zmm25,%zmm11,%zmm11
+	vpaddq	%zmm26,%zmm12,%zmm12
+	vpaddq	%zmm27,%zmm13,%zmm13
+
+	vpmuludq	%zmm4,%zmm21,%zmm25
+	vpmuludq	%zmm4,%zmm22,%zmm26
+	vpmuludq	%zmm4,%zmm23,%zmm27
+	vpaddq	%zmm25,%zmm11,%zmm0
+	vpaddq	%zmm26,%zmm12,%zmm1
+	vpaddq	%zmm27,%zmm13,%zmm2
+
+
+
+
+	movl	$1,%eax
+	vpermq	$0xb1,%zmm3,%zmm14
+	vpermq	$0xb1,%zmm15,%zmm4
+	vpermq	$0xb1,%zmm0,%zmm11
+	vpermq	$0xb1,%zmm1,%zmm12
+	vpermq	$0xb1,%zmm2,%zmm13
+	vpaddq	%zmm14,%zmm3,%zmm3
+	vpaddq	%zmm15,%zmm4,%zmm4
+	vpaddq	%zmm11,%zmm0,%zmm0
+	vpaddq	%zmm12,%zmm1,%zmm1
+	vpaddq	%zmm13,%zmm2,%zmm2
+
+	kmovw	%eax,%k3
+	vpermq	$0x2,%zmm3,%zmm14
+	vpermq	$0x2,%zmm4,%zmm15
+	vpermq	$0x2,%zmm0,%zmm11
+	vpermq	$0x2,%zmm1,%zmm12
+	vpermq	$0x2,%zmm2,%zmm13
+	vpaddq	%zmm14,%zmm3,%zmm3
+	vpaddq	%zmm15,%zmm4,%zmm4
+	vpaddq	%zmm11,%zmm0,%zmm0
+	vpaddq	%zmm12,%zmm1,%zmm1
+	vpaddq	%zmm13,%zmm2,%zmm2
+
+	vextracti64x4	$0x1,%zmm3,%ymm14
+	vextracti64x4	$0x1,%zmm4,%ymm15
+	vextracti64x4	$0x1,%zmm0,%ymm11
+	vextracti64x4	$0x1,%zmm1,%ymm12
+	vextracti64x4	$0x1,%zmm2,%ymm13
+	vpaddq	%zmm14,%zmm3,%zmm3{%k3}{z}
+	vpaddq	%zmm15,%zmm4,%zmm4{%k3}{z}
+	vpaddq	%zmm11,%zmm0,%zmm0{%k3}{z}
+	vpaddq	%zmm12,%zmm1,%zmm1{%k3}{z}
+	vpaddq	%zmm13,%zmm2,%zmm2{%k3}{z}
+
+
+
+	vpsrlq	$26,%ymm3,%ymm14
+	vpand	%ymm5,%ymm3,%ymm3
+	vpsrldq	$6,%ymm7,%ymm9
+	vpsrldq	$6,%ymm8,%ymm10
+	vpunpckhqdq	%ymm8,%ymm7,%ymm6
+	vpaddq	%ymm14,%ymm4,%ymm4
+
+	vpsrlq	$26,%ymm0,%ymm11
+	vpand	%ymm5,%ymm0,%ymm0
+	vpunpcklqdq	%ymm10,%ymm9,%ymm9
+	vpunpcklqdq	%ymm8,%ymm7,%ymm7
+	vpaddq	%ymm11,%ymm1,%ymm1
+
+	vpsrlq	$26,%ymm4,%ymm15
+	vpand	%ymm5,%ymm4,%ymm4
+
+	vpsrlq	$26,%ymm1,%ymm12
+	vpand	%ymm5,%ymm1,%ymm1
+	vpsrlq	$30,%ymm9,%ymm10
+	vpsrlq	$4,%ymm9,%ymm9
+	vpaddq	%ymm12,%ymm2,%ymm2
+
+	vpaddq	%ymm15,%ymm0,%ymm0
+	vpsllq	$2,%ymm15,%ymm15
+	vpsrlq	$26,%ymm7,%ymm8
+	vpsrlq	$40,%ymm6,%ymm6
+	vpaddq	%ymm15,%ymm0,%ymm0
+
+	vpsrlq	$26,%ymm2,%ymm13
+	vpand	%ymm5,%ymm2,%ymm2
+	vpand	%ymm5,%ymm9,%ymm9
+	vpand	%ymm5,%ymm7,%ymm7
+	vpaddq	%ymm13,%ymm3,%ymm3
+
+	vpsrlq	$26,%ymm0,%ymm11
+	vpand	%ymm5,%ymm0,%ymm0
+	vpaddq	%ymm2,%ymm9,%ymm2
+	vpand	%ymm5,%ymm8,%ymm8
+	vpaddq	%ymm11,%ymm1,%ymm1
+
+	vpsrlq	$26,%ymm3,%ymm14
+	vpand	%ymm5,%ymm3,%ymm3
+	vpand	%ymm5,%ymm10,%ymm10
+	vpor	32(%rcx),%ymm6,%ymm6
+	vpaddq	%ymm14,%ymm4,%ymm4
+
+	leaq	144(%rsp),%rax
+	addq	$64,%rdx
+	jnz	.Ltail_avx2
+
+	vpsubq	%ymm9,%ymm2,%ymm2
+	vmovd	%xmm0,-112(%rdi)
+	vmovd	%xmm1,-108(%rdi)
+	vmovd	%xmm2,-104(%rdi)
+	vmovd	%xmm3,-100(%rdi)
+	vmovd	%xmm4,-96(%rdi)
+	vzeroall
+	leaq	8(%r11),%rsp
+.cfi_def_cfa	%rsp,8
+	.byte	0xf3,0xc3
+.cfi_endproc	
+.size	poly1305_blocks_avx512,.-poly1305_blocks_avx512
+.type	poly1305_init_base2_44,@function
+.align	32
+poly1305_init_base2_44:
+	xorq	%rax,%rax
+	movq	%rax,0(%rdi)
+	movq	%rax,8(%rdi)
+	movq	%rax,16(%rdi)
+
+.Linit_base2_44:
+	leaq	poly1305_blocks_vpmadd52(%rip),%r10
+	leaq	poly1305_emit_base2_44(%rip),%r11
+
+	movq	$0x0ffffffc0fffffff,%rax
+	movq	$0x0ffffffc0ffffffc,%rcx
+	andq	0(%rsi),%rax
+	movq	$0x00000fffffffffff,%r8
+	andq	8(%rsi),%rcx
+	movq	$0x00000fffffffffff,%r9
+	andq	%rax,%r8
+	shrdq	$44,%rcx,%rax
+	movq	%r8,40(%rdi)
+	andq	%r9,%rax
+	shrq	$24,%rcx
+	movq	%rax,48(%rdi)
+	leaq	(%rax,%rax,4),%rax
+	movq	%rcx,56(%rdi)
+	shlq	$2,%rax
+	leaq	(%rcx,%rcx,4),%rcx
+	shlq	$2,%rcx
+	movq	%rax,24(%rdi)
+	movq	%rcx,32(%rdi)
+	movq	$-1,64(%rdi)
+	movq	%r10,0(%rdx)
+	movq	%r11,8(%rdx)
+	movl	$1,%eax
+	.byte	0xf3,0xc3
+.size	poly1305_init_base2_44,.-poly1305_init_base2_44
+.type	poly1305_blocks_vpmadd52,@function
+.align	32
+poly1305_blocks_vpmadd52:
+	shrq	$4,%rdx
+	jz	.Lno_data_vpmadd52
+
+	shlq	$40,%rcx
+	movq	64(%rdi),%r8
+
+
+
+
+
+
+	movq	$3,%rax
+	movq	$1,%r10
+	cmpq	$4,%rdx
+	cmovaeq	%r10,%rax
+	testq	%r8,%r8
+	cmovnsq	%r10,%rax
+
+	andq	%rdx,%rax
+	jz	.Lblocks_vpmadd52_4x
+
+	subq	%rax,%rdx
+	movl	$7,%r10d
+	movl	$1,%r11d
+	kmovw	%r10d,%k7
+	leaq	.L2_44_inp_permd(%rip),%r10
+	kmovw	%r11d,%k1
+
+	vmovq	%rcx,%xmm21
+	vmovdqa64	0(%r10),%ymm19
+	vmovdqa64	32(%r10),%ymm20
+	vpermq	$0xcf,%ymm21,%ymm21
+	vmovdqa64	64(%r10),%ymm22
+
+	vmovdqu64	0(%rdi),%ymm16{%k7}{z}
+	vmovdqu64	40(%rdi),%ymm3{%k7}{z}
+	vmovdqu64	32(%rdi),%ymm4{%k7}{z}
+	vmovdqu64	24(%rdi),%ymm5{%k7}{z}
+
+	vmovdqa64	96(%r10),%ymm23
+	vmovdqa64	128(%r10),%ymm24
+
+	jmp	.Loop_vpmadd52
+
+.align	32
+.Loop_vpmadd52:
+	vmovdqu32	0(%rsi),%xmm18
+	leaq	16(%rsi),%rsi
+
+	vpermd	%ymm18,%ymm19,%ymm18
+	vpsrlvq	%ymm20,%ymm18,%ymm18
+	vpandq	%ymm22,%ymm18,%ymm18
+	vporq	%ymm21,%ymm18,%ymm18
+
+	vpaddq	%ymm18,%ymm16,%ymm16
+
+	vpermq	$0,%ymm16,%ymm0{%k7}{z}
+	vpermq	$85,%ymm16,%ymm1{%k7}{z}
+	vpermq	$170,%ymm16,%ymm2{%k7}{z}
+
+	vpxord	%ymm16,%ymm16,%ymm16
+	vpxord	%ymm17,%ymm17,%ymm17
+
+	vpmadd52luq	%ymm3,%ymm0,%ymm16
+	vpmadd52huq	%ymm3,%ymm0,%ymm17
+
+	vpmadd52luq	%ymm4,%ymm1,%ymm16
+	vpmadd52huq	%ymm4,%ymm1,%ymm17
+
+	vpmadd52luq	%ymm5,%ymm2,%ymm16
+	vpmadd52huq	%ymm5,%ymm2,%ymm17
+
+	vpsrlvq	%ymm23,%ymm16,%ymm18
+	vpsllvq	%ymm24,%ymm17,%ymm17
+	vpandq	%ymm22,%ymm16,%ymm16
+
+	vpaddq	%ymm18,%ymm17,%ymm17
+
+	vpermq	$147,%ymm17,%ymm17
+
+	vpaddq	%ymm17,%ymm16,%ymm16
+
+	vpsrlvq	%ymm23,%ymm16,%ymm18
+	vpandq	%ymm22,%ymm16,%ymm16
+
+	vpermq	$147,%ymm18,%ymm18
+
+	vpaddq	%ymm18,%ymm16,%ymm16
+
+	vpermq	$147,%ymm16,%ymm18{%k1}{z}
+
+	vpaddq	%ymm18,%ymm16,%ymm16
+	vpsllq	$2,%ymm18,%ymm18
+
+	vpaddq	%ymm18,%ymm16,%ymm16
+
+	decq	%rax
+	jnz	.Loop_vpmadd52
+
+	vmovdqu64	%ymm16,0(%rdi){%k7}
+
+	testq	%rdx,%rdx
+	jnz	.Lblocks_vpmadd52_4x
+
+.Lno_data_vpmadd52:
+	.byte	0xf3,0xc3
+.size	poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
+.type	poly1305_blocks_vpmadd52_4x,@function
+.align	32
+poly1305_blocks_vpmadd52_4x:
+	shrq	$4,%rdx
+	jz	.Lno_data_vpmadd52_4x
+
+	shlq	$40,%rcx
+	movq	64(%rdi),%r8
+
+.Lblocks_vpmadd52_4x:
+	vpbroadcastq	%rcx,%ymm31
+
+	vmovdqa64	.Lx_mask44(%rip),%ymm28
+	movl	$5,%eax
+	vmovdqa64	.Lx_mask42(%rip),%ymm29
+	kmovw	%eax,%k1
+
+	testq	%r8,%r8
+	js	.Linit_vpmadd52
+
+	vmovq	0(%rdi),%xmm0
+	vmovq	8(%rdi),%xmm1
+	vmovq	16(%rdi),%xmm2
+
+	testq	$3,%rdx
+	jnz	.Lblocks_vpmadd52_2x_do
+
+.Lblocks_vpmadd52_4x_do:
+	vpbroadcastq	64(%rdi),%ymm3
+	vpbroadcastq	96(%rdi),%ymm4
+	vpbroadcastq	128(%rdi),%ymm5
+	vpbroadcastq	160(%rdi),%ymm16
+
+.Lblocks_vpmadd52_4x_key_loaded:
+	vpsllq	$2,%ymm5,%ymm17
+	vpaddq	%ymm5,%ymm17,%ymm17
+	vpsllq	$2,%ymm17,%ymm17
+
+	testq	$7,%rdx
+	jz	.Lblocks_vpmadd52_8x
+
+	vmovdqu64	0(%rsi),%ymm26
+	vmovdqu64	32(%rsi),%ymm27
+	leaq	64(%rsi),%rsi
+
+	vpunpcklqdq	%ymm27,%ymm26,%ymm25
+	vpunpckhqdq	%ymm27,%ymm26,%ymm27
+
+
+
+	vpsrlq	$24,%ymm27,%ymm26
+	vporq	%ymm31,%ymm26,%ymm26
+	vpaddq	%ymm26,%ymm2,%ymm2
+	vpandq	%ymm28,%ymm25,%ymm24
+	vpsrlq	$44,%ymm25,%ymm25
+	vpsllq	$20,%ymm27,%ymm27
+	vporq	%ymm27,%ymm25,%ymm25
+	vpandq	%ymm28,%ymm25,%ymm25
+
+	subq	$4,%rdx
+	jz	.Ltail_vpmadd52_4x
+	jmp	.Loop_vpmadd52_4x
+	ud2
+
+.align	32
+.Linit_vpmadd52:
+	vmovq	24(%rdi),%xmm16
+	vmovq	56(%rdi),%xmm2
+	vmovq	32(%rdi),%xmm17
+	vmovq	40(%rdi),%xmm3
+	vmovq	48(%rdi),%xmm4
+
+	vmovdqa	%ymm3,%ymm0
+	vmovdqa	%ymm4,%ymm1
+	vmovdqa	%ymm2,%ymm5
+
+	movl	$2,%eax
+
+.Lmul_init_vpmadd52:
+	vpxorq	%ymm18,%ymm18,%ymm18
+	vpmadd52luq	%ymm2,%ymm16,%ymm18
+	vpxorq	%ymm19,%ymm19,%ymm19
+	vpmadd52huq	%ymm2,%ymm16,%ymm19
+	vpxorq	%ymm20,%ymm20,%ymm20
+	vpmadd52luq	%ymm2,%ymm17,%ymm20
+	vpxorq	%ymm21,%ymm21,%ymm21
+	vpmadd52huq	%ymm2,%ymm17,%ymm21
+	vpxorq	%ymm22,%ymm22,%ymm22
+	vpmadd52luq	%ymm2,%ymm3,%ymm22
+	vpxorq	%ymm23,%ymm23,%ymm23
+	vpmadd52huq	%ymm2,%ymm3,%ymm23
+
+	vpmadd52luq	%ymm0,%ymm3,%ymm18
+	vpmadd52huq	%ymm0,%ymm3,%ymm19
+	vpmadd52luq	%ymm0,%ymm4,%ymm20
+	vpmadd52huq	%ymm0,%ymm4,%ymm21
+	vpmadd52luq	%ymm0,%ymm5,%ymm22
+	vpmadd52huq	%ymm0,%ymm5,%ymm23
+
+	vpmadd52luq	%ymm1,%ymm17,%ymm18
+	vpmadd52huq	%ymm1,%ymm17,%ymm19
+	vpmadd52luq	%ymm1,%ymm3,%ymm20
+	vpmadd52huq	%ymm1,%ymm3,%ymm21
+	vpmadd52luq	%ymm1,%ymm4,%ymm22
+	vpmadd52huq	%ymm1,%ymm4,%ymm23
+
+
+
+	vpsrlq	$44,%ymm18,%ymm30
+	vpsllq	$8,%ymm19,%ymm19
+	vpandq	%ymm28,%ymm18,%ymm0
+	vpaddq	%ymm30,%ymm19,%ymm19
+
+	vpaddq	%ymm19,%ymm20,%ymm20
+
+	vpsrlq	$44,%ymm20,%ymm30
+	vpsllq	$8,%ymm21,%ymm21
+	vpandq	%ymm28,%ymm20,%ymm1
+	vpaddq	%ymm30,%ymm21,%ymm21
+
+	vpaddq	%ymm21,%ymm22,%ymm22
+
+	vpsrlq	$42,%ymm22,%ymm30
+	vpsllq	$10,%ymm23,%ymm23
+	vpandq	%ymm29,%ymm22,%ymm2
+	vpaddq	%ymm30,%ymm23,%ymm23
+
+	vpaddq	%ymm23,%ymm0,%ymm0
+	vpsllq	$2,%ymm23,%ymm23
+
+	vpaddq	%ymm23,%ymm0,%ymm0
+
+	vpsrlq	$44,%ymm0,%ymm30
+	vpandq	%ymm28,%ymm0,%ymm0
+
+	vpaddq	%ymm30,%ymm1,%ymm1
+
+	decl	%eax
+	jz	.Ldone_init_vpmadd52
+
+	vpunpcklqdq	%ymm4,%ymm1,%ymm4
+	vpbroadcastq	%xmm1,%xmm1
+	vpunpcklqdq	%ymm5,%ymm2,%ymm5
+	vpbroadcastq	%xmm2,%xmm2
+	vpunpcklqdq	%ymm3,%ymm0,%ymm3
+	vpbroadcastq	%xmm0,%xmm0
+
+	vpsllq	$2,%ymm4,%ymm16
+	vpsllq	$2,%ymm5,%ymm17
+	vpaddq	%ymm4,%ymm16,%ymm16
+	vpaddq	%ymm5,%ymm17,%ymm17
+	vpsllq	$2,%ymm16,%ymm16
+	vpsllq	$2,%ymm17,%ymm17
+
+	jmp	.Lmul_init_vpmadd52
+	ud2
+
+.align	32
+.Ldone_init_vpmadd52:
+	vinserti128	$1,%xmm4,%ymm1,%ymm4
+	vinserti128	$1,%xmm5,%ymm2,%ymm5
+	vinserti128	$1,%xmm3,%ymm0,%ymm3
+
+	vpermq	$216,%ymm4,%ymm4
+	vpermq	$216,%ymm5,%ymm5
+	vpermq	$216,%ymm3,%ymm3
+
+	vpsllq	$2,%ymm4,%ymm16
+	vpaddq	%ymm4,%ymm16,%ymm16
+	vpsllq	$2,%ymm16,%ymm16
+
+	vmovq	0(%rdi),%xmm0
+	vmovq	8(%rdi),%xmm1
+	vmovq	16(%rdi),%xmm2
+
+	testq	$3,%rdx
+	jnz	.Ldone_init_vpmadd52_2x
+
+	vmovdqu64	%ymm3,64(%rdi)
+	vpbroadcastq	%xmm3,%ymm3
+	vmovdqu64	%ymm4,96(%rdi)
+	vpbroadcastq	%xmm4,%ymm4
+	vmovdqu64	%ymm5,128(%rdi)
+	vpbroadcastq	%xmm5,%ymm5
+	vmovdqu64	%ymm16,160(%rdi)
+	vpbroadcastq	%xmm16,%ymm16
+
+	jmp	.Lblocks_vpmadd52_4x_key_loaded
+	ud2
+
+.align	32
+.Ldone_init_vpmadd52_2x:
+	vmovdqu64	%ymm3,64(%rdi)
+	vpsrldq	$8,%ymm3,%ymm3
+	vmovdqu64	%ymm4,96(%rdi)
+	vpsrldq	$8,%ymm4,%ymm4
+	vmovdqu64	%ymm5,128(%rdi)
+	vpsrldq	$8,%ymm5,%ymm5
+	vmovdqu64	%ymm16,160(%rdi)
+	vpsrldq	$8,%ymm16,%ymm16
+	jmp	.Lblocks_vpmadd52_2x_key_loaded
+	ud2
+
+.align	32
+.Lblocks_vpmadd52_2x_do:
+	vmovdqu64	128+8(%rdi),%ymm5{%k1}{z}
+	vmovdqu64	160+8(%rdi),%ymm16{%k1}{z}
+	vmovdqu64	64+8(%rdi),%ymm3{%k1}{z}
+	vmovdqu64	96+8(%rdi),%ymm4{%k1}{z}
+
+.Lblocks_vpmadd52_2x_key_loaded:
+	vmovdqu64	0(%rsi),%ymm26
+	vpxorq	%ymm27,%ymm27,%ymm27
+	leaq	32(%rsi),%rsi
+
+	vpunpcklqdq	%ymm27,%ymm26,%ymm25
+	vpunpckhqdq	%ymm27,%ymm26,%ymm27
+
+
+
+	vpsrlq	$24,%ymm27,%ymm26
+	vporq	%ymm31,%ymm26,%ymm26
+	vpaddq	%ymm26,%ymm2,%ymm2
+	vpandq	%ymm28,%ymm25,%ymm24
+	vpsrlq	$44,%ymm25,%ymm25
+	vpsllq	$20,%ymm27,%ymm27
+	vporq	%ymm27,%ymm25,%ymm25
+	vpandq	%ymm28,%ymm25,%ymm25
+
+	jmp	.Ltail_vpmadd52_2x
+	ud2
+
+.align	32
+.Loop_vpmadd52_4x:
+
+	vpaddq	%ymm24,%ymm0,%ymm0
+	vpaddq	%ymm25,%ymm1,%ymm1
+
+	vpxorq	%ymm18,%ymm18,%ymm18
+	vpmadd52luq	%ymm2,%ymm16,%ymm18
+	vpxorq	%ymm19,%ymm19,%ymm19
+	vpmadd52huq	%ymm2,%ymm16,%ymm19
+	vpxorq	%ymm20,%ymm20,%ymm20
+	vpmadd52luq	%ymm2,%ymm17,%ymm20
+	vpxorq	%ymm21,%ymm21,%ymm21
+	vpmadd52huq	%ymm2,%ymm17,%ymm21
+	vpxorq	%ymm22,%ymm22,%ymm22
+	vpmadd52luq	%ymm2,%ymm3,%ymm22
+	vpxorq	%ymm23,%ymm23,%ymm23
+	vpmadd52huq	%ymm2,%ymm3,%ymm23
+
+	vmovdqu64	0(%rsi),%ymm26
+	vmovdqu64	32(%rsi),%ymm27
+	leaq	64(%rsi),%rsi
+	vpmadd52luq	%ymm0,%ymm3,%ymm18
+	vpmadd52huq	%ymm0,%ymm3,%ymm19
+	vpmadd52luq	%ymm0,%ymm4,%ymm20
+	vpmadd52huq	%ymm0,%ymm4,%ymm21
+	vpmadd52luq	%ymm0,%ymm5,%ymm22
+	vpmadd52huq	%ymm0,%ymm5,%ymm23
+
+	vpunpcklqdq	%ymm27,%ymm26,%ymm25
+	vpunpckhqdq	%ymm27,%ymm26,%ymm27
+	vpmadd52luq	%ymm1,%ymm17,%ymm18
+	vpmadd52huq	%ymm1,%ymm17,%ymm19
+	vpmadd52luq	%ymm1,%ymm3,%ymm20
+	vpmadd52huq	%ymm1,%ymm3,%ymm21
+	vpmadd52luq	%ymm1,%ymm4,%ymm22
+	vpmadd52huq	%ymm1,%ymm4,%ymm23
+
+
+
+	vpsrlq	$44,%ymm18,%ymm30
+	vpsllq	$8,%ymm19,%ymm19
+	vpandq	%ymm28,%ymm18,%ymm0
+	vpaddq	%ymm30,%ymm19,%ymm19
+
+	vpsrlq	$24,%ymm27,%ymm26
+	vporq	%ymm31,%ymm26,%ymm26
+	vpaddq	%ymm19,%ymm20,%ymm20
+
+	vpsrlq	$44,%ymm20,%ymm30
+	vpsllq	$8,%ymm21,%ymm21
+	vpandq	%ymm28,%ymm20,%ymm1
+	vpaddq	%ymm30,%ymm21,%ymm21
+
+	vpandq	%ymm28,%ymm25,%ymm24
+	vpsrlq	$44,%ymm25,%ymm25
+	vpsllq	$20,%ymm27,%ymm27
+	vpaddq	%ymm21,%ymm22,%ymm22
+
+	vpsrlq	$42,%ymm22,%ymm30
+	vpsllq	$10,%ymm23,%ymm23
+	vpandq	%ymm29,%ymm22,%ymm2
+	vpaddq	%ymm30,%ymm23,%ymm23
+
+	vpaddq	%ymm26,%ymm2,%ymm2
+	vpaddq	%ymm23,%ymm0,%ymm0
+	vpsllq	$2,%ymm23,%ymm23
+
+	vpaddq	%ymm23,%ymm0,%ymm0
+	vporq	%ymm27,%ymm25,%ymm25
+	vpandq	%ymm28,%ymm25,%ymm25
+
+	vpsrlq	$44,%ymm0,%ymm30
+	vpandq	%ymm28,%ymm0,%ymm0
+
+	vpaddq	%ymm30,%ymm1,%ymm1
+
+	subq	$4,%rdx
+	jnz	.Loop_vpmadd52_4x
+
+.Ltail_vpmadd52_4x:
+	vmovdqu64	128(%rdi),%ymm5
+	vmovdqu64	160(%rdi),%ymm16
+	vmovdqu64	64(%rdi),%ymm3
+	vmovdqu64	96(%rdi),%ymm4
+
+.Ltail_vpmadd52_2x:
+	vpsllq	$2,%ymm5,%ymm17
+	vpaddq	%ymm5,%ymm17,%ymm17
+	vpsllq	$2,%ymm17,%ymm17
+
+
+	vpaddq	%ymm24,%ymm0,%ymm0
+	vpaddq	%ymm25,%ymm1,%ymm1
+
+	vpxorq	%ymm18,%ymm18,%ymm18
+	vpmadd52luq	%ymm2,%ymm16,%ymm18
+	vpxorq	%ymm19,%ymm19,%ymm19
+	vpmadd52huq	%ymm2,%ymm16,%ymm19
+	vpxorq	%ymm20,%ymm20,%ymm20
+	vpmadd52luq	%ymm2,%ymm17,%ymm20
+	vpxorq	%ymm21,%ymm21,%ymm21
+	vpmadd52huq	%ymm2,%ymm17,%ymm21
+	vpxorq	%ymm22,%ymm22,%ymm22
+	vpmadd52luq	%ymm2,%ymm3,%ymm22
+	vpxorq	%ymm23,%ymm23,%ymm23
+	vpmadd52huq	%ymm2,%ymm3,%ymm23
+
+	vpmadd52luq	%ymm0,%ymm3,%ymm18
+	vpmadd52huq	%ymm0,%ymm3,%ymm19
+	vpmadd52luq	%ymm0,%ymm4,%ymm20
+	vpmadd52huq	%ymm0,%ymm4,%ymm21
+	vpmadd52luq	%ymm0,%ymm5,%ymm22
+	vpmadd52huq	%ymm0,%ymm5,%ymm23
+
+	vpmadd52luq	%ymm1,%ymm17,%ymm18
+	vpmadd52huq	%ymm1,%ymm17,%ymm19
+	vpmadd52luq	%ymm1,%ymm3,%ymm20
+	vpmadd52huq	%ymm1,%ymm3,%ymm21
+	vpmadd52luq	%ymm1,%ymm4,%ymm22
+	vpmadd52huq	%ymm1,%ymm4,%ymm23
+
+
+
+
+	movl	$1,%eax
+	kmovw	%eax,%k1
+	vpsrldq	$8,%ymm18,%ymm24
+	vpsrldq	$8,%ymm19,%ymm0
+	vpsrldq	$8,%ymm20,%ymm25
+	vpsrldq	$8,%ymm21,%ymm1
+	vpaddq	%ymm24,%ymm18,%ymm18
+	vpaddq	%ymm0,%ymm19,%ymm19
+	vpsrldq	$8,%ymm22,%ymm26
+	vpsrldq	$8,%ymm23,%ymm2
+	vpaddq	%ymm25,%ymm20,%ymm20
+	vpaddq	%ymm1,%ymm21,%ymm21
+	vpermq	$0x2,%ymm18,%ymm24
+	vpermq	$0x2,%ymm19,%ymm0
+	vpaddq	%ymm26,%ymm22,%ymm22
+	vpaddq	%ymm2,%ymm23,%ymm23
+
+	vpermq	$0x2,%ymm20,%ymm25
+	vpermq	$0x2,%ymm21,%ymm1
+	vpaddq	%ymm24,%ymm18,%ymm18{%k1}{z}
+	vpaddq	%ymm0,%ymm19,%ymm19{%k1}{z}
+	vpermq	$0x2,%ymm22,%ymm26
+	vpermq	$0x2,%ymm23,%ymm2
+	vpaddq	%ymm25,%ymm20,%ymm20{%k1}{z}
+	vpaddq	%ymm1,%ymm21,%ymm21{%k1}{z}
+	vpaddq	%ymm26,%ymm22,%ymm22{%k1}{z}
+	vpaddq	%ymm2,%ymm23,%ymm23{%k1}{z}
+
+
+
+	vpsrlq	$44,%ymm18,%ymm30
+	vpsllq	$8,%ymm19,%ymm19
+	vpandq	%ymm28,%ymm18,%ymm0
+	vpaddq	%ymm30,%ymm19,%ymm19
+
+	vpaddq	%ymm19,%ymm20,%ymm20
+
+	vpsrlq	$44,%ymm20,%ymm30
+	vpsllq	$8,%ymm21,%ymm21
+	vpandq	%ymm28,%ymm20,%ymm1
+	vpaddq	%ymm30,%ymm21,%ymm21
+
+	vpaddq	%ymm21,%ymm22,%ymm22
+
+	vpsrlq	$42,%ymm22,%ymm30
+	vpsllq	$10,%ymm23,%ymm23
+	vpandq	%ymm29,%ymm22,%ymm2
+	vpaddq	%ymm30,%ymm23,%ymm23
+
+	vpaddq	%ymm23,%ymm0,%ymm0
+	vpsllq	$2,%ymm23,%ymm23
+
+	vpaddq	%ymm23,%ymm0,%ymm0
+
+	vpsrlq	$44,%ymm0,%ymm30
+	vpandq	%ymm28,%ymm0,%ymm0
+
+	vpaddq	%ymm30,%ymm1,%ymm1
+
+
+	subq	$2,%rdx
+	ja	.Lblocks_vpmadd52_4x_do
+
+	vmovq	%xmm0,0(%rdi)
+	vmovq	%xmm1,8(%rdi)
+	vmovq	%xmm2,16(%rdi)
+	vzeroall
+
+.Lno_data_vpmadd52_4x:
+	.byte	0xf3,0xc3
+.size	poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
+.type	poly1305_blocks_vpmadd52_8x,@function
+.align	32
+poly1305_blocks_vpmadd52_8x:
+	shrq	$4,%rdx
+	jz	.Lno_data_vpmadd52_8x
+
+	shlq	$40,%rcx
+	movq	64(%rdi),%r8
+
+	vmovdqa64	.Lx_mask44(%rip),%ymm28
+	vmovdqa64	.Lx_mask42(%rip),%ymm29
+
+	testq	%r8,%r8
+	js	.Linit_vpmadd52
+
+	vmovq	0(%rdi),%xmm0
+	vmovq	8(%rdi),%xmm1
+	vmovq	16(%rdi),%xmm2
+
+.Lblocks_vpmadd52_8x:
+
+
+
+	vmovdqu64	128(%rdi),%ymm5
+	vmovdqu64	160(%rdi),%ymm16
+	vmovdqu64	64(%rdi),%ymm3
+	vmovdqu64	96(%rdi),%ymm4
+
+	vpsllq	$2,%ymm5,%ymm17
+	vpaddq	%ymm5,%ymm17,%ymm17
+	vpsllq	$2,%ymm17,%ymm17
+
+	vpbroadcastq	%xmm5,%ymm8
+	vpbroadcastq	%xmm3,%ymm6
+	vpbroadcastq	%xmm4,%ymm7
+
+	vpxorq	%ymm18,%ymm18,%ymm18
+	vpmadd52luq	%ymm8,%ymm16,%ymm18
+	vpxorq	%ymm19,%ymm19,%ymm19
+	vpmadd52huq	%ymm8,%ymm16,%ymm19
+	vpxorq	%ymm20,%ymm20,%ymm20
+	vpmadd52luq	%ymm8,%ymm17,%ymm20
+	vpxorq	%ymm21,%ymm21,%ymm21
+	vpmadd52huq	%ymm8,%ymm17,%ymm21
+	vpxorq	%ymm22,%ymm22,%ymm22
+	vpmadd52luq	%ymm8,%ymm3,%ymm22
+	vpxorq	%ymm23,%ymm23,%ymm23
+	vpmadd52huq	%ymm8,%ymm3,%ymm23
+
+	vpmadd52luq	%ymm6,%ymm3,%ymm18
+	vpmadd52huq	%ymm6,%ymm3,%ymm19
+	vpmadd52luq	%ymm6,%ymm4,%ymm20
+	vpmadd52huq	%ymm6,%ymm4,%ymm21
+	vpmadd52luq	%ymm6,%ymm5,%ymm22
+	vpmadd52huq	%ymm6,%ymm5,%ymm23
+
+	vpmadd52luq	%ymm7,%ymm17,%ymm18
+	vpmadd52huq	%ymm7,%ymm17,%ymm19
+	vpmadd52luq	%ymm7,%ymm3,%ymm20
+	vpmadd52huq	%ymm7,%ymm3,%ymm21
+	vpmadd52luq	%ymm7,%ymm4,%ymm22
+	vpmadd52huq	%ymm7,%ymm4,%ymm23
+
+
+
+	vpsrlq	$44,%ymm18,%ymm30
+	vpsllq	$8,%ymm19,%ymm19
+	vpandq	%ymm28,%ymm18,%ymm6
+	vpaddq	%ymm30,%ymm19,%ymm19
+
+	vpaddq	%ymm19,%ymm20,%ymm20
+
+	vpsrlq	$44,%ymm20,%ymm30
+	vpsllq	$8,%ymm21,%ymm21
+	vpandq	%ymm28,%ymm20,%ymm7
+	vpaddq	%ymm30,%ymm21,%ymm21
+
+	vpaddq	%ymm21,%ymm22,%ymm22
+
+	vpsrlq	$42,%ymm22,%ymm30
+	vpsllq	$10,%ymm23,%ymm23
+	vpandq	%ymm29,%ymm22,%ymm8
+	vpaddq	%ymm30,%ymm23,%ymm23
+
+	vpaddq	%ymm23,%ymm6,%ymm6
+	vpsllq	$2,%ymm23,%ymm23
+
+	vpaddq	%ymm23,%ymm6,%ymm6
+
+	vpsrlq	$44,%ymm6,%ymm30
+	vpandq	%ymm28,%ymm6,%ymm6
+
+	vpaddq	%ymm30,%ymm7,%ymm7
+
+
+
+
+
+	vpunpcklqdq	%ymm5,%ymm8,%ymm26
+	vpunpckhqdq	%ymm5,%ymm8,%ymm5
+	vpunpcklqdq	%ymm3,%ymm6,%ymm24
+	vpunpckhqdq	%ymm3,%ymm6,%ymm3
+	vpunpcklqdq	%ymm4,%ymm7,%ymm25
+	vpunpckhqdq	%ymm4,%ymm7,%ymm4
+	vshufi64x2	$0x44,%zmm5,%zmm26,%zmm8
+	vshufi64x2	$0x44,%zmm3,%zmm24,%zmm6
+	vshufi64x2	$0x44,%zmm4,%zmm25,%zmm7
+
+	vmovdqu64	0(%rsi),%zmm26
+	vmovdqu64	64(%rsi),%zmm27
+	leaq	128(%rsi),%rsi
+
+	vpsllq	$2,%zmm8,%zmm10
+	vpsllq	$2,%zmm7,%zmm9
+	vpaddq	%zmm8,%zmm10,%zmm10
+	vpaddq	%zmm7,%zmm9,%zmm9
+	vpsllq	$2,%zmm10,%zmm10
+	vpsllq	$2,%zmm9,%zmm9
+
+	vpbroadcastq	%rcx,%zmm31
+	vpbroadcastq	%xmm28,%zmm28
+	vpbroadcastq	%xmm29,%zmm29
+
+	vpbroadcastq	%xmm9,%zmm16
+	vpbroadcastq	%xmm10,%zmm17
+	vpbroadcastq	%xmm6,%zmm3
+	vpbroadcastq	%xmm7,%zmm4
+	vpbroadcastq	%xmm8,%zmm5
+
+	vpunpcklqdq	%zmm27,%zmm26,%zmm25
+	vpunpckhqdq	%zmm27,%zmm26,%zmm27
+
+
+
+	vpsrlq	$24,%zmm27,%zmm26
+	vporq	%zmm31,%zmm26,%zmm26
+	vpaddq	%zmm26,%zmm2,%zmm2
+	vpandq	%zmm28,%zmm25,%zmm24
+	vpsrlq	$44,%zmm25,%zmm25
+	vpsllq	$20,%zmm27,%zmm27
+	vporq	%zmm27,%zmm25,%zmm25
+	vpandq	%zmm28,%zmm25,%zmm25
+
+	subq	$8,%rdx
+	jz	.Ltail_vpmadd52_8x
+	jmp	.Loop_vpmadd52_8x
+
+.align	32
+.Loop_vpmadd52_8x:
+
+	vpaddq	%zmm24,%zmm0,%zmm0
+	vpaddq	%zmm25,%zmm1,%zmm1
+
+	vpxorq	%zmm18,%zmm18,%zmm18
+	vpmadd52luq	%zmm2,%zmm16,%zmm18
+	vpxorq	%zmm19,%zmm19,%zmm19
+	vpmadd52huq	%zmm2,%zmm16,%zmm19
+	vpxorq	%zmm20,%zmm20,%zmm20
+	vpmadd52luq	%zmm2,%zmm17,%zmm20
+	vpxorq	%zmm21,%zmm21,%zmm21
+	vpmadd52huq	%zmm2,%zmm17,%zmm21
+	vpxorq	%zmm22,%zmm22,%zmm22
+	vpmadd52luq	%zmm2,%zmm3,%zmm22
+	vpxorq	%zmm23,%zmm23,%zmm23
+	vpmadd52huq	%zmm2,%zmm3,%zmm23
+
+	vmovdqu64	0(%rsi),%zmm26
+	vmovdqu64	64(%rsi),%zmm27
+	leaq	128(%rsi),%rsi
+	vpmadd52luq	%zmm0,%zmm3,%zmm18
+	vpmadd52huq	%zmm0,%zmm3,%zmm19
+	vpmadd52luq	%zmm0,%zmm4,%zmm20
+	vpmadd52huq	%zmm0,%zmm4,%zmm21
+	vpmadd52luq	%zmm0,%zmm5,%zmm22
+	vpmadd52huq	%zmm0,%zmm5,%zmm23
+
+	vpunpcklqdq	%zmm27,%zmm26,%zmm25
+	vpunpckhqdq	%zmm27,%zmm26,%zmm27
+	vpmadd52luq	%zmm1,%zmm17,%zmm18
+	vpmadd52huq	%zmm1,%zmm17,%zmm19
+	vpmadd52luq	%zmm1,%zmm3,%zmm20
+	vpmadd52huq	%zmm1,%zmm3,%zmm21
+	vpmadd52luq	%zmm1,%zmm4,%zmm22
+	vpmadd52huq	%zmm1,%zmm4,%zmm23
+
+
+
+	vpsrlq	$44,%zmm18,%zmm30
+	vpsllq	$8,%zmm19,%zmm19
+	vpandq	%zmm28,%zmm18,%zmm0
+	vpaddq	%zmm30,%zmm19,%zmm19
+
+	vpsrlq	$24,%zmm27,%zmm26
+	vporq	%zmm31,%zmm26,%zmm26
+	vpaddq	%zmm19,%zmm20,%zmm20
+
+	vpsrlq	$44,%zmm20,%zmm30
+	vpsllq	$8,%zmm21,%zmm21
+	vpandq	%zmm28,%zmm20,%zmm1
+	vpaddq	%zmm30,%zmm21,%zmm21
+
+	vpandq	%zmm28,%zmm25,%zmm24
+	vpsrlq	$44,%zmm25,%zmm25
+	vpsllq	$20,%zmm27,%zmm27
+	vpaddq	%zmm21,%zmm22,%zmm22
+
+	vpsrlq	$42,%zmm22,%zmm30
+	vpsllq	$10,%zmm23,%zmm23
+	vpandq	%zmm29,%zmm22,%zmm2
+	vpaddq	%zmm30,%zmm23,%zmm23
+
+	vpaddq	%zmm26,%zmm2,%zmm2
+	vpaddq	%zmm23,%zmm0,%zmm0
+	vpsllq	$2,%zmm23,%zmm23
+
+	vpaddq	%zmm23,%zmm0,%zmm0
+	vporq	%zmm27,%zmm25,%zmm25
+	vpandq	%zmm28,%zmm25,%zmm25
+
+	vpsrlq	$44,%zmm0,%zmm30
+	vpandq	%zmm28,%zmm0,%zmm0
+
+	vpaddq	%zmm30,%zmm1,%zmm1
+
+	subq	$8,%rdx
+	jnz	.Loop_vpmadd52_8x
+
+.Ltail_vpmadd52_8x:
+
+	vpaddq	%zmm24,%zmm0,%zmm0
+	vpaddq	%zmm25,%zmm1,%zmm1
+
+	vpxorq	%zmm18,%zmm18,%zmm18
+	vpmadd52luq	%zmm2,%zmm9,%zmm18
+	vpxorq	%zmm19,%zmm19,%zmm19
+	vpmadd52huq	%zmm2,%zmm9,%zmm19
+	vpxorq	%zmm20,%zmm20,%zmm20
+	vpmadd52luq	%zmm2,%zmm10,%zmm20
+	vpxorq	%zmm21,%zmm21,%zmm21
+	vpmadd52huq	%zmm2,%zmm10,%zmm21
+	vpxorq	%zmm22,%zmm22,%zmm22
+	vpmadd52luq	%zmm2,%zmm6,%zmm22
+	vpxorq	%zmm23,%zmm23,%zmm23
+	vpmadd52huq	%zmm2,%zmm6,%zmm23
+
+	vpmadd52luq	%zmm0,%zmm6,%zmm18
+	vpmadd52huq	%zmm0,%zmm6,%zmm19
+	vpmadd52luq	%zmm0,%zmm7,%zmm20
+	vpmadd52huq	%zmm0,%zmm7,%zmm21
+	vpmadd52luq	%zmm0,%zmm8,%zmm22
+	vpmadd52huq	%zmm0,%zmm8,%zmm23
+
+	vpmadd52luq	%zmm1,%zmm10,%zmm18
+	vpmadd52huq	%zmm1,%zmm10,%zmm19
+	vpmadd52luq	%zmm1,%zmm6,%zmm20
+	vpmadd52huq	%zmm1,%zmm6,%zmm21
+	vpmadd52luq	%zmm1,%zmm7,%zmm22
+	vpmadd52huq	%zmm1,%zmm7,%zmm23
+
+
+
+
+	movl	$1,%eax
+	kmovw	%eax,%k1
+	vpsrldq	$8,%zmm18,%zmm24
+	vpsrldq	$8,%zmm19,%zmm0
+	vpsrldq	$8,%zmm20,%zmm25
+	vpsrldq	$8,%zmm21,%zmm1
+	vpaddq	%zmm24,%zmm18,%zmm18
+	vpaddq	%zmm0,%zmm19,%zmm19
+	vpsrldq	$8,%zmm22,%zmm26
+	vpsrldq	$8,%zmm23,%zmm2
+	vpaddq	%zmm25,%zmm20,%zmm20
+	vpaddq	%zmm1,%zmm21,%zmm21
+	vpermq	$0x2,%zmm18,%zmm24
+	vpermq	$0x2,%zmm19,%zmm0
+	vpaddq	%zmm26,%zmm22,%zmm22
+	vpaddq	%zmm2,%zmm23,%zmm23
+
+	vpermq	$0x2,%zmm20,%zmm25
+	vpermq	$0x2,%zmm21,%zmm1
+	vpaddq	%zmm24,%zmm18,%zmm18
+	vpaddq	%zmm0,%zmm19,%zmm19
+	vpermq	$0x2,%zmm22,%zmm26
+	vpermq	$0x2,%zmm23,%zmm2
+	vpaddq	%zmm25,%zmm20,%zmm20
+	vpaddq	%zmm1,%zmm21,%zmm21
+	vextracti64x4	$1,%zmm18,%ymm24
+	vextracti64x4	$1,%zmm19,%ymm0
+	vpaddq	%zmm26,%zmm22,%zmm22
+	vpaddq	%zmm2,%zmm23,%zmm23
+
+	vextracti64x4	$1,%zmm20,%ymm25
+	vextracti64x4	$1,%zmm21,%ymm1
+	vextracti64x4	$1,%zmm22,%ymm26
+	vextracti64x4	$1,%zmm23,%ymm2
+	vpaddq	%ymm24,%ymm18,%ymm18{%k1}{z}
+	vpaddq	%ymm0,%ymm19,%ymm19{%k1}{z}
+	vpaddq	%ymm25,%ymm20,%ymm20{%k1}{z}
+	vpaddq	%ymm1,%ymm21,%ymm21{%k1}{z}
+	vpaddq	%ymm26,%ymm22,%ymm22{%k1}{z}
+	vpaddq	%ymm2,%ymm23,%ymm23{%k1}{z}
+
+
+
+	vpsrlq	$44,%ymm18,%ymm30
+	vpsllq	$8,%ymm19,%ymm19
+	vpandq	%ymm28,%ymm18,%ymm0
+	vpaddq	%ymm30,%ymm19,%ymm19
+
+	vpaddq	%ymm19,%ymm20,%ymm20
+
+	vpsrlq	$44,%ymm20,%ymm30
+	vpsllq	$8,%ymm21,%ymm21
+	vpandq	%ymm28,%ymm20,%ymm1
+	vpaddq	%ymm30,%ymm21,%ymm21
+
+	vpaddq	%ymm21,%ymm22,%ymm22
+
+	vpsrlq	$42,%ymm22,%ymm30
+	vpsllq	$10,%ymm23,%ymm23
+	vpandq	%ymm29,%ymm22,%ymm2
+	vpaddq	%ymm30,%ymm23,%ymm23
+
+	vpaddq	%ymm23,%ymm0,%ymm0
+	vpsllq	$2,%ymm23,%ymm23
+
+	vpaddq	%ymm23,%ymm0,%ymm0
+
+	vpsrlq	$44,%ymm0,%ymm30
+	vpandq	%ymm28,%ymm0,%ymm0
+
+	vpaddq	%ymm30,%ymm1,%ymm1
+
+
+
+	vmovq	%xmm0,0(%rdi)
+	vmovq	%xmm1,8(%rdi)
+	vmovq	%xmm2,16(%rdi)
+	vzeroall
+
+.Lno_data_vpmadd52_8x:
+	.byte	0xf3,0xc3
+.size	poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
+.type	poly1305_emit_base2_44,@function
+.align	32
+poly1305_emit_base2_44:
+	movq	0(%rdi),%r8
+	movq	8(%rdi),%r9
+	movq	16(%rdi),%r10
+
+	movq	%r9,%rax
+	shrq	$20,%r9
+	shlq	$44,%rax
+	movq	%r10,%rcx
+	shrq	$40,%r10
+	shlq	$24,%rcx
+
+	addq	%rax,%r8
+	adcq	%rcx,%r9
+	adcq	$0,%r10
+
+	movq	%r8,%rax
+	addq	$5,%r8
+	movq	%r9,%rcx
+	adcq	$0,%r9
+	adcq	$0,%r10
+	shrq	$2,%r10
+	cmovnzq	%r8,%rax
+	cmovnzq	%r9,%rcx
+
+	addq	0(%rdx),%rax
+	adcq	8(%rdx),%rcx
+	movq	%rax,0(%rsi)
+	movq	%rcx,8(%rsi)
+
+	.byte	0xf3,0xc3
+.size	poly1305_emit_base2_44,.-poly1305_emit_base2_44
+.align	64
+.Lconst:
+.Lmask24:
+.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
+.L129:
+.long	16777216,0,16777216,0,16777216,0,16777216,0
+.Lmask26:
+.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
+.Lpermd_avx2:
+.long	2,2,2,3,2,0,2,1
+.Lpermd_avx512:
+.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
+
+.L2_44_inp_permd:
+.long	0,1,1,2,2,3,7,7
+.L2_44_inp_shift:
+.quad	0,12,24,64
+.L2_44_mask:
+.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
+.L2_44_shift_rgt:
+.quad	44,44,42,64
+.L2_44_shift_lft:
+.quad	8,8,10,64
+
+.align	64
+.Lx_mask44:
+.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+.Lx_mask42:
+.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
+.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
+.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align	16
+.globl	xor128_encrypt_n_pad
+.type	xor128_encrypt_n_pad,@function
+.align	16
+xor128_encrypt_n_pad:
+	subq	%rdx,%rsi
+	subq	%rdx,%rdi
+	movq	%rcx,%r10
+	shrq	$4,%rcx
+	jz	.Ltail_enc
+	nop
+.Loop_enc_xmm:
+	movdqu	(%rsi,%rdx,1),%xmm0
+	pxor	(%rdx),%xmm0
+	movdqu	%xmm0,(%rdi,%rdx,1)
+	movdqa	%xmm0,(%rdx)
+	leaq	16(%rdx),%rdx
+	decq	%rcx
+	jnz	.Loop_enc_xmm
+
+	andq	$15,%r10
+	jz	.Ldone_enc
+
+.Ltail_enc:
+	movq	$16,%rcx
+	subq	%r10,%rcx
+	xorl	%eax,%eax
+.Loop_enc_byte:
+	movb	(%rsi,%rdx,1),%al
+	xorb	(%rdx),%al
+	movb	%al,(%rdi,%rdx,1)
+	movb	%al,(%rdx)
+	leaq	1(%rdx),%rdx
+	decq	%r10
+	jnz	.Loop_enc_byte
+
+	xorl	%eax,%eax
+.Loop_enc_pad:
+	movb	%al,(%rdx)
+	leaq	1(%rdx),%rdx
+	decq	%rcx
+	jnz	.Loop_enc_pad
+
+.Ldone_enc:
+	movq	%rdx,%rax
+	.byte	0xf3,0xc3
+.size	xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
+
+.globl	xor128_decrypt_n_pad
+.type	xor128_decrypt_n_pad,@function
+.align	16
+xor128_decrypt_n_pad:
+	subq	%rdx,%rsi
+	subq	%rdx,%rdi
+	movq	%rcx,%r10
+	shrq	$4,%rcx
+	jz	.Ltail_dec
+	nop
+.Loop_dec_xmm:
+	movdqu	(%rsi,%rdx,1),%xmm0
+	movdqa	(%rdx),%xmm1
+	pxor	%xmm0,%xmm1
+	movdqu	%xmm1,(%rdi,%rdx,1)
+	movdqa	%xmm0,(%rdx)
+	leaq	16(%rdx),%rdx
+	decq	%rcx
+	jnz	.Loop_dec_xmm
+
+	pxor	%xmm1,%xmm1
+	andq	$15,%r10
+	jz	.Ldone_dec
+
+.Ltail_dec:
+	movq	$16,%rcx
+	subq	%r10,%rcx
+	xorl	%eax,%eax
+	xorq	%r11,%r11
+.Loop_dec_byte:
+	movb	(%rsi,%rdx,1),%r11b
+	movb	(%rdx),%al
+	xorb	%r11b,%al
+	movb	%al,(%rdi,%rdx,1)
+	movb	%r11b,(%rdx)
+	leaq	1(%rdx),%rdx
+	decq	%r10
+	jnz	.Loop_dec_byte
+
+	xorl	%eax,%eax
+.Loop_dec_pad:
+	movb	%al,(%rdx)
+	leaq	1(%rdx),%rdx
+	decq	%rcx
+	jnz	.Loop_dec_pad
+
+.Ldone_dec:
+	movq	%rdx,%rax
+	.byte	0xf3,0xc3
+.size	xor128_decrypt_n_pad,.-xor128_decrypt_n_pad