diff mbox

[4/4] crypto: x86 - Add optimized MORUS implementations

Message ID 20180511121912.10219-5-omosnacek@gmail.com (mailing list archive)
State Accepted
Delegated to: Herbert Xu
Headers show

Commit Message

Ondrej Mosnáček May 11, 2018, 12:19 p.m. UTC
From: Ondrej Mosnacek <omosnacek@gmail.com>

This patch adds optimized implementations of MORUS-640 and MORUS-1280,
utilizing the SSE2 and AVX2 x86 extensions.

For MORUS-1280 (which operates on 256-bit blocks) we provide both AVX2
and SSE2 implementation. Although SSE2 MORUS-1280 is slower than AVX2
MORUS-1280, it is comparable in speed to the SSE2 MORUS-640.

Signed-off-by: Ondrej Mosnacek <omosnacek@gmail.com>
---
 arch/x86/crypto/Makefile              |  10 +
 arch/x86/crypto/morus1280-avx2-asm.S  | 621 ++++++++++++++++++
 arch/x86/crypto/morus1280-avx2-glue.c |  68 ++
 arch/x86/crypto/morus1280-sse2-asm.S  | 895 ++++++++++++++++++++++++++
 arch/x86/crypto/morus1280-sse2-glue.c |  68 ++
 arch/x86/crypto/morus640-sse2-asm.S   | 614 ++++++++++++++++++
 arch/x86/crypto/morus640-sse2-glue.c  |  68 ++
 crypto/Kconfig                        |  26 +
 8 files changed, 2370 insertions(+)
 create mode 100644 arch/x86/crypto/morus1280-avx2-asm.S
 create mode 100644 arch/x86/crypto/morus1280-avx2-glue.c
 create mode 100644 arch/x86/crypto/morus1280-sse2-asm.S
 create mode 100644 arch/x86/crypto/morus1280-sse2-glue.c
 create mode 100644 arch/x86/crypto/morus640-sse2-asm.S
 create mode 100644 arch/x86/crypto/morus640-sse2-glue.c
diff mbox

Patch

diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 5f07333bb224..71fefe9da8d9 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -38,6 +38,9 @@  obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
 obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o
 obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o
 
+obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o
+obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o
+
 # These modules require assembler to support AVX.
 ifeq ($(avx_supported),yes)
 	obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \
@@ -55,6 +58,8 @@  ifeq ($(avx2_supported),yes)
 	obj-$(CONFIG_CRYPTO_SHA1_MB) += sha1-mb/
 	obj-$(CONFIG_CRYPTO_SHA256_MB) += sha256-mb/
 	obj-$(CONFIG_CRYPTO_SHA512_MB) += sha512-mb/
+
+	obj-$(CONFIG_CRYPTO_MORUS1280_AVX2) += morus1280-avx2.o
 endif
 
 aes-i586-y := aes-i586-asm_32.o aes_glue.o
@@ -72,6 +77,9 @@  salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
 chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
 serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
 
+morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o
+morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o
+
 ifeq ($(avx_supported),yes)
 	camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
 					camellia_aesni_avx_glue.o
@@ -87,6 +95,8 @@  ifeq ($(avx2_supported),yes)
 	camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
 	chacha20-x86_64-y += chacha20-avx2-x86_64.o
 	serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
+
+	morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o
 endif
 
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
diff --git a/arch/x86/crypto/morus1280-avx2-asm.S b/arch/x86/crypto/morus1280-avx2-asm.S
new file mode 100644
index 000000000000..37d422e77931
--- /dev/null
+++ b/arch/x86/crypto/morus1280-avx2-asm.S
@@ -0,0 +1,621 @@ 
+/*
+ * AVX2 implementation of MORUS-1280
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define SHUFFLE_MASK(i0, i1, i2, i3) \
+	(i0 | (i1 << 2) | (i2 << 4) | (i3 << 6))
+
+#define MASK1 SHUFFLE_MASK(3, 0, 1, 2)
+#define MASK2 SHUFFLE_MASK(2, 3, 0, 1)
+#define MASK3 SHUFFLE_MASK(1, 2, 3, 0)
+
+#define STATE0		%ymm0
+#define STATE0_LOW	%xmm0
+#define STATE1		%ymm1
+#define STATE2		%ymm2
+#define STATE3		%ymm3
+#define STATE4		%ymm4
+#define KEY		%ymm5
+#define MSG		%ymm5
+#define MSG_LOW		%xmm5
+#define T0		%ymm6
+#define T0_LOW		%xmm6
+#define T1		%ymm7
+
+.section .rodata.cst32.morus1280_const, "aM", @progbits, 32
+.align 32
+.Lmorus1280_const:
+	.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
+	.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
+	.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
+	.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
+
+.section .rodata.cst32.morus1280_counter, "aM", @progbits, 32
+.align 32
+.Lmorus1280_counter:
+	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+	.byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+	.byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
+
+.text
+
+.macro morus1280_round s0, s1, s2, s3, s4, b, w
+	vpand \s1, \s2, T0
+	vpxor T0, \s0, \s0
+	vpxor \s3, \s0, \s0
+	vpsllq $\b, \s0, T0
+	vpsrlq $(64 - \b), \s0, \s0
+	vpxor T0, \s0, \s0
+	vpermq $\w, \s3, \s3
+.endm
+
+/*
+ * __morus1280_update: internal ABI
+ * input:
+ *   STATE[0-4] - input state
+ *   MSG        - message block
+ * output:
+ *   STATE[0-4] - output state
+ * changed:
+ *   T0
+ */
+__morus1280_update:
+	morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1
+	vpxor MSG, STATE1, STATE1
+	morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2
+	vpxor MSG, STATE2, STATE2
+	morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3
+	vpxor MSG, STATE3, STATE3
+	morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2,  7, MASK2
+	vpxor MSG, STATE4, STATE4
+	morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3,  4, MASK1
+	ret
+ENDPROC(__morus1280_update)
+
+/*
+ * __morus1280_update_zero: internal ABI
+ * input:
+ *   STATE[0-4] - input state
+ * output:
+ *   STATE[0-4] - output state
+ * changed:
+ *   T0
+ */
+__morus1280_update_zero:
+	morus1280_round STATE0, STATE1, STATE2, STATE3, STATE4, 13, MASK1
+	morus1280_round STATE1, STATE2, STATE3, STATE4, STATE0, 46, MASK2
+	morus1280_round STATE2, STATE3, STATE4, STATE0, STATE1, 38, MASK3
+	morus1280_round STATE3, STATE4, STATE0, STATE1, STATE2,  7, MASK2
+	morus1280_round STATE4, STATE0, STATE1, STATE2, STATE3,  4, MASK1
+	ret
+ENDPROC(__morus1280_update_zero)
+
+/*
+ * __load_partial: internal ABI
+ * input:
+ *   %rsi - src
+ *   %rcx - bytes
+ * output:
+ *   MSG  - message block
+ * changed:
+ *   %r8
+ *   %r9
+ */
+__load_partial:
+	xor %r9, %r9
+	vpxor MSG, MSG, MSG
+
+	mov %rcx, %r8
+	and $0x1, %r8
+	jz .Lld_partial_1
+
+	mov %rcx, %r8
+	and $0x1E, %r8
+	add %rsi, %r8
+	mov (%r8), %r9b
+
+.Lld_partial_1:
+	mov %rcx, %r8
+	and $0x2, %r8
+	jz .Lld_partial_2
+
+	mov %rcx, %r8
+	and $0x1C, %r8
+	add %rsi, %r8
+	shl $16, %r9
+	mov (%r8), %r9w
+
+.Lld_partial_2:
+	mov %rcx, %r8
+	and $0x4, %r8
+	jz .Lld_partial_4
+
+	mov %rcx, %r8
+	and $0x18, %r8
+	add %rsi, %r8
+	shl $32, %r9
+	mov (%r8), %r8d
+	xor %r8, %r9
+
+.Lld_partial_4:
+	movq %r9, MSG_LOW
+
+	mov %rcx, %r8
+	and $0x8, %r8
+	jz .Lld_partial_8
+
+	mov %rcx, %r8
+	and $0x10, %r8
+	add %rsi, %r8
+	pshufd $MASK2, MSG_LOW, MSG_LOW
+	pinsrq $0, (%r8), MSG_LOW
+
+.Lld_partial_8:
+	mov %rcx, %r8
+	and $0x10, %r8
+	jz .Lld_partial_16
+
+	vpermq $MASK2, MSG, MSG
+	movdqu (%rsi), MSG_LOW
+
+.Lld_partial_16:
+	ret
+ENDPROC(__load_partial)
+
+/*
+ * __store_partial: internal ABI
+ * input:
+ *   %rdx - dst
+ *   %rcx - bytes
+ * output:
+ *   T0   - message block
+ * changed:
+ *   %r8
+ *   %r9
+ *   %r10
+ */
+__store_partial:
+	mov %rcx, %r8
+	mov %rdx, %r9
+
+	cmp $16, %r8
+	jl .Lst_partial_16
+
+	movdqu T0_LOW, (%r9)
+	vpermq $MASK2, T0, T0
+
+	sub $16, %r8
+	add $16, %r9
+
+.Lst_partial_16:
+	movq T0_LOW, %r10
+
+	cmp $8, %r8
+	jl .Lst_partial_8
+
+	mov %r10, (%r9)
+	pextrq $1, T0_LOW, %r10
+
+	sub $8, %r8
+	add $8, %r9
+
+.Lst_partial_8:
+	cmp $4, %r8
+	jl .Lst_partial_4
+
+	mov %r10d, (%r9)
+	shr $32, %r10
+
+	sub $4, %r8
+	add $4, %r9
+
+.Lst_partial_4:
+	cmp $2, %r8
+	jl .Lst_partial_2
+
+	mov %r10w, (%r9)
+	shr $16, %r10
+
+	sub $2, %r8
+	add $2, %r9
+
+.Lst_partial_2:
+	cmp $1, %r8
+	jl .Lst_partial_1
+
+	mov %r10b, (%r9)
+
+.Lst_partial_1:
+	ret
+ENDPROC(__store_partial)
+
+/*
+ * void crypto_morus1280_avx2_init(void *state, const void *key,
+ *                                 const void *iv);
+ */
+ENTRY(crypto_morus1280_avx2_init)
+	FRAME_BEGIN
+
+	/* load IV: */
+	vpxor STATE0, STATE0, STATE0
+	movdqu (%rdx), STATE0_LOW
+	/* load key: */
+	vmovdqu (%rsi), KEY
+	vmovdqa KEY, STATE1
+	/* load all ones: */
+	vpcmpeqd STATE2, STATE2, STATE2
+	/* load all zeros: */
+	vpxor STATE3, STATE3, STATE3
+	/* load the constant: */
+	vmovdqa .Lmorus1280_const, STATE4
+
+	/* update 16 times with zero: */
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+
+	/* xor-in the key again after updates: */
+	vpxor KEY, STATE1, STATE1
+
+	/* store the state: */
+	vmovdqu STATE0, (0 * 32)(%rdi)
+	vmovdqu STATE1, (1 * 32)(%rdi)
+	vmovdqu STATE2, (2 * 32)(%rdi)
+	vmovdqu STATE3, (3 * 32)(%rdi)
+	vmovdqu STATE4, (4 * 32)(%rdi)
+
+	FRAME_END
+	ret
+ENDPROC(crypto_morus1280_avx2_init)
+
+/*
+ * void crypto_morus1280_avx2_ad(void *state, const void *data,
+ *                               unsigned int length);
+ */
+ENTRY(crypto_morus1280_avx2_ad)
+	FRAME_BEGIN
+
+	cmp $32, %rdx
+	jb .Lad_out
+
+	/* load the state: */
+	vmovdqu (0 * 32)(%rdi), STATE0
+	vmovdqu (1 * 32)(%rdi), STATE1
+	vmovdqu (2 * 32)(%rdi), STATE2
+	vmovdqu (3 * 32)(%rdi), STATE3
+	vmovdqu (4 * 32)(%rdi), STATE4
+
+	mov %rsi,  %r8
+	and $0x1F, %r8
+	jnz .Lad_u_loop
+
+.align 4
+.Lad_a_loop:
+	vmovdqa (%rsi), MSG
+	call __morus1280_update
+	sub $32, %rdx
+	add $32, %rsi
+	cmp $32, %rdx
+	jge .Lad_a_loop
+
+	jmp .Lad_cont
+.align 4
+.Lad_u_loop:
+	vmovdqu (%rsi), MSG
+	call __morus1280_update
+	sub $32, %rdx
+	add $32, %rsi
+	cmp $32, %rdx
+	jge .Lad_u_loop
+
+.Lad_cont:
+	/* store the state: */
+	vmovdqu STATE0, (0 * 32)(%rdi)
+	vmovdqu STATE1, (1 * 32)(%rdi)
+	vmovdqu STATE2, (2 * 32)(%rdi)
+	vmovdqu STATE3, (3 * 32)(%rdi)
+	vmovdqu STATE4, (4 * 32)(%rdi)
+
+.Lad_out:
+	FRAME_END
+	ret
+ENDPROC(crypto_morus1280_avx2_ad)
+
+/*
+ * void crypto_morus1280_avx2_enc(void *state, const void *src, void *dst,
+ *                                unsigned int length);
+ */
+ENTRY(crypto_morus1280_avx2_enc)
+	FRAME_BEGIN
+
+	cmp $32, %rcx
+	jb .Lenc_out
+
+	/* load the state: */
+	vmovdqu (0 * 32)(%rdi), STATE0
+	vmovdqu (1 * 32)(%rdi), STATE1
+	vmovdqu (2 * 32)(%rdi), STATE2
+	vmovdqu (3 * 32)(%rdi), STATE3
+	vmovdqu (4 * 32)(%rdi), STATE4
+
+	mov %rsi,  %r8
+	or  %rdx,  %r8
+	and $0x1F, %r8
+	jnz .Lenc_u_loop
+
+.align 4
+.Lenc_a_loop:
+	vmovdqa (%rsi), MSG
+	vmovdqa MSG, T0
+	vpxor STATE0, T0, T0
+	vpermq $MASK3, STATE1, T1
+	vpxor T1, T0, T0
+	vpand STATE2, STATE3, T1
+	vpxor T1, T0, T0
+	vmovdqa T0, (%rdx)
+
+	call __morus1280_update
+	sub $32, %rcx
+	add $32, %rsi
+	add $32, %rdx
+	cmp $32, %rcx
+	jge .Lenc_a_loop
+
+	jmp .Lenc_cont
+.align 4
+.Lenc_u_loop:
+	vmovdqu (%rsi), MSG
+	vmovdqa MSG, T0
+	vpxor STATE0, T0, T0
+	vpermq $MASK3, STATE1, T1
+	vpxor T1, T0, T0
+	vpand STATE2, STATE3, T1
+	vpxor T1, T0, T0
+	vmovdqu T0, (%rdx)
+
+	call __morus1280_update
+	sub $32, %rcx
+	add $32, %rsi
+	add $32, %rdx
+	cmp $32, %rcx
+	jge .Lenc_u_loop
+
+.Lenc_cont:
+	/* store the state: */
+	vmovdqu STATE0, (0 * 32)(%rdi)
+	vmovdqu STATE1, (1 * 32)(%rdi)
+	vmovdqu STATE2, (2 * 32)(%rdi)
+	vmovdqu STATE3, (3 * 32)(%rdi)
+	vmovdqu STATE4, (4 * 32)(%rdi)
+
+.Lenc_out:
+	FRAME_END
+	ret
+ENDPROC(crypto_morus1280_avx2_enc)
+
+/*
+ * void crypto_morus1280_avx2_enc_tail(void *state, const void *src, void *dst,
+ *                                     unsigned int length);
+ */
+ENTRY(crypto_morus1280_avx2_enc_tail)
+	FRAME_BEGIN
+
+	/* load the state: */
+	vmovdqu (0 * 32)(%rdi), STATE0
+	vmovdqu (1 * 32)(%rdi), STATE1
+	vmovdqu (2 * 32)(%rdi), STATE2
+	vmovdqu (3 * 32)(%rdi), STATE3
+	vmovdqu (4 * 32)(%rdi), STATE4
+
+	/* encrypt message: */
+	call __load_partial
+
+	vmovdqa MSG, T0
+	vpxor STATE0, T0, T0
+	vpermq $MASK3, STATE1, T1
+	vpxor T1, T0, T0
+	vpand STATE2, STATE3, T1
+	vpxor T1, T0, T0
+
+	call __store_partial
+
+	call __morus1280_update
+
+	/* store the state: */
+	vmovdqu STATE0, (0 * 32)(%rdi)
+	vmovdqu STATE1, (1 * 32)(%rdi)
+	vmovdqu STATE2, (2 * 32)(%rdi)
+	vmovdqu STATE3, (3 * 32)(%rdi)
+	vmovdqu STATE4, (4 * 32)(%rdi)
+
+	FRAME_END
+ENDPROC(crypto_morus1280_avx2_enc_tail)
+
+/*
+ * void crypto_morus1280_avx2_dec(void *state, const void *src, void *dst,
+ *                                unsigned int length);
+ */
+ENTRY(crypto_morus1280_avx2_dec)
+	FRAME_BEGIN
+
+	cmp $32, %rcx
+	jb .Ldec_out
+
+	/* load the state: */
+	vmovdqu (0 * 32)(%rdi), STATE0
+	vmovdqu (1 * 32)(%rdi), STATE1
+	vmovdqu (2 * 32)(%rdi), STATE2
+	vmovdqu (3 * 32)(%rdi), STATE3
+	vmovdqu (4 * 32)(%rdi), STATE4
+
+	mov %rsi,  %r8
+	or  %rdx,  %r8
+	and $0x1F, %r8
+	jnz .Ldec_u_loop
+
+.align 4
+.Ldec_a_loop:
+	vmovdqa (%rsi), MSG
+	vpxor STATE0, MSG, MSG
+	vpermq $MASK3, STATE1, T0
+	vpxor T0, MSG, MSG
+	vpand STATE2, STATE3, T0
+	vpxor T0, MSG, MSG
+	vmovdqa MSG, (%rdx)
+
+	call __morus1280_update
+	sub $32, %rcx
+	add $32, %rsi
+	add $32, %rdx
+	cmp $32, %rcx
+	jge .Ldec_a_loop
+
+	jmp .Ldec_cont
+.align 4
+.Ldec_u_loop:
+	vmovdqu (%rsi), MSG
+	vpxor STATE0, MSG, MSG
+	vpermq $MASK3, STATE1, T0
+	vpxor T0, MSG, MSG
+	vpand STATE2, STATE3, T0
+	vpxor T0, MSG, MSG
+	vmovdqu MSG, (%rdx)
+
+	call __morus1280_update
+	sub $32, %rcx
+	add $32, %rsi
+	add $32, %rdx
+	cmp $32, %rcx
+	jge .Ldec_u_loop
+
+.Ldec_cont:
+	/* store the state: */
+	vmovdqu STATE0, (0 * 32)(%rdi)
+	vmovdqu STATE1, (1 * 32)(%rdi)
+	vmovdqu STATE2, (2 * 32)(%rdi)
+	vmovdqu STATE3, (3 * 32)(%rdi)
+	vmovdqu STATE4, (4 * 32)(%rdi)
+
+.Ldec_out:
+	FRAME_END
+	ret
+ENDPROC(crypto_morus1280_avx2_dec)
+
+/*
+ * void crypto_morus1280_avx2_dec_tail(void *state, const void *src, void *dst,
+ *                                     unsigned int length);
+ */
+ENTRY(crypto_morus1280_avx2_dec_tail)
+	FRAME_BEGIN
+
+	/* load the state: */
+	vmovdqu (0 * 32)(%rdi), STATE0
+	vmovdqu (1 * 32)(%rdi), STATE1
+	vmovdqu (2 * 32)(%rdi), STATE2
+	vmovdqu (3 * 32)(%rdi), STATE3
+	vmovdqu (4 * 32)(%rdi), STATE4
+
+	/* decrypt message: */
+	call __load_partial
+
+	vpxor STATE0, MSG, MSG
+	vpermq $MASK3, STATE1, T0
+	vpxor T0, MSG, MSG
+	vpand STATE2, STATE3, T0
+	vpxor T0, MSG, MSG
+	vmovdqa MSG, T0
+
+	call __store_partial
+
+	/* mask with byte count: */
+	movq %rcx, T0_LOW
+	vpbroadcastb T0_LOW, T0
+	vmovdqa .Lmorus1280_counter, T1
+	vpcmpgtb T1, T0, T0
+	vpand T0, MSG, MSG
+
+	call __morus1280_update
+
+	/* store the state: */
+	vmovdqu STATE0, (0 * 32)(%rdi)
+	vmovdqu STATE1, (1 * 32)(%rdi)
+	vmovdqu STATE2, (2 * 32)(%rdi)
+	vmovdqu STATE3, (3 * 32)(%rdi)
+	vmovdqu STATE4, (4 * 32)(%rdi)
+
+	FRAME_END
+	ret
+ENDPROC(crypto_morus1280_avx2_dec_tail)
+
+/*
+ * void crypto_morus1280_avx2_final(void *state, void *tag_xor,
+ *                                  u64 assoclen, u64 cryptlen);
+ */
+ENTRY(crypto_morus1280_avx2_final)
+	FRAME_BEGIN
+
+	/* load the state: */
+	vmovdqu (0 * 32)(%rdi), STATE0
+	vmovdqu (1 * 32)(%rdi), STATE1
+	vmovdqu (2 * 32)(%rdi), STATE2
+	vmovdqu (3 * 32)(%rdi), STATE3
+	vmovdqu (4 * 32)(%rdi), STATE4
+
+	/* xor state[0] into state[4]: */
+	vpxor STATE0, STATE4, STATE4
+
+	/* prepare length block: */
+	vpxor MSG, MSG, MSG
+	vpinsrq $0, %rdx, MSG_LOW, MSG_LOW
+	vpinsrq $1, %rcx, MSG_LOW, MSG_LOW
+	vpsllq $3, MSG, MSG /* multiply by 8 (to get bit count) */
+
+	/* update state: */
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+
+	/* xor tag: */
+	vmovdqu (%rsi), MSG
+
+	vpxor STATE0, MSG, MSG
+	vpermq $MASK3, STATE1, T0
+	vpxor T0, MSG, MSG
+	vpand STATE2, STATE3, T0
+	vpxor T0, MSG, MSG
+	vmovdqu MSG, (%rsi)
+
+	FRAME_END
+	ret
+ENDPROC(crypto_morus1280_avx2_final)
diff --git a/arch/x86/crypto/morus1280-avx2-glue.c b/arch/x86/crypto/morus1280-avx2-glue.c
new file mode 100644
index 000000000000..f111f36d26dc
--- /dev/null
+++ b/arch/x86/crypto/morus1280-avx2-glue.c
@@ -0,0 +1,68 @@ 
+/*
+ * The MORUS-1280 Authenticated-Encryption Algorithm
+ *   Glue for AVX2 implementation
+ *
+ * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/internal/aead.h>
+#include <crypto/morus1280_glue.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/cpu_device_id.h>
+
+asmlinkage void crypto_morus1280_avx2_init(void *state, const void *key,
+					   const void *iv);
+asmlinkage void crypto_morus1280_avx2_ad(void *state, const void *data,
+					 unsigned int length);
+
+asmlinkage void crypto_morus1280_avx2_enc(void *state, const void *src,
+					  void *dst, unsigned int length);
+asmlinkage void crypto_morus1280_avx2_dec(void *state, const void *src,
+					  void *dst, unsigned int length);
+
+asmlinkage void crypto_morus1280_avx2_enc_tail(void *state, const void *src,
+					       void *dst, unsigned int length);
+asmlinkage void crypto_morus1280_avx2_dec_tail(void *state, const void *src,
+					       void *dst, unsigned int length);
+
+asmlinkage void crypto_morus1280_avx2_final(void *state, void *tag_xor,
+					    u64 assoclen, u64 cryptlen);
+
+MORUS1280_DECLARE_ALGS(avx2, "morus1280-avx2", 400);
+
+static const struct x86_cpu_id avx2_cpu_id[] = {
+    X86_FEATURE_MATCH(X86_FEATURE_AVX2),
+    {}
+};
+MODULE_DEVICE_TABLE(x86cpu, avx2_cpu_id);
+
+static int __init crypto_morus1280_avx2_module_init(void)
+{
+	if (!x86_match_cpu(avx2_cpu_id))
+		return -ENODEV;
+
+	return crypto_register_aeads(crypto_morus1280_avx2_algs,
+				     ARRAY_SIZE(crypto_morus1280_avx2_algs));
+}
+
+static void __exit crypto_morus1280_avx2_module_exit(void)
+{
+	crypto_unregister_aeads(crypto_morus1280_avx2_algs,
+				ARRAY_SIZE(crypto_morus1280_avx2_algs));
+}
+
+module_init(crypto_morus1280_avx2_module_init);
+module_exit(crypto_morus1280_avx2_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
+MODULE_DESCRIPTION("MORUS-1280 AEAD algorithm -- AVX2 implementation");
+MODULE_ALIAS_CRYPTO("morus1280");
+MODULE_ALIAS_CRYPTO("morus1280-avx2");
diff --git a/arch/x86/crypto/morus1280-sse2-asm.S b/arch/x86/crypto/morus1280-sse2-asm.S
new file mode 100644
index 000000000000..1fe637c7be9d
--- /dev/null
+++ b/arch/x86/crypto/morus1280-sse2-asm.S
@@ -0,0 +1,895 @@ 
+/*
+ * SSE2 implementation of MORUS-1280
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define SHUFFLE_MASK(i0, i1, i2, i3) \
+	(i0 | (i1 << 2) | (i2 << 4) | (i3 << 6))
+
+#define MASK2 SHUFFLE_MASK(2, 3, 0, 1)
+
+#define STATE0_LO	%xmm0
+#define STATE0_HI	%xmm1
+#define STATE1_LO	%xmm2
+#define STATE1_HI	%xmm3
+#define STATE2_LO	%xmm4
+#define STATE2_HI	%xmm5
+#define STATE3_LO	%xmm6
+#define STATE3_HI	%xmm7
+#define STATE4_LO	%xmm8
+#define STATE4_HI	%xmm9
+#define KEY_LO		%xmm10
+#define KEY_HI		%xmm11
+#define MSG_LO		%xmm10
+#define MSG_HI		%xmm11
+#define T0_LO		%xmm12
+#define T0_HI		%xmm13
+#define T1_LO		%xmm14
+#define T1_HI		%xmm15
+
+.section .rodata.cst16.morus640_const, "aM", @progbits, 16
+.align 16
+.Lmorus640_const_0:
+	.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
+	.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
+.Lmorus640_const_1:
+	.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
+	.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
+
+.section .rodata.cst16.morus640_counter, "aM", @progbits, 16
+.align 16
+.Lmorus640_counter_0:
+	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+.Lmorus640_counter_1:
+	.byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+	.byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
+
+.text
+
+.macro rol1 hi, lo
+	/*
+	 * HI_1 | HI_0 || LO_1 | LO_0
+	 *  ==>
+	 * HI_0 | HI_1 || LO_1 | LO_0
+	 *  ==>
+	 * HI_0 | LO_1 || LO_0 | HI_1
+	 */
+	pshufd $MASK2, \hi, \hi
+	movdqa \hi, T0_LO
+	punpcklqdq \lo, T0_LO
+	punpckhqdq \hi, \lo
+	movdqa \lo, \hi
+	movdqa T0_LO, \lo
+.endm
+
+.macro rol2 hi, lo
+	movdqa \lo, T0_LO
+	movdqa \hi, \lo
+	movdqa T0_LO, \hi
+.endm
+
+.macro rol3 hi, lo
+	/*
+	 * HI_1 | HI_0 || LO_1 | LO_0
+	 *  ==>
+	 * HI_0 | HI_1 || LO_1 | LO_0
+	 *  ==>
+	 * LO_0 | HI_1 || HI_0 | LO_1
+	 */
+	pshufd $MASK2, \hi, \hi
+	movdqa \lo, T0_LO
+	punpckhqdq \hi, T0_LO
+	punpcklqdq \lo, \hi
+	movdqa T0_LO, \lo
+.endm
+
+.macro morus1280_round s0_l, s0_h, s1_l, s1_h, s2_l, s2_h, s3_l, s3_h, s4_l, s4_h, b, w
+	movdqa \s1_l, T0_LO
+	pand \s2_l, T0_LO
+	pxor T0_LO, \s0_l
+
+	movdqa \s1_h, T0_LO
+	pand \s2_h, T0_LO
+	pxor T0_LO, \s0_h
+
+	pxor \s3_l, \s0_l
+	pxor \s3_h, \s0_h
+
+	movdqa \s0_l, T0_LO
+	psllq $\b, T0_LO
+	psrlq $(64 - \b), \s0_l
+	pxor T0_LO, \s0_l
+
+	movdqa \s0_h, T0_LO
+	psllq $\b, T0_LO
+	psrlq $(64 - \b), \s0_h
+	pxor T0_LO, \s0_h
+
+	\w \s3_h, \s3_l
+.endm
+
+/*
+ * __morus1280_update: internal ABI
+ * input:
+ *   STATE[0-4] - input state
+ *   MSG        - message block
+ * output:
+ *   STATE[0-4] - output state
+ * changed:
+ *   T0
+ */
+__morus1280_update:
+	morus1280_round \
+		STATE0_LO, STATE0_HI, \
+		STATE1_LO, STATE1_HI, \
+		STATE2_LO, STATE2_HI, \
+		STATE3_LO, STATE3_HI, \
+		STATE4_LO, STATE4_HI, \
+		13, rol1
+	pxor MSG_LO, STATE1_LO
+	pxor MSG_HI, STATE1_HI
+	morus1280_round \
+		STATE1_LO, STATE1_HI, \
+		STATE2_LO, STATE2_HI, \
+		STATE3_LO, STATE3_HI, \
+		STATE4_LO, STATE4_HI, \
+		STATE0_LO, STATE0_HI, \
+		46, rol2
+	pxor MSG_LO, STATE2_LO
+	pxor MSG_HI, STATE2_HI
+	morus1280_round \
+		STATE2_LO, STATE2_HI, \
+		STATE3_LO, STATE3_HI, \
+		STATE4_LO, STATE4_HI, \
+		STATE0_LO, STATE0_HI, \
+		STATE1_LO, STATE1_HI, \
+		38, rol3
+	pxor MSG_LO, STATE3_LO
+	pxor MSG_HI, STATE3_HI
+	morus1280_round \
+		STATE3_LO, STATE3_HI, \
+		STATE4_LO, STATE4_HI, \
+		STATE0_LO, STATE0_HI, \
+		STATE1_LO, STATE1_HI, \
+		STATE2_LO, STATE2_HI, \
+		7, rol2
+	pxor MSG_LO, STATE4_LO
+	pxor MSG_HI, STATE4_HI
+	morus1280_round \
+		STATE4_LO, STATE4_HI, \
+		STATE0_LO, STATE0_HI, \
+		STATE1_LO, STATE1_HI, \
+		STATE2_LO, STATE2_HI, \
+		STATE3_LO, STATE3_HI, \
+		4, rol1
+	ret
+ENDPROC(__morus1280_update)
+
+/*
+ * __morus1280_update_zero: internal ABI
+ * input:
+ *   STATE[0-4] - input state
+ * output:
+ *   STATE[0-4] - output state
+ * changed:
+ *   T0
+ */
+__morus1280_update_zero:
+	morus1280_round \
+		STATE0_LO, STATE0_HI, \
+		STATE1_LO, STATE1_HI, \
+		STATE2_LO, STATE2_HI, \
+		STATE3_LO, STATE3_HI, \
+		STATE4_LO, STATE4_HI, \
+		13, rol1
+	morus1280_round \
+		STATE1_LO, STATE1_HI, \
+		STATE2_LO, STATE2_HI, \
+		STATE3_LO, STATE3_HI, \
+		STATE4_LO, STATE4_HI, \
+		STATE0_LO, STATE0_HI, \
+		46, rol2
+	morus1280_round \
+		STATE2_LO, STATE2_HI, \
+		STATE3_LO, STATE3_HI, \
+		STATE4_LO, STATE4_HI, \
+		STATE0_LO, STATE0_HI, \
+		STATE1_LO, STATE1_HI, \
+		38, rol3
+	morus1280_round \
+		STATE3_LO, STATE3_HI, \
+		STATE4_LO, STATE4_HI, \
+		STATE0_LO, STATE0_HI, \
+		STATE1_LO, STATE1_HI, \
+		STATE2_LO, STATE2_HI, \
+		7, rol2
+	morus1280_round \
+		STATE4_LO, STATE4_HI, \
+		STATE0_LO, STATE0_HI, \
+		STATE1_LO, STATE1_HI, \
+		STATE2_LO, STATE2_HI, \
+		STATE3_LO, STATE3_HI, \
+		4, rol1
+	ret
+ENDPROC(__morus1280_update_zero)
+
+/*
+ * __load_partial: internal ABI
+ * input:
+ *   %rsi - src
+ *   %rcx - bytes
+ * output:
+ *   MSG  - message block
+ * changed:
+ *   %r8
+ *   %r9
+ */
+__load_partial:
+	xor %r9, %r9
+	pxor MSG_LO, MSG_LO
+	pxor MSG_HI, MSG_HI
+
+	mov %rcx, %r8
+	and $0x1, %r8
+	jz .Lld_partial_1
+
+	mov %rcx, %r8
+	and $0x1E, %r8
+	add %rsi, %r8
+	mov (%r8), %r9b
+
+.Lld_partial_1:
+	mov %rcx, %r8
+	and $0x2, %r8
+	jz .Lld_partial_2
+
+	mov %rcx, %r8
+	and $0x1C, %r8
+	add %rsi, %r8
+	shl $16, %r9
+	mov (%r8), %r9w
+
+.Lld_partial_2:
+	mov %rcx, %r8
+	and $0x4, %r8
+	jz .Lld_partial_4
+
+	mov %rcx, %r8
+	and $0x18, %r8
+	add %rsi, %r8
+	shl $32, %r9
+	mov (%r8), %r8d
+	xor %r8, %r9
+
+.Lld_partial_4:
+	movq %r9, MSG_LO
+
+	mov %rcx, %r8
+	and $0x8, %r8
+	jz .Lld_partial_8
+
+	mov %rcx, %r8
+	and $0x10, %r8
+	add %rsi, %r8
+	pslldq $8, MSG_LO
+	movq (%r8), T0_LO
+	pxor T0_LO, MSG_LO
+
+.Lld_partial_8:
+	mov %rcx, %r8
+	and $0x10, %r8
+	jz .Lld_partial_16
+
+	movdqa MSG_LO, MSG_HI
+	movdqu (%rsi), MSG_LO
+
+.Lld_partial_16:
+	ret
+ENDPROC(__load_partial)
+
+/*
+ * __store_partial: internal ABI
+ * input:
+ *   %rdx - dst
+ *   %rcx - bytes
+ * output:
+ *   T0   - message block
+ * changed:
+ *   %r8
+ *   %r9
+ *   %r10
+ */
+__store_partial:
+	mov %rcx, %r8
+	mov %rdx, %r9
+
+	cmp $16, %r8
+	jl .Lst_partial_16
+
+	movdqu T0_LO, (%r9)
+	movdqa T0_HI, T0_LO
+
+	sub $16, %r8
+	add $16, %r9
+
+.Lst_partial_16:
+	movq T0_LO, %r10
+
+	cmp $8, %r8
+	jl .Lst_partial_8
+
+	mov %r10, (%r9)
+	psrldq $8, T0_LO
+	movq T0_LO, %r10
+
+	sub $8, %r8
+	add $8, %r9
+
+.Lst_partial_8:
+	cmp $4, %r8
+	jl .Lst_partial_4
+
+	mov %r10d, (%r9)
+	shr $32, %r10
+
+	sub $4, %r8
+	add $4, %r9
+
+.Lst_partial_4:
+	cmp $2, %r8
+	jl .Lst_partial_2
+
+	mov %r10w, (%r9)
+	shr $16, %r10
+
+	sub $2, %r8
+	add $2, %r9
+
+.Lst_partial_2:
+	cmp $1, %r8
+	jl .Lst_partial_1
+
+	mov %r10b, (%r9)
+
+.Lst_partial_1:
+	ret
+ENDPROC(__store_partial)
+
+/*
+ * void crypto_morus1280_sse2_init(void *state, const void *key,
+ *                                 const void *iv);
+ */
+ENTRY(crypto_morus1280_sse2_init)
+	FRAME_BEGIN
+
+	/* load IV: */
+	pxor STATE0_HI, STATE0_HI
+	movdqu (%rdx), STATE0_LO
+	/* load key: */
+	movdqu  0(%rsi), KEY_LO
+	movdqu 16(%rsi), KEY_HI
+	movdqa KEY_LO, STATE1_LO
+	movdqa KEY_HI, STATE1_HI
+	/* load all ones: */
+	pcmpeqd STATE2_LO, STATE2_LO
+	pcmpeqd STATE2_HI, STATE2_HI
+	/* load all zeros: */
+	pxor STATE3_LO, STATE3_LO
+	pxor STATE3_HI, STATE3_HI
+	/* load the constant: */
+	movdqa .Lmorus640_const_0, STATE4_LO
+	movdqa .Lmorus640_const_1, STATE4_HI
+
+	/* update 16 times with zero: */
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+	call __morus1280_update_zero
+
+	/* xor-in the key again after updates: */
+	pxor KEY_LO, STATE1_LO
+	pxor KEY_HI, STATE1_HI
+
+	/* store the state: */
+	movdqu STATE0_LO, (0 * 16)(%rdi)
+	movdqu STATE0_HI, (1 * 16)(%rdi)
+	movdqu STATE1_LO, (2 * 16)(%rdi)
+	movdqu STATE1_HI, (3 * 16)(%rdi)
+	movdqu STATE2_LO, (4 * 16)(%rdi)
+	movdqu STATE2_HI, (5 * 16)(%rdi)
+	movdqu STATE3_LO, (6 * 16)(%rdi)
+	movdqu STATE3_HI, (7 * 16)(%rdi)
+	movdqu STATE4_LO, (8 * 16)(%rdi)
+	movdqu STATE4_HI, (9 * 16)(%rdi)
+
+	FRAME_END
+	ret
+ENDPROC(crypto_morus1280_sse2_init)
+
+/*
+ * void crypto_morus1280_sse2_ad(void *state, const void *data,
+ *                               unsigned int length);
+ */
+ENTRY(crypto_morus1280_sse2_ad)
+	FRAME_BEGIN
+
+	cmp $32, %rdx
+	jb .Lad_out
+
+	/* load the state: */
+	movdqu (0 * 16)(%rdi), STATE0_LO
+	movdqu (1 * 16)(%rdi), STATE0_HI
+	movdqu (2 * 16)(%rdi), STATE1_LO
+	movdqu (3 * 16)(%rdi), STATE1_HI
+	movdqu (4 * 16)(%rdi), STATE2_LO
+	movdqu (5 * 16)(%rdi), STATE2_HI
+	movdqu (6 * 16)(%rdi), STATE3_LO
+	movdqu (7 * 16)(%rdi), STATE3_HI
+	movdqu (8 * 16)(%rdi), STATE4_LO
+	movdqu (9 * 16)(%rdi), STATE4_HI
+
+	mov %rsi, %r8
+	and $0xF, %r8
+	jnz .Lad_u_loop
+
+.align 4
+.Lad_a_loop:
+	movdqa  0(%rsi), MSG_LO
+	movdqa 16(%rsi), MSG_HI
+	call __morus1280_update
+	sub $32, %rdx
+	add $32, %rsi
+	cmp $32, %rdx
+	jge .Lad_a_loop
+
+	jmp .Lad_cont
+.align 4
+.Lad_u_loop:
+	movdqu  0(%rsi), MSG_LO
+	movdqu 16(%rsi), MSG_HI
+	call __morus1280_update
+	sub $32, %rdx
+	add $32, %rsi
+	cmp $32, %rdx
+	jge .Lad_u_loop
+
+.Lad_cont:
+	/* store the state: */
+	movdqu STATE0_LO, (0 * 16)(%rdi)
+	movdqu STATE0_HI, (1 * 16)(%rdi)
+	movdqu STATE1_LO, (2 * 16)(%rdi)
+	movdqu STATE1_HI, (3 * 16)(%rdi)
+	movdqu STATE2_LO, (4 * 16)(%rdi)
+	movdqu STATE2_HI, (5 * 16)(%rdi)
+	movdqu STATE3_LO, (6 * 16)(%rdi)
+	movdqu STATE3_HI, (7 * 16)(%rdi)
+	movdqu STATE4_LO, (8 * 16)(%rdi)
+	movdqu STATE4_HI, (9 * 16)(%rdi)
+
+.Lad_out:
+	FRAME_END
+	ret
+ENDPROC(crypto_morus1280_sse2_ad)
+
+/*
+ * void crypto_morus1280_sse2_enc(void *state, const void *src, void *dst,
+ *                                unsigned int length);
+ */
+ENTRY(crypto_morus1280_sse2_enc)
+	FRAME_BEGIN
+
+	cmp $32, %rcx
+	jb .Lenc_out
+
+	/* load the state: */
+	movdqu (0 * 16)(%rdi), STATE0_LO
+	movdqu (1 * 16)(%rdi), STATE0_HI
+	movdqu (2 * 16)(%rdi), STATE1_LO
+	movdqu (3 * 16)(%rdi), STATE1_HI
+	movdqu (4 * 16)(%rdi), STATE2_LO
+	movdqu (5 * 16)(%rdi), STATE2_HI
+	movdqu (6 * 16)(%rdi), STATE3_LO
+	movdqu (7 * 16)(%rdi), STATE3_HI
+	movdqu (8 * 16)(%rdi), STATE4_LO
+	movdqu (9 * 16)(%rdi), STATE4_HI
+
+	mov %rsi, %r8
+	or  %rdx, %r8
+	and $0xF, %r8
+	jnz .Lenc_u_loop
+
+.align 4
+.Lenc_a_loop:
+	movdqa  0(%rsi), MSG_LO
+	movdqa 16(%rsi), MSG_HI
+	movdqa STATE1_LO, T1_LO
+	movdqa STATE1_HI, T1_HI
+	rol3 T1_HI, T1_LO
+	movdqa MSG_LO, T0_LO
+	movdqa MSG_HI, T0_HI
+	pxor T1_LO, T0_LO
+	pxor T1_HI, T0_HI
+	pxor STATE0_LO, T0_LO
+	pxor STATE0_HI, T0_HI
+	movdqa STATE2_LO, T1_LO
+	movdqa STATE2_HI, T1_HI
+	pand STATE3_LO, T1_LO
+	pand STATE3_HI, T1_HI
+	pxor T1_LO, T0_LO
+	pxor T1_HI, T0_HI
+	movdqa T0_LO,  0(%rdx)
+	movdqa T0_HI, 16(%rdx)
+
+	call __morus1280_update
+	sub $32, %rcx
+	add $32, %rsi
+	add $32, %rdx
+	cmp $32, %rcx
+	jge .Lenc_a_loop
+
+	jmp .Lenc_cont
+.align 4
+.Lenc_u_loop:
+	movdqu  0(%rsi), MSG_LO
+	movdqu 16(%rsi), MSG_HI
+	movdqa STATE1_LO, T1_LO
+	movdqa STATE1_HI, T1_HI
+	rol3 T1_HI, T1_LO
+	movdqa MSG_LO, T0_LO
+	movdqa MSG_HI, T0_HI
+	pxor T1_LO, T0_LO
+	pxor T1_HI, T0_HI
+	pxor STATE0_LO, T0_LO
+	pxor STATE0_HI, T0_HI
+	movdqa STATE2_LO, T1_LO
+	movdqa STATE2_HI, T1_HI
+	pand STATE3_LO, T1_LO
+	pand STATE3_HI, T1_HI
+	pxor T1_LO, T0_LO
+	pxor T1_HI, T0_HI
+	movdqu T0_LO,  0(%rdx)
+	movdqu T0_HI, 16(%rdx)
+
+	call __morus1280_update
+	sub $32, %rcx
+	add $32, %rsi
+	add $32, %rdx
+	cmp $32, %rcx
+	jge .Lenc_u_loop
+
+.Lenc_cont:
+	/* store the state: */
+	movdqu STATE0_LO, (0 * 16)(%rdi)
+	movdqu STATE0_HI, (1 * 16)(%rdi)
+	movdqu STATE1_LO, (2 * 16)(%rdi)
+	movdqu STATE1_HI, (3 * 16)(%rdi)
+	movdqu STATE2_LO, (4 * 16)(%rdi)
+	movdqu STATE2_HI, (5 * 16)(%rdi)
+	movdqu STATE3_LO, (6 * 16)(%rdi)
+	movdqu STATE3_HI, (7 * 16)(%rdi)
+	movdqu STATE4_LO, (8 * 16)(%rdi)
+	movdqu STATE4_HI, (9 * 16)(%rdi)
+
+.Lenc_out:
+	FRAME_END
+	ret
+ENDPROC(crypto_morus1280_sse2_enc)
+
+/*
+ * void crypto_morus1280_sse2_enc_tail(void *state, const void *src, void *dst,
+ *                                     unsigned int length);
+ */
+ENTRY(crypto_morus1280_sse2_enc_tail)
+	FRAME_BEGIN
+
+	/* load the state: */
+	movdqu (0 * 16)(%rdi), STATE0_LO
+	movdqu (1 * 16)(%rdi), STATE0_HI
+	movdqu (2 * 16)(%rdi), STATE1_LO
+	movdqu (3 * 16)(%rdi), STATE1_HI
+	movdqu (4 * 16)(%rdi), STATE2_LO
+	movdqu (5 * 16)(%rdi), STATE2_HI
+	movdqu (6 * 16)(%rdi), STATE3_LO
+	movdqu (7 * 16)(%rdi), STATE3_HI
+	movdqu (8 * 16)(%rdi), STATE4_LO
+	movdqu (9 * 16)(%rdi), STATE4_HI
+
+	/* encrypt message: */
+	call __load_partial
+
+	movdqa STATE1_LO, T1_LO
+	movdqa STATE1_HI, T1_HI
+	rol3 T1_HI, T1_LO
+	movdqa MSG_LO, T0_LO
+	movdqa MSG_HI, T0_HI
+	pxor T1_LO, T0_LO
+	pxor T1_HI, T0_HI
+	pxor STATE0_LO, T0_LO
+	pxor STATE0_HI, T0_HI
+	movdqa STATE2_LO, T1_LO
+	movdqa STATE2_HI, T1_HI
+	pand STATE3_LO, T1_LO
+	pand STATE3_HI, T1_HI
+	pxor T1_LO, T0_LO
+	pxor T1_HI, T0_HI
+
+	call __store_partial
+
+	call __morus1280_update
+
+	/* store the state: */
+	movdqu STATE0_LO, (0 * 16)(%rdi)
+	movdqu STATE0_HI, (1 * 16)(%rdi)
+	movdqu STATE1_LO, (2 * 16)(%rdi)
+	movdqu STATE1_HI, (3 * 16)(%rdi)
+	movdqu STATE2_LO, (4 * 16)(%rdi)
+	movdqu STATE2_HI, (5 * 16)(%rdi)
+	movdqu STATE3_LO, (6 * 16)(%rdi)
+	movdqu STATE3_HI, (7 * 16)(%rdi)
+	movdqu STATE4_LO, (8 * 16)(%rdi)
+	movdqu STATE4_HI, (9 * 16)(%rdi)
+
+	FRAME_END
+ENDPROC(crypto_morus1280_sse2_enc_tail)
+
+/*
+ * void crypto_morus1280_sse2_dec(void *state, const void *src, void *dst,
+ *                                unsigned int length);
+ */
+ENTRY(crypto_morus1280_sse2_dec)
+	FRAME_BEGIN
+
+	cmp $32, %rcx
+	jb .Ldec_out
+
+	/* load the state: */
+	movdqu (0 * 16)(%rdi), STATE0_LO
+	movdqu (1 * 16)(%rdi), STATE0_HI
+	movdqu (2 * 16)(%rdi), STATE1_LO
+	movdqu (3 * 16)(%rdi), STATE1_HI
+	movdqu (4 * 16)(%rdi), STATE2_LO
+	movdqu (5 * 16)(%rdi), STATE2_HI
+	movdqu (6 * 16)(%rdi), STATE3_LO
+	movdqu (7 * 16)(%rdi), STATE3_HI
+	movdqu (8 * 16)(%rdi), STATE4_LO
+	movdqu (9 * 16)(%rdi), STATE4_HI
+
+	mov %rsi, %r8
+	or  %rdx, %r8
+	and $0xF, %r8
+	jnz .Ldec_u_loop
+
+.align 4
+.Ldec_a_loop:
+	movdqa  0(%rsi), MSG_LO
+	movdqa 16(%rsi), MSG_HI
+	pxor STATE0_LO, MSG_LO
+	pxor STATE0_HI, MSG_HI
+	movdqa STATE1_LO, T1_LO
+	movdqa STATE1_HI, T1_HI
+	rol3 T1_HI, T1_LO
+	pxor T1_LO, MSG_LO
+	pxor T1_HI, MSG_HI
+	movdqa STATE2_LO, T1_LO
+	movdqa STATE2_HI, T1_HI
+	pand STATE3_LO, T1_LO
+	pand STATE3_HI, T1_HI
+	pxor T1_LO, MSG_LO
+	pxor T1_HI, MSG_HI
+	movdqa MSG_LO,  0(%rdx)
+	movdqa MSG_HI, 16(%rdx)
+
+	call __morus1280_update
+	sub $32, %rcx
+	add $32, %rsi
+	add $32, %rdx
+	cmp $32, %rcx
+	jge .Ldec_a_loop
+
+	jmp .Ldec_cont
+.align 4
+.Ldec_u_loop:
+	movdqu  0(%rsi), MSG_LO
+	movdqu 16(%rsi), MSG_HI
+	pxor STATE0_LO, MSG_LO
+	pxor STATE0_HI, MSG_HI
+	movdqa STATE1_LO, T1_LO
+	movdqa STATE1_HI, T1_HI
+	rol3 T1_HI, T1_LO
+	pxor T1_LO, MSG_LO
+	pxor T1_HI, MSG_HI
+	movdqa STATE2_LO, T1_LO
+	movdqa STATE2_HI, T1_HI
+	pand STATE3_LO, T1_LO
+	pand STATE3_HI, T1_HI
+	pxor T1_LO, MSG_LO
+	pxor T1_HI, MSG_HI
+	movdqu MSG_LO,  0(%rdx)
+	movdqu MSG_HI, 16(%rdx)
+
+	call __morus1280_update
+	sub $32, %rcx
+	add $32, %rsi
+	add $32, %rdx
+	cmp $32, %rcx
+	jge .Ldec_u_loop
+
+.Ldec_cont:
+	/* store the state: */
+	movdqu STATE0_LO, (0 * 16)(%rdi)
+	movdqu STATE0_HI, (1 * 16)(%rdi)
+	movdqu STATE1_LO, (2 * 16)(%rdi)
+	movdqu STATE1_HI, (3 * 16)(%rdi)
+	movdqu STATE2_LO, (4 * 16)(%rdi)
+	movdqu STATE2_HI, (5 * 16)(%rdi)
+	movdqu STATE3_LO, (6 * 16)(%rdi)
+	movdqu STATE3_HI, (7 * 16)(%rdi)
+	movdqu STATE4_LO, (8 * 16)(%rdi)
+	movdqu STATE4_HI, (9 * 16)(%rdi)
+
+.Ldec_out:
+	FRAME_END
+	ret
+ENDPROC(crypto_morus1280_sse2_dec)
+
+/*
+ * void crypto_morus1280_sse2_dec_tail(void *state, const void *src, void *dst,
+ *                                     unsigned int length);
+ */
+ENTRY(crypto_morus1280_sse2_dec_tail)
+	FRAME_BEGIN
+
+	/* load the state: */
+	movdqu (0 * 16)(%rdi), STATE0_LO
+	movdqu (1 * 16)(%rdi), STATE0_HI
+	movdqu (2 * 16)(%rdi), STATE1_LO
+	movdqu (3 * 16)(%rdi), STATE1_HI
+	movdqu (4 * 16)(%rdi), STATE2_LO
+	movdqu (5 * 16)(%rdi), STATE2_HI
+	movdqu (6 * 16)(%rdi), STATE3_LO
+	movdqu (7 * 16)(%rdi), STATE3_HI
+	movdqu (8 * 16)(%rdi), STATE4_LO
+	movdqu (9 * 16)(%rdi), STATE4_HI
+
+	/* decrypt message: */
+	call __load_partial
+
+	pxor STATE0_LO, MSG_LO
+	pxor STATE0_HI, MSG_HI
+	movdqa STATE1_LO, T1_LO
+	movdqa STATE1_HI, T1_HI
+	rol3 T1_HI, T1_LO
+	pxor T1_LO, MSG_LO
+	pxor T1_HI, MSG_HI
+	movdqa STATE2_LO, T1_LO
+	movdqa STATE2_HI, T1_HI
+	pand STATE3_LO, T1_LO
+	pand STATE3_HI, T1_HI
+	pxor T1_LO, MSG_LO
+	pxor T1_HI, MSG_HI
+	movdqa MSG_LO, T0_LO
+	movdqa MSG_HI, T0_HI
+
+	call __store_partial
+
+	/* mask with byte count: */
+	movq %rcx, T0_LO
+	punpcklbw T0_LO, T0_LO
+	punpcklbw T0_LO, T0_LO
+	punpcklbw T0_LO, T0_LO
+	punpcklbw T0_LO, T0_LO
+	movdqa T0_LO, T0_HI
+	movdqa .Lmorus640_counter_0, T1_LO
+	movdqa .Lmorus640_counter_1, T1_HI
+	pcmpgtb T1_LO, T0_LO
+	pcmpgtb T1_HI, T0_HI
+	pand T0_LO, MSG_LO
+	pand T0_HI, MSG_HI
+
+	call __morus1280_update
+
+	/* store the state: */
+	movdqu STATE0_LO, (0 * 16)(%rdi)
+	movdqu STATE0_HI, (1 * 16)(%rdi)
+	movdqu STATE1_LO, (2 * 16)(%rdi)
+	movdqu STATE1_HI, (3 * 16)(%rdi)
+	movdqu STATE2_LO, (4 * 16)(%rdi)
+	movdqu STATE2_HI, (5 * 16)(%rdi)
+	movdqu STATE3_LO, (6 * 16)(%rdi)
+	movdqu STATE3_HI, (7 * 16)(%rdi)
+	movdqu STATE4_LO, (8 * 16)(%rdi)
+	movdqu STATE4_HI, (9 * 16)(%rdi)
+
+	FRAME_END
+	ret
+ENDPROC(crypto_morus1280_sse2_dec_tail)
+
+/*
+ * void crypto_morus1280_sse2_final(void *state, void *tag_xor,
+ *                                  u64 assoclen, u64 cryptlen);
+ */
+ENTRY(crypto_morus1280_sse2_final)
+	FRAME_BEGIN
+
+	/* load the state: */
+	movdqu (0 * 16)(%rdi), STATE0_LO
+	movdqu (1 * 16)(%rdi), STATE0_HI
+	movdqu (2 * 16)(%rdi), STATE1_LO
+	movdqu (3 * 16)(%rdi), STATE1_HI
+	movdqu (4 * 16)(%rdi), STATE2_LO
+	movdqu (5 * 16)(%rdi), STATE2_HI
+	movdqu (6 * 16)(%rdi), STATE3_LO
+	movdqu (7 * 16)(%rdi), STATE3_HI
+	movdqu (8 * 16)(%rdi), STATE4_LO
+	movdqu (9 * 16)(%rdi), STATE4_HI
+
+	/* xor state[0] into state[4]: */
+	pxor STATE0_LO, STATE4_LO
+	pxor STATE0_HI, STATE4_HI
+
+	/* prepare length block: */
+	movq %rdx, MSG_LO
+	movq %rcx, T0_LO
+	pslldq $8, T0_LO
+	pxor T0_LO, MSG_LO
+	psllq $3, MSG_LO /* multiply by 8 (to get bit count) */
+	pxor MSG_HI, MSG_HI
+
+	/* update state: */
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+	call __morus1280_update
+
+	/* xor tag: */
+	movdqu  0(%rsi), MSG_LO
+	movdqu 16(%rsi), MSG_HI
+
+	pxor STATE0_LO, MSG_LO
+	pxor STATE0_HI, MSG_HI
+	movdqa STATE1_LO, T0_LO
+	movdqa STATE1_HI, T0_HI
+	rol3 T0_HI, T0_LO
+	pxor T0_LO, MSG_LO
+	pxor T0_HI, MSG_HI
+	movdqa STATE2_LO, T0_LO
+	movdqa STATE2_HI, T0_HI
+	pand STATE3_LO, T0_LO
+	pand STATE3_HI, T0_HI
+	pxor T0_LO, MSG_LO
+	pxor T0_HI, MSG_HI
+
+	movdqu MSG_LO,  0(%rsi)
+	movdqu MSG_HI, 16(%rsi)
+
+	FRAME_END
+	ret
+ENDPROC(crypto_morus1280_sse2_final)
diff --git a/arch/x86/crypto/morus1280-sse2-glue.c b/arch/x86/crypto/morus1280-sse2-glue.c
new file mode 100644
index 000000000000..839270aa713c
--- /dev/null
+++ b/arch/x86/crypto/morus1280-sse2-glue.c
@@ -0,0 +1,68 @@ 
+/*
+ * The MORUS-1280 Authenticated-Encryption Algorithm
+ *   Glue for SSE2 implementation
+ *
+ * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/internal/aead.h>
+#include <crypto/morus1280_glue.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/cpu_device_id.h>
+
+asmlinkage void crypto_morus1280_sse2_init(void *state, const void *key,
+					   const void *iv);
+asmlinkage void crypto_morus1280_sse2_ad(void *state, const void *data,
+					 unsigned int length);
+
+asmlinkage void crypto_morus1280_sse2_enc(void *state, const void *src,
+					  void *dst, unsigned int length);
+asmlinkage void crypto_morus1280_sse2_dec(void *state, const void *src,
+					  void *dst, unsigned int length);
+
+asmlinkage void crypto_morus1280_sse2_enc_tail(void *state, const void *src,
+					       void *dst, unsigned int length);
+asmlinkage void crypto_morus1280_sse2_dec_tail(void *state, const void *src,
+					       void *dst, unsigned int length);
+
+asmlinkage void crypto_morus1280_sse2_final(void *state, void *tag_xor,
+					    u64 assoclen, u64 cryptlen);
+
+MORUS1280_DECLARE_ALGS(sse2, "morus1280-sse2", 350);
+
+static const struct x86_cpu_id sse2_cpu_id[] = {
+    X86_FEATURE_MATCH(X86_FEATURE_XMM2),
+    {}
+};
+MODULE_DEVICE_TABLE(x86cpu, sse2_cpu_id);
+
+static int __init crypto_morus1280_sse2_module_init(void)
+{
+	if (!x86_match_cpu(sse2_cpu_id))
+		return -ENODEV;
+
+	return crypto_register_aeads(crypto_morus1280_sse2_algs,
+				     ARRAY_SIZE(crypto_morus1280_sse2_algs));
+}
+
+static void __exit crypto_morus1280_sse2_module_exit(void)
+{
+	crypto_unregister_aeads(crypto_morus1280_sse2_algs,
+				ARRAY_SIZE(crypto_morus1280_sse2_algs));
+}
+
+module_init(crypto_morus1280_sse2_module_init);
+module_exit(crypto_morus1280_sse2_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
+MODULE_DESCRIPTION("MORUS-1280 AEAD algorithm -- SSE2 implementation");
+MODULE_ALIAS_CRYPTO("morus1280");
+MODULE_ALIAS_CRYPTO("morus1280-sse2");
diff --git a/arch/x86/crypto/morus640-sse2-asm.S b/arch/x86/crypto/morus640-sse2-asm.S
new file mode 100644
index 000000000000..71c72a0a0862
--- /dev/null
+++ b/arch/x86/crypto/morus640-sse2-asm.S
@@ -0,0 +1,614 @@ 
+/*
+ * SSE2 implementation of MORUS-640
+ *
+ * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/frame.h>
+
+#define SHUFFLE_MASK(i0, i1, i2, i3) \
+	(i0 | (i1 << 2) | (i2 << 4) | (i3 << 6))
+
+#define MASK1 SHUFFLE_MASK(3, 0, 1, 2)
+#define MASK2 SHUFFLE_MASK(2, 3, 0, 1)
+#define MASK3 SHUFFLE_MASK(1, 2, 3, 0)
+
+#define STATE0	%xmm0
+#define STATE1	%xmm1
+#define STATE2	%xmm2
+#define STATE3	%xmm3
+#define STATE4	%xmm4
+#define KEY	%xmm5
+#define MSG	%xmm5
+#define T0	%xmm6
+#define T1	%xmm7
+
+.section .rodata.cst16.morus640_const, "aM", @progbits, 32
+.align 16
+.Lmorus640_const_0:
+	.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
+	.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
+.Lmorus640_const_1:
+	.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
+	.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd
+
+.section .rodata.cst16.morus640_counter, "aM", @progbits, 16
+.align 16
+.Lmorus640_counter:
+	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+
+.text
+
+.macro morus640_round s0, s1, s2, s3, s4, b, w
+	movdqa \s1, T0
+	pand \s2, T0
+	pxor T0, \s0
+	pxor \s3, \s0
+	movdqa \s0, T0
+	pslld $\b, T0
+	psrld $(32 - \b), \s0
+	pxor T0, \s0
+	pshufd $\w, \s3, \s3
+.endm
+
+/*
+ * __morus640_update: internal ABI
+ * input:
+ *   STATE[0-4] - input state
+ *   MSG        - message block
+ * output:
+ *   STATE[0-4] - output state
+ * changed:
+ *   T0
+ */
+__morus640_update:
+	morus640_round STATE0, STATE1, STATE2, STATE3, STATE4,  5, MASK1
+	pxor MSG, STATE1
+	morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2
+	pxor MSG, STATE2
+	morus640_round STATE2, STATE3, STATE4, STATE0, STATE1,  7, MASK3
+	pxor MSG, STATE3
+	morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2
+	pxor MSG, STATE4
+	morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1
+	ret
+ENDPROC(__morus640_update)
+
+
+/*
+ * __morus640_update_zero: internal ABI
+ * input:
+ *   STATE[0-4] - input state
+ * output:
+ *   STATE[0-4] - output state
+ * changed:
+ *   T0
+ */
+__morus640_update_zero:
+	morus640_round STATE0, STATE1, STATE2, STATE3, STATE4,  5, MASK1
+	morus640_round STATE1, STATE2, STATE3, STATE4, STATE0, 31, MASK2
+	morus640_round STATE2, STATE3, STATE4, STATE0, STATE1,  7, MASK3
+	morus640_round STATE3, STATE4, STATE0, STATE1, STATE2, 22, MASK2
+	morus640_round STATE4, STATE0, STATE1, STATE2, STATE3, 13, MASK1
+	ret
+ENDPROC(__morus640_update_zero)
+
+/*
+ * __load_partial: internal ABI
+ * input:
+ *   %rsi - src
+ *   %rcx - bytes
+ * output:
+ *   MSG  - message block
+ * changed:
+ *   T0
+ *   %r8
+ *   %r9
+ */
+__load_partial:
+	xor %r9, %r9
+	pxor MSG, MSG
+
+	mov %rcx, %r8
+	and $0x1, %r8
+	jz .Lld_partial_1
+
+	mov %rcx, %r8
+	and $0x1E, %r8
+	add %rsi, %r8
+	mov (%r8), %r9b
+
+.Lld_partial_1:
+	mov %rcx, %r8
+	and $0x2, %r8
+	jz .Lld_partial_2
+
+	mov %rcx, %r8
+	and $0x1C, %r8
+	add %rsi, %r8
+	shl $16, %r9
+	mov (%r8), %r9w
+
+.Lld_partial_2:
+	mov %rcx, %r8
+	and $0x4, %r8
+	jz .Lld_partial_4
+
+	mov %rcx, %r8
+	and $0x18, %r8
+	add %rsi, %r8
+	shl $32, %r9
+	mov (%r8), %r8d
+	xor %r8, %r9
+
+.Lld_partial_4:
+	movq %r9, MSG
+
+	mov %rcx, %r8
+	and $0x8, %r8
+	jz .Lld_partial_8
+
+	mov %rcx, %r8
+	and $0x10, %r8
+	add %rsi, %r8
+	pslldq $8, MSG
+	movq (%r8), T0
+	pxor T0, MSG
+
+.Lld_partial_8:
+	ret
+ENDPROC(__load_partial)
+
+/*
+ * __store_partial: internal ABI
+ * input:
+ *   %rdx - dst
+ *   %rcx - bytes
+ * output:
+ *   T0   - message block
+ * changed:
+ *   %r8
+ *   %r9
+ *   %r10
+ */
+__store_partial:
+	mov %rcx, %r8
+	mov %rdx, %r9
+
+	movq T0, %r10
+
+	cmp $8, %r8
+	jl .Lst_partial_8
+
+	mov %r10, (%r9)
+	psrldq $8, T0
+	movq T0, %r10
+
+	sub $8, %r8
+	add $8, %r9
+
+.Lst_partial_8:
+	cmp $4, %r8
+	jl .Lst_partial_4
+
+	mov %r10d, (%r9)
+	shr $32, %r10
+
+	sub $4, %r8
+	add $4, %r9
+
+.Lst_partial_4:
+	cmp $2, %r8
+	jl .Lst_partial_2
+
+	mov %r10w, (%r9)
+	shr $16, %r10
+
+	sub $2, %r8
+	add $2, %r9
+
+.Lst_partial_2:
+	cmp $1, %r8
+	jl .Lst_partial_1
+
+	mov %r10b, (%r9)
+
+.Lst_partial_1:
+	ret
+ENDPROC(__store_partial)
+
+/*
+ * void crypto_morus640_sse2_init(void *state, const void *key, const void *iv);
+ */
+ENTRY(crypto_morus640_sse2_init)
+	FRAME_BEGIN
+
+	/* load IV: */
+	movdqu (%rdx), STATE0
+	/* load key: */
+	movdqu (%rsi), KEY
+	movdqa KEY, STATE1
+	/* load all ones: */
+	pcmpeqd STATE2, STATE2
+	/* load the constants: */
+	movdqa .Lmorus640_const_0, STATE3
+	movdqa .Lmorus640_const_1, STATE4
+
+	/* update 16 times with zero: */
+	call __morus640_update_zero
+	call __morus640_update_zero
+	call __morus640_update_zero
+	call __morus640_update_zero
+	call __morus640_update_zero
+	call __morus640_update_zero
+	call __morus640_update_zero
+	call __morus640_update_zero
+	call __morus640_update_zero
+	call __morus640_update_zero
+	call __morus640_update_zero
+	call __morus640_update_zero
+	call __morus640_update_zero
+	call __morus640_update_zero
+	call __morus640_update_zero
+	call __morus640_update_zero
+
+	/* xor-in the key again after updates: */
+	pxor KEY, STATE1
+
+	/* store the state: */
+	movdqu STATE0, (0 * 16)(%rdi)
+	movdqu STATE1, (1 * 16)(%rdi)
+	movdqu STATE2, (2 * 16)(%rdi)
+	movdqu STATE3, (3 * 16)(%rdi)
+	movdqu STATE4, (4 * 16)(%rdi)
+
+	FRAME_END
+	ret
+ENDPROC(crypto_morus640_sse2_init)
+
+/*
+ * void crypto_morus640_sse2_ad(void *state, const void *data,
+ *                              unsigned int length);
+ */
+ENTRY(crypto_morus640_sse2_ad)
+	FRAME_BEGIN
+
+	cmp $16, %rdx
+	jb .Lad_out
+
+	/* load the state: */
+	movdqu (0 * 16)(%rdi), STATE0
+	movdqu (1 * 16)(%rdi), STATE1
+	movdqu (2 * 16)(%rdi), STATE2
+	movdqu (3 * 16)(%rdi), STATE3
+	movdqu (4 * 16)(%rdi), STATE4
+
+	mov %rsi, %r8
+	and $0xF, %r8
+	jnz .Lad_u_loop
+
+.align 4
+.Lad_a_loop:
+	movdqa (%rsi), MSG
+	call __morus640_update
+	sub $16, %rdx
+	add $16, %rsi
+	cmp $16, %rdx
+	jge .Lad_a_loop
+
+	jmp .Lad_cont
+.align 4
+.Lad_u_loop:
+	movdqu (%rsi), MSG
+	call __morus640_update
+	sub $16, %rdx
+	add $16, %rsi
+	cmp $16, %rdx
+	jge .Lad_u_loop
+
+.Lad_cont:
+	/* store the state: */
+	movdqu STATE0, (0 * 16)(%rdi)
+	movdqu STATE1, (1 * 16)(%rdi)
+	movdqu STATE2, (2 * 16)(%rdi)
+	movdqu STATE3, (3 * 16)(%rdi)
+	movdqu STATE4, (4 * 16)(%rdi)
+
+.Lad_out:
+	FRAME_END
+	ret
+ENDPROC(crypto_morus640_sse2_ad)
+
+/*
+ * void crypto_morus640_sse2_enc(void *state, const void *src, void *dst,
+ *                               unsigned int length);
+ */
+ENTRY(crypto_morus640_sse2_enc)
+	FRAME_BEGIN
+
+	cmp $16, %rcx
+	jb .Lenc_out
+
+	/* load the state: */
+	movdqu (0 * 16)(%rdi), STATE0
+	movdqu (1 * 16)(%rdi), STATE1
+	movdqu (2 * 16)(%rdi), STATE2
+	movdqu (3 * 16)(%rdi), STATE3
+	movdqu (4 * 16)(%rdi), STATE4
+
+	mov %rsi, %r8
+	or  %rdx, %r8
+	and $0xF, %r8
+	jnz .Lenc_u_loop
+
+.align 4
+.Lenc_a_loop:
+	movdqa (%rsi), MSG
+	movdqa MSG, T0
+	pxor STATE0, T0
+	pshufd $MASK3, STATE1, T1
+	pxor T1, T0
+	movdqa STATE2, T1
+	pand STATE3, T1
+	pxor T1, T0
+	movdqa T0, (%rdx)
+
+	call __morus640_update
+	sub $16, %rcx
+	add $16, %rsi
+	add $16, %rdx
+	cmp $16, %rcx
+	jge .Lenc_a_loop
+
+	jmp .Lenc_cont
+.align 4
+.Lenc_u_loop:
+	movdqu (%rsi), MSG
+	movdqa MSG, T0
+	pxor STATE0, T0
+	pshufd $MASK3, STATE1, T1
+	pxor T1, T0
+	movdqa STATE2, T1
+	pand STATE3, T1
+	pxor T1, T0
+	movdqu T0, (%rdx)
+
+	call __morus640_update
+	sub $16, %rcx
+	add $16, %rsi
+	add $16, %rdx
+	cmp $16, %rcx
+	jge .Lenc_u_loop
+
+.Lenc_cont:
+	/* store the state: */
+	movdqu STATE0, (0 * 16)(%rdi)
+	movdqu STATE1, (1 * 16)(%rdi)
+	movdqu STATE2, (2 * 16)(%rdi)
+	movdqu STATE3, (3 * 16)(%rdi)
+	movdqu STATE4, (4 * 16)(%rdi)
+
+.Lenc_out:
+	FRAME_END
+	ret
+ENDPROC(crypto_morus640_sse2_enc)
+
+/*
+ * void crypto_morus640_sse2_enc_tail(void *state, const void *src, void *dst,
+ *                                    unsigned int length);
+ */
+ENTRY(crypto_morus640_sse2_enc_tail)
+	FRAME_BEGIN
+
+	/* load the state: */
+	movdqu (0 * 16)(%rdi), STATE0
+	movdqu (1 * 16)(%rdi), STATE1
+	movdqu (2 * 16)(%rdi), STATE2
+	movdqu (3 * 16)(%rdi), STATE3
+	movdqu (4 * 16)(%rdi), STATE4
+
+	/* encrypt message: */
+	call __load_partial
+
+	movdqa MSG, T0
+	pxor STATE0, T0
+	pshufd $MASK3, STATE1, T1
+	pxor T1, T0
+	movdqa STATE2, T1
+	pand STATE3, T1
+	pxor T1, T0
+
+	call __store_partial
+
+	call __morus640_update
+
+	/* store the state: */
+	movdqu STATE0, (0 * 16)(%rdi)
+	movdqu STATE1, (1 * 16)(%rdi)
+	movdqu STATE2, (2 * 16)(%rdi)
+	movdqu STATE3, (3 * 16)(%rdi)
+	movdqu STATE4, (4 * 16)(%rdi)
+
+	FRAME_END
+ENDPROC(crypto_morus640_sse2_enc_tail)
+
+/*
+ * void crypto_morus640_sse2_dec(void *state, const void *src, void *dst,
+ *                               unsigned int length);
+ */
+ENTRY(crypto_morus640_sse2_dec)
+	FRAME_BEGIN
+
+	cmp $16, %rcx
+	jb .Ldec_out
+
+	/* load the state: */
+	movdqu (0 * 16)(%rdi), STATE0
+	movdqu (1 * 16)(%rdi), STATE1
+	movdqu (2 * 16)(%rdi), STATE2
+	movdqu (3 * 16)(%rdi), STATE3
+	movdqu (4 * 16)(%rdi), STATE4
+
+	mov %rsi, %r8
+	or  %rdx, %r8
+	and $0xF, %r8
+	jnz .Ldec_u_loop
+
+.align 4
+.Ldec_a_loop:
+	movdqa (%rsi), MSG
+	pxor STATE0, MSG
+	pshufd $MASK3, STATE1, T0
+	pxor T0, MSG
+	movdqa STATE2, T0
+	pand STATE3, T0
+	pxor T0, MSG
+	movdqa MSG, (%rdx)
+
+	call __morus640_update
+	sub $16, %rcx
+	add $16, %rsi
+	add $16, %rdx
+	cmp $16, %rcx
+	jge .Ldec_a_loop
+
+	jmp .Ldec_cont
+.align 4
+.Ldec_u_loop:
+	movdqu (%rsi), MSG
+	pxor STATE0, MSG
+	pshufd $MASK3, STATE1, T0
+	pxor T0, MSG
+	movdqa STATE2, T0
+	pand STATE3, T0
+	pxor T0, MSG
+	movdqu MSG, (%rdx)
+
+	call __morus640_update
+	sub $16, %rcx
+	add $16, %rsi
+	add $16, %rdx
+	cmp $16, %rcx
+	jge .Ldec_u_loop
+
+.Ldec_cont:
+	/* store the state: */
+	movdqu STATE0, (0 * 16)(%rdi)
+	movdqu STATE1, (1 * 16)(%rdi)
+	movdqu STATE2, (2 * 16)(%rdi)
+	movdqu STATE3, (3 * 16)(%rdi)
+	movdqu STATE4, (4 * 16)(%rdi)
+
+.Ldec_out:
+	FRAME_END
+	ret
+ENDPROC(crypto_morus640_sse2_dec)
+
+/*
+ * void crypto_morus640_sse2_dec_tail(void *state, const void *src, void *dst,
+ *                                    unsigned int length);
+ */
+ENTRY(crypto_morus640_sse2_dec_tail)
+	FRAME_BEGIN
+
+	/* load the state: */
+	movdqu (0 * 16)(%rdi), STATE0
+	movdqu (1 * 16)(%rdi), STATE1
+	movdqu (2 * 16)(%rdi), STATE2
+	movdqu (3 * 16)(%rdi), STATE3
+	movdqu (4 * 16)(%rdi), STATE4
+
+	/* decrypt message: */
+	call __load_partial
+
+	pxor STATE0, MSG
+	pshufd $MASK3, STATE1, T0
+	pxor T0, MSG
+	movdqa STATE2, T0
+	pand STATE3, T0
+	pxor T0, MSG
+	movdqa MSG, T0
+
+	call __store_partial
+
+	/* mask with byte count: */
+	movq %rcx, T0
+	punpcklbw T0, T0
+	punpcklbw T0, T0
+	punpcklbw T0, T0
+	punpcklbw T0, T0
+	movdqa .Lmorus640_counter, T1
+	pcmpgtb T1, T0
+	pand T0, MSG
+
+	call __morus640_update
+
+	/* store the state: */
+	movdqu STATE0, (0 * 16)(%rdi)
+	movdqu STATE1, (1 * 16)(%rdi)
+	movdqu STATE2, (2 * 16)(%rdi)
+	movdqu STATE3, (3 * 16)(%rdi)
+	movdqu STATE4, (4 * 16)(%rdi)
+
+	FRAME_END
+	ret
+ENDPROC(crypto_morus640_sse2_dec_tail)
+
+/*
+ * void crypto_morus640_sse2_final(void *state, void *tag_xor,
+ *	                           u64 assoclen, u64 cryptlen);
+ */
+ENTRY(crypto_morus640_sse2_final)
+	FRAME_BEGIN
+
+	/* load the state: */
+	movdqu (0 * 16)(%rdi), STATE0
+	movdqu (1 * 16)(%rdi), STATE1
+	movdqu (2 * 16)(%rdi), STATE2
+	movdqu (3 * 16)(%rdi), STATE3
+	movdqu (4 * 16)(%rdi), STATE4
+
+	/* xor state[0] into state[4]: */
+	pxor STATE0, STATE4
+
+	/* prepare length block: */
+	movq %rdx, MSG
+	movq %rcx, T0
+	pslldq $8, T0
+	pxor T0, MSG
+	psllq $3, MSG /* multiply by 8 (to get bit count) */
+
+	/* update state: */
+	call __morus640_update
+	call __morus640_update
+	call __morus640_update
+	call __morus640_update
+	call __morus640_update
+	call __morus640_update
+	call __morus640_update
+	call __morus640_update
+	call __morus640_update
+	call __morus640_update
+
+	/* xor tag: */
+	movdqu (%rsi), MSG
+
+	pxor STATE0, MSG
+	pshufd $MASK3, STATE1, T0
+	pxor T0, MSG
+	movdqa STATE2, T0
+	pand STATE3, T0
+	pxor T0, MSG
+
+	movdqu MSG, (%rsi)
+
+	FRAME_END
+	ret
+ENDPROC(crypto_morus640_sse2_final)
diff --git a/arch/x86/crypto/morus640-sse2-glue.c b/arch/x86/crypto/morus640-sse2-glue.c
new file mode 100644
index 000000000000..26b47e2db8d2
--- /dev/null
+++ b/arch/x86/crypto/morus640-sse2-glue.c
@@ -0,0 +1,68 @@ 
+/*
+ * The MORUS-640 Authenticated-Encryption Algorithm
+ *   Glue for SSE2 implementation
+ *
+ * Copyright (c) 2016-2018 Ondrej Mosnacek <omosnacek@gmail.com>
+ * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <crypto/internal/aead.h>
+#include <crypto/morus640_glue.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/cpu_device_id.h>
+
+asmlinkage void crypto_morus640_sse2_init(void *state, const void *key,
+					  const void *iv);
+asmlinkage void crypto_morus640_sse2_ad(void *state, const void *data,
+					unsigned int length);
+
+asmlinkage void crypto_morus640_sse2_enc(void *state, const void *src,
+					 void *dst, unsigned int length);
+asmlinkage void crypto_morus640_sse2_dec(void *state, const void *src,
+					 void *dst, unsigned int length);
+
+asmlinkage void crypto_morus640_sse2_enc_tail(void *state, const void *src,
+					      void *dst, unsigned int length);
+asmlinkage void crypto_morus640_sse2_dec_tail(void *state, const void *src,
+					      void *dst, unsigned int length);
+
+asmlinkage void crypto_morus640_sse2_final(void *state, void *tag_xor,
+					   u64 assoclen, u64 cryptlen);
+
+MORUS640_DECLARE_ALGS(sse2, "morus640-sse2", 400);
+
+static const struct x86_cpu_id sse2_cpu_id[] = {
+    X86_FEATURE_MATCH(X86_FEATURE_XMM2),
+    {}
+};
+MODULE_DEVICE_TABLE(x86cpu, sse2_cpu_id);
+
+static int __init crypto_morus640_sse2_module_init(void)
+{
+	if (!x86_match_cpu(sse2_cpu_id))
+		return -ENODEV;
+
+	return crypto_register_aeads(crypto_morus640_sse2_algs,
+				     ARRAY_SIZE(crypto_morus640_sse2_algs));
+}
+
+static void __exit crypto_morus640_sse2_module_exit(void)
+{
+	crypto_unregister_aeads(crypto_morus640_sse2_algs,
+				ARRAY_SIZE(crypto_morus640_sse2_algs));
+}
+
+module_init(crypto_morus640_sse2_module_init);
+module_exit(crypto_morus640_sse2_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>");
+MODULE_DESCRIPTION("MORUS-640 AEAD algorithm -- SSE2 implementation");
+MODULE_ALIAS_CRYPTO("morus640");
+MODULE_ALIAS_CRYPTO("morus640-sse2");
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 34c18e242e31..e95b1ae4fc3c 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -303,6 +303,14 @@  config CRYPTO_MORUS640_GLUE
 	  Common glue for SIMD optimizations of the MORUS-640 dedicated AEAD
 	  algorithm.
 
+config CRYPTO_MORUS640_SSE2
+	tristate "MORUS-640 AEAD algorithm (x86_64 SSE2 implementation)"
+	depends on X86 && 64BIT
+	select CRYPTO_AEAD
+	select CRYPTO_MORUS640_GLUE
+	help
+	  SSE2 implementation of the MORUS-640 dedicated AEAD algorithm.
+
 config CRYPTO_MORUS1280
 	tristate "MORUS-1280 AEAD algorithm"
 	select CRYPTO_AEAD
@@ -317,6 +325,24 @@  config CRYPTO_MORUS1280_GLUE
 	  Common glue for SIMD optimizations of the MORUS-1280 dedicated AEAD
 	  algorithm.
 
+config CRYPTO_MORUS1280_SSE2
+	tristate "MORUS-1280 AEAD algorithm (x86_64 SSE2 implementation)"
+	depends on X86 && 64BIT
+	select CRYPTO_AEAD
+	select CRYPTO_MORUS1280_GLUE
+	help
+	  SSE2 optimizedimplementation of the MORUS-1280 dedicated AEAD
+	  algorithm.
+
+config CRYPTO_MORUS1280_AVX2
+	tristate "MORUS-1280 AEAD algorithm (x86_64 AVX2 implementation)"
+	depends on X86 && 64BIT
+	select CRYPTO_AEAD
+	select CRYPTO_MORUS1280_GLUE
+	help
+	  AVX2 optimized implementation of the MORUS-1280 dedicated AEAD
+	  algorithm.
+
 config CRYPTO_SEQIV
 	tristate "Sequence Number IV Generator"
 	select CRYPTO_AEAD