[v2,2/2] crypto: x86/aes-ni-xts - rewrite and drop indirections via glue helper

Message ID 20201231164155.21792-3-ardb@kernel.org (mailing list archive)
State Accepted
Delegated to: Herbert Xu
Series [v2,1/2] crypto: x86/aes-ni-xts - use direct calls to and 4-way stride

Commit Message

Ard Biesheuvel Dec. 31, 2020, 4:41 p.m. UTC
The AES-NI driver implements XTS via the glue helper, which consumes
a struct with sets of function pointers that are invoked on chunks
of input data of the appropriate size, as annotated in the struct.

Let's get rid of this indirection, so that we can perform direct calls
to the assembler helpers. Instead, let's adopt the arm64 strategy, i.e.,
provide a helper which can consume inputs of any size, provided that the
penultimate, full block is passed via the last call if ciphertext stealing
needs to be applied.

This also allows us to enable the XTS mode for i386.

Tested-by: Eric Biggers <ebiggers@google.com> # x86_64
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/x86/crypto/aesni-intel_asm.S  | 280 ++++++++++++++++----
 arch/x86/crypto/aesni-intel_glue.c | 220 ++++++++-------
 crypto/Kconfig                     |   1 -
 3 files changed, 356 insertions(+), 145 deletions(-)
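
[Editor's note] For illustration, the calling convention described in the
commit message (any number of full blocks per call, with the last full block
and the partial tail passed together in the final call when ciphertext
stealing is needed) can be sketched in C as follows. Only the
aesni_xts_encrypt() prototype is taken from the patch; the wrapper name
xts_encrypt_all() is hypothetical, and the real glue code (xts_crypt() in
the diff below) drives the helper through the skcipher walk API rather than
a flat buffer.

#include <linux/types.h>
#include <crypto/aes.h>		/* AES_BLOCK_SIZE, struct crypto_aes_ctx */
#include <crypto/b128ops.h>	/* le128 */

void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
		       const u8 *src, unsigned int len, le128 *iv);

/*
 * Hypothetical wrapper, for illustration only: split a buffer of
 * arbitrary length (len >= AES_BLOCK_SIZE) into one bulk call plus
 * one final call that carries the last full block and the tail.
 */
static void xts_encrypt_all(const struct crypto_aes_ctx *ctx, u8 *dst,
			    const u8 *src, unsigned int len, le128 *iv)
{
	unsigned int tail = len % AES_BLOCK_SIZE;
	unsigned int bulk = tail ? len - tail - AES_BLOCK_SIZE : len;

	if (bulk)		/* any number of full blocks */
		aesni_xts_encrypt(ctx, dst, src, bulk, iv);
	if (tail)		/* last full block + partial tail together */
		aesni_xts_encrypt(ctx, dst + bulk, src + bulk,
				  AES_BLOCK_SIZE + tail, iv);
}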

Comments

kernel test robot Dec. 31, 2020, 8:43 p.m. UTC | #1
Hi Ard,

I love your patch! Yet something to improve:

[auto build test ERROR on cryptodev/master]
[also build test ERROR on crypto/master linus/master v5.11-rc1 next-20201223]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Ard-Biesheuvel/crypto-x86-aes-ni-xts-recover-and-improve-performance/20210101-004902
base:   https://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git master
config: i386-randconfig-c001-20201231 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-15) 9.3.0
reproduce (this is a W=1 build):
        # https://github.com/0day-ci/linux/commit/120e62f276c7436572e8a67ecfb9bbb1125bfd8d
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Ard-Biesheuvel/crypto-x86-aes-ni-xts-recover-and-improve-performance/20210101-004902
        git checkout 120e62f276c7436572e8a67ecfb9bbb1125bfd8d
        # save the attached .config to linux build tree
        make W=1 ARCH=i386 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   ld: arch/x86/crypto/aesni-intel_asm.o: in function `aesni_xts_encrypt':
>> arch/x86/crypto/aesni-intel_asm.S:2844: undefined reference to `.Lcts_permute_table'
   ld: arch/x86/crypto/aesni-intel_asm.o: in function `aesni_xts_decrypt':
   arch/x86/crypto/aesni-intel_asm.S:3006: undefined reference to `.Lcts_permute_table'


vim +2844 arch/x86/crypto/aesni-intel_asm.S

  2702	
  2703	/*
  2704	 * _aesni_gf128mul_x_ble:		internal ABI
  2705	 *	Multiply in GF(2^128) for XTS IVs
  2706	 * input:
  2707	 *	IV:	current IV
  2708	 *	GF128MUL_MASK == mask with 0x87 and 0x01
  2709	 * output:
  2710	 *	IV:	next IV
  2711	 * changed:
  2712	 *	CTR:	== temporary value
  2713	 */
  2714	#define _aesni_gf128mul_x_ble() \
  2715		pshufd $0x13, IV, KEY; \
  2716		paddq IV, IV; \
  2717		psrad $31, KEY; \
  2718		pand GF128MUL_MASK, KEY; \
  2719		pxor KEY, IV;
  2720	
  2721	/*
  2722	 * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
  2723	 *			  const u8 *src, unsigned int len, le128 *iv)
  2724	 */
  2725	SYM_FUNC_START(aesni_xts_encrypt)
  2726		FRAME_BEGIN
  2727	#ifndef __x86_64__
  2728		pushl IVP
  2729		pushl LEN
  2730		pushl KEYP
  2731		pushl KLEN
  2732		movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
  2733		movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
  2734		movl (FRAME_OFFSET+28)(%esp), INP	# src
  2735		movl (FRAME_OFFSET+32)(%esp), LEN	# len
  2736		movl (FRAME_OFFSET+36)(%esp), IVP	# iv
  2737		movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
  2738	#else
  2739		movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
  2740	#endif
  2741		movups (IVP), IV
  2742	
  2743		mov 480(KEYP), KLEN
  2744	
  2745	.Lxts_enc_loop4:
  2746		sub $64, LEN
  2747		jl .Lxts_enc_1x
  2748	
  2749		movdqa IV, STATE1
  2750		movdqu 0x00(INP), IN
  2751		pxor IN, STATE1
  2752		movdqu IV, 0x00(OUTP)
  2753	
  2754		_aesni_gf128mul_x_ble()
  2755		movdqa IV, STATE2
  2756		movdqu 0x10(INP), IN
  2757		pxor IN, STATE2
  2758		movdqu IV, 0x10(OUTP)
  2759	
  2760		_aesni_gf128mul_x_ble()
  2761		movdqa IV, STATE3
  2762		movdqu 0x20(INP), IN
  2763		pxor IN, STATE3
  2764		movdqu IV, 0x20(OUTP)
  2765	
  2766		_aesni_gf128mul_x_ble()
  2767		movdqa IV, STATE4
  2768		movdqu 0x30(INP), IN
  2769		pxor IN, STATE4
  2770		movdqu IV, 0x30(OUTP)
  2771	
  2772		call _aesni_enc4
  2773	
  2774		movdqu 0x00(OUTP), IN
  2775		pxor IN, STATE1
  2776		movdqu STATE1, 0x00(OUTP)
  2777	
  2778		movdqu 0x10(OUTP), IN
  2779		pxor IN, STATE2
  2780		movdqu STATE2, 0x10(OUTP)
  2781	
  2782		movdqu 0x20(OUTP), IN
  2783		pxor IN, STATE3
  2784		movdqu STATE3, 0x20(OUTP)
  2785	
  2786		movdqu 0x30(OUTP), IN
  2787		pxor IN, STATE4
  2788		movdqu STATE4, 0x30(OUTP)
  2789	
  2790		_aesni_gf128mul_x_ble()
  2791	
  2792		add $64, INP
  2793		add $64, OUTP
  2794		test LEN, LEN
  2795		jnz .Lxts_enc_loop4
  2796	
  2797	.Lxts_enc_ret_iv:
  2798		movups IV, (IVP)
  2799	
  2800	.Lxts_enc_ret:
  2801	#ifndef __x86_64__
  2802		popl KLEN
  2803		popl KEYP
  2804		popl LEN
  2805		popl IVP
  2806	#endif
  2807		FRAME_END
  2808		ret
  2809	
  2810	.Lxts_enc_1x:
  2811		add $64, LEN
  2812		jz .Lxts_enc_ret_iv
  2813		sub $16, LEN
  2814		jl .Lxts_enc_cts4
  2815	
  2816	.Lxts_enc_loop1:
  2817		movdqu (INP), STATE
  2818		pxor IV, STATE
  2819		call _aesni_enc1
  2820		pxor IV, STATE
  2821		_aesni_gf128mul_x_ble()
  2822	
  2823		test LEN, LEN
  2824		jz .Lxts_enc_out
  2825	
  2826		add $16, INP
  2827		sub $16, LEN
  2828		jl .Lxts_enc_cts1
  2829	
  2830		movdqu STATE, (OUTP)
  2831		add $16, OUTP
  2832		jmp .Lxts_enc_loop1
  2833	
  2834	.Lxts_enc_out:
  2835		movdqu STATE, (OUTP)
  2836		jmp .Lxts_enc_ret_iv
  2837	
  2838	.Lxts_enc_cts4:
  2839		movdqa STATE4, STATE
  2840		sub $16, OUTP
  2841	
  2842	.Lxts_enc_cts1:
  2843	#ifndef __x86_64__
> 2844		lea .Lcts_permute_table, T1
  2845	#else
  2846		lea .Lcts_permute_table(%rip), T1
  2847	#endif
  2848		add LEN, INP		/* rewind input pointer */
  2849		add $16, LEN		/* # bytes in final block */
  2850		movups (INP), IN1
  2851	
  2852		mov T1, IVP
  2853		add $32, IVP
  2854		add LEN, T1
  2855		sub LEN, IVP
  2856		add OUTP, LEN
  2857	
  2858		movups (T1), %xmm4
  2859		movaps STATE, IN2
  2860		pshufb %xmm4, STATE
  2861		movups STATE, (LEN)
  2862	
  2863		movups (IVP), %xmm0
  2864		pshufb %xmm0, IN1
  2865		pblendvb IN2, IN1
  2866		movaps IN1, STATE
  2867	
  2868		pxor IV, STATE
  2869		call _aesni_enc1
  2870		pxor IV, STATE
  2871	
  2872		movups STATE, (OUTP)
  2873		jmp .Lxts_enc_ret
  2874	SYM_FUNC_END(aesni_xts_encrypt)
  2875	
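
[Editor's note] As a side note for readers of the excerpt above: the
GF(2^128) doubling that _aesni_gf128mul_x_ble() applies to the XTS tweak
corresponds roughly to the C sketch below. The function name xts_mul_x()
and the plain uint64_t representation are illustrative only; they are not
the kernel's le128 helpers.

#include <stdint.h>

/*
 * Rough C equivalent of _aesni_gf128mul_x_ble: shift the 128-bit tweak
 * left by one bit and, if a bit falls off the top, fold it back in with
 * the XTS reduction polynomial 0x87.  t[0] holds the low 64 bits.
 */
static void xts_mul_x(uint64_t t[2])
{
	uint64_t carry = t[1] >> 63;		/* bit shifted out of the top */

	t[1] = (t[1] << 1) | (t[0] >> 63);
	t[0] = (t[0] << 1) ^ (carry * 0x87);
}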

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
kernel test robot Dec. 31, 2020, 9:46 p.m. UTC | #2
Hi Ard,

I love your patch! Yet something to improve:

[auto build test ERROR on cryptodev/master]
[also build test ERROR on crypto/master linus/master v5.11-rc1 next-20201223]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Ard-Biesheuvel/crypto-x86-aes-ni-xts-recover-and-improve-performance/20210101-004902
base:   https://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git master
config: x86_64-randconfig-a002-20201231 (attached as .config)
compiler: clang version 12.0.0 (https://github.com/llvm/llvm-project 6b316febb4388764789677f81f03aff373ec35b2)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install x86_64 cross compiling tool for clang build
        # apt-get install binutils-x86-64-linux-gnu
        # https://github.com/0day-ci/linux/commit/120e62f276c7436572e8a67ecfb9bbb1125bfd8d
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Ard-Biesheuvel/crypto-x86-aes-ni-xts-recover-and-improve-performance/20210101-004902
        git checkout 120e62f276c7436572e8a67ecfb9bbb1125bfd8d
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

>> ld.lld: error: undefined symbol: .Lcts_permute_table
   >>> referenced by aesni-intel_asm.S:2846 (arch/x86/crypto/aesni-intel_asm.S:2846)
   >>>               crypto/aesni-intel_asm.o:(aesni_xts_encrypt) in archive arch/x86/built-in.a
   >>> referenced by aesni-intel_asm.S:3008 (arch/x86/crypto/aesni-intel_asm.S:3008)
   >>>               crypto/aesni-intel_asm.o:(aesni_xts_decrypt) in archive arch/x86/built-in.a
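
[Editor's note] For context on the symbol these errors refer to:
.Lcts_permute_table drives the pshufb-based ciphertext stealing at the end
of aesni_xts_encrypt()/aesni_xts_decrypt(); the series depends on the
cts(cbc(aes)) patch that provides it, as noted in the follow-up below. A
rough C illustration of that final-block handling on the encrypt side is
sketched here; xts_cts_final() and xts_enc_block() are hypothetical names,
and the assembly does the byte rearrangement with pshufb and the permute
table rather than memcpy.

#include <stdint.h>
#include <string.h>

/*
 * Hypothetical sketch of XTS ciphertext stealing for the final
 * 16 + tail bytes (1 <= tail <= 15).  xts_enc_block() stands in for a
 * single AES-XTS block encryption under the given tweak; tweak1 is the
 * tweak of the last full block, tweak2 the next one.
 */
static void xts_cts_final(uint8_t *dst, const uint8_t *src, unsigned int tail,
			  const uint8_t tweak1[16], const uint8_t tweak2[16],
			  void (*xts_enc_block)(uint8_t out[16],
						const uint8_t in[16],
						const uint8_t tweak[16]))
{
	uint8_t b[16];

	xts_enc_block(b, src, tweak1);	/* encrypt the last full block */
	memcpy(dst + 16, b, tail);	/* its head becomes the short tail */
	memcpy(b, src + 16, tail);	/* steal the partial plaintext bytes */
	xts_enc_block(dst, b, tweak2);	/* re-encrypt the combined block */
}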

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
kernel test robot Dec. 31, 2020, 10:37 p.m. UTC | #3
Hi Ard,

I love your patch! Yet something to improve:

[auto build test ERROR on cryptodev/master]
[also build test ERROR on crypto/master linus/master v5.11-rc1 next-20201223]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Ard-Biesheuvel/crypto-x86-aes-ni-xts-recover-and-improve-performance/20210101-004902
base:   https://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git master
config: x86_64-allyesconfig (attached as .config)
compiler: gcc-9 (Debian 9.3.0-15) 9.3.0
reproduce (this is a W=1 build):
        # https://github.com/0day-ci/linux/commit/120e62f276c7436572e8a67ecfb9bbb1125bfd8d
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Ard-Biesheuvel/crypto-x86-aes-ni-xts-recover-and-improve-performance/20210101-004902
        git checkout 120e62f276c7436572e8a67ecfb9bbb1125bfd8d
        # save the attached .config to linux build tree
        make W=1 ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   ld: arch/x86/crypto/aesni-intel_asm.o: in function `aesni_xts_encrypt':
>> (.text+0x8909): undefined reference to `.Lcts_permute_table'
   ld: arch/x86/crypto/aesni-intel_asm.o: in function `aesni_xts_decrypt':
   (.text+0x8af6): undefined reference to `.Lcts_permute_table'

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Ard Biesheuvel Jan. 3, 2021, 8:31 p.m. UTC | #4
On Thu, 31 Dec 2020 at 23:37, kernel test robot <lkp@intel.com> wrote:
>
> Hi Ard,
>
> I love your patch! Yet something to improve:
>
> [auto build test ERROR on cryptodev/master]
> [also build test ERROR on crypto/master linus/master v5.11-rc1 next-20201223]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch]
>

This is a false positive, and the cover letter mentions that these
patches depend on the cts(cbc(aes)) patch, which is now in the
cryptodev tree.

I will try to remember to use --base next time.


> url:    https://github.com/0day-ci/linux/commits/Ard-Biesheuvel/crypto-x86-aes-ni-xts-recover-and-improve-performance/20210101-004902
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git master
> config: x86_64-allyesconfig (attached as .config)
> compiler: gcc-9 (Debian 9.3.0-15) 9.3.0
> reproduce (this is a W=1 build):
>         # https://github.com/0day-ci/linux/commit/120e62f276c7436572e8a67ecfb9bbb1125bfd8d
>         git remote add linux-review https://github.com/0day-ci/linux
>         git fetch --no-tags linux-review Ard-Biesheuvel/crypto-x86-aes-ni-xts-recover-and-improve-performance/20210101-004902
>         git checkout 120e62f276c7436572e8a67ecfb9bbb1125bfd8d
>         # save the attached .config to linux build tree
>         make W=1 ARCH=x86_64
>
> If you fix the issue, kindly add following tag as appropriate
> Reported-by: kernel test robot <lkp@intel.com>
>
> All errors (new ones prefixed by >>):
>
>    ld: arch/x86/crypto/aesni-intel_asm.o: in function `aesni_xts_encrypt':
> >> (.text+0x8909): undefined reference to `.Lcts_permute_table'
>    ld: arch/x86/crypto/aesni-intel_asm.o: in function `aesni_xts_decrypt':
>    (.text+0x8af6): undefined reference to `.Lcts_permute_table'
>
> ---
> 0-DAY CI Kernel Test Service, Intel Corporation
> https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Chen, Rong A Jan. 4, 2021, 5:44 a.m. UTC | #5
On 1/4/2021 4:31 AM, Ard Biesheuvel wrote:
> On Thu, 31 Dec 2020 at 23:37, kernel test robot <lkp@intel.com> wrote:
>>
>> Hi Ard,
>>
>> I love your patch! Yet something to improve:
>>
>> [auto build test ERROR on cryptodev/master]
>> [also build test ERROR on crypto/master linus/master v5.11-rc1 next-20201223]
>> [If your patch is applied to the wrong git tree, kindly drop us a note.
>> And when submitting patch, we suggest to use '--base' as documented in
>> https://git-scm.com/docs/git-format-patch]
>>
> 
> This is a false positive, and the cover letter mentions that these
> patches depend on the cts(cbc(aes)) patch, which is now in the
> cryptodev tree.

Hi Ard,

Thanks for the clarification; the bot doesn't yet support analyzing
the base patch from the cover letter.

> 
> I will try to remember to use --base next time.

Thanks a lot!

Best Regards,
Rong Chen

> 
> 
>> url:    https://github.com/0day-ci/linux/commits/Ard-Biesheuvel/crypto-x86-aes-ni-xts-recover-and-improve-performance/20210101-004902
>> base:   https://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git master
>> config: x86_64-allyesconfig (attached as .config)
>> compiler: gcc-9 (Debian 9.3.0-15) 9.3.0
>> reproduce (this is a W=1 build):
>>          # https://github.com/0day-ci/linux/commit/120e62f276c7436572e8a67ecfb9bbb1125bfd8d
>>          git remote add linux-review https://github.com/0day-ci/linux
>>          git fetch --no-tags linux-review Ard-Biesheuvel/crypto-x86-aes-ni-xts-recover-and-improve-performance/20210101-004902
>>          git checkout 120e62f276c7436572e8a67ecfb9bbb1125bfd8d
>>          # save the attached .config to linux build tree
>>          make W=1 ARCH=x86_64
>>
>> If you fix the issue, kindly add following tag as appropriate
>> Reported-by: kernel test robot <lkp@intel.com>
>>
>> All errors (new ones prefixed by >>):
>>
>>     ld: arch/x86/crypto/aesni-intel_asm.o: in function `aesni_xts_encrypt':
>>>> (.text+0x8909): undefined reference to `.Lcts_permute_table'
>>     ld: arch/x86/crypto/aesni-intel_asm.o: in function `aesni_xts_decrypt':
>>     (.text+0x8af6): undefined reference to `.Lcts_permute_table'
>>
>> ---
>> 0-DAY CI Kernel Test Service, Intel Corporation
>> https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

Patch

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 84d8a156cdcd..4e3972570916 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -43,10 +43,6 @@ 
 #ifdef __x86_64__
 
 # constants in mergeable sections, linker can reorder and merge
-.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
-.align 16
-.Lgf128mul_x_ble_mask:
-	.octa 0x00000000000000010000000000000087
 .section	.rodata.cst16.POLY, "aM", @progbits, 16
 .align 16
 POLY:   .octa 0xC2000000000000000000000000000001
@@ -146,7 +142,7 @@  ALL_F:      .octa 0xffffffffffffffffffffffffffffffff
 #define CTR	%xmm11
 #define INC	%xmm12
 
-#define GF128MUL_MASK %xmm10
+#define GF128MUL_MASK %xmm7
 
 #ifdef __x86_64__
 #define AREG	%rax
@@ -2823,6 +2819,14 @@  SYM_FUNC_START(aesni_ctr_enc)
 	ret
 SYM_FUNC_END(aesni_ctr_enc)
 
+#endif
+
+.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
+.align 16
+.Lgf128mul_x_ble_mask:
+	.octa 0x00000000000000010000000000000087
+.previous
+
 /*
  * _aesni_gf128mul_x_ble:		internal ABI
  *	Multiply in GF(2^128) for XTS IVs
@@ -2835,11 +2839,11 @@  SYM_FUNC_END(aesni_ctr_enc)
  *	CTR:	== temporary value
  */
 #define _aesni_gf128mul_x_ble() \
-	pshufd $0x13, IV, CTR; \
+	pshufd $0x13, IV, KEY; \
 	paddq IV, IV; \
-	psrad $31, CTR; \
-	pand GF128MUL_MASK, CTR; \
-	pxor CTR, IV;
+	psrad $31, KEY; \
+	pand GF128MUL_MASK, KEY; \
+	pxor KEY, IV;
 
 /*
  * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
@@ -2847,65 +2851,153 @@  SYM_FUNC_END(aesni_ctr_enc)
  */
 SYM_FUNC_START(aesni_xts_encrypt)
 	FRAME_BEGIN
-
+#ifndef __x86_64__
+	pushl IVP
+	pushl LEN
+	pushl KEYP
+	pushl KLEN
+	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
+	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
+	movl (FRAME_OFFSET+28)(%esp), INP	# src
+	movl (FRAME_OFFSET+32)(%esp), LEN	# len
+	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
 	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
+#else
+	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
+#endif
 	movups (IVP), IV
 
 	mov 480(KEYP), KLEN
 
 .Lxts_enc_loop4:
+	sub $64, LEN
+	jl .Lxts_enc_1x
+
 	movdqa IV, STATE1
-	movdqu 0x00(INP), INC
-	pxor INC, STATE1
+	movdqu 0x00(INP), IN
+	pxor IN, STATE1
 	movdqu IV, 0x00(OUTP)
 
 	_aesni_gf128mul_x_ble()
 	movdqa IV, STATE2
-	movdqu 0x10(INP), INC
-	pxor INC, STATE2
+	movdqu 0x10(INP), IN
+	pxor IN, STATE2
 	movdqu IV, 0x10(OUTP)
 
 	_aesni_gf128mul_x_ble()
 	movdqa IV, STATE3
-	movdqu 0x20(INP), INC
-	pxor INC, STATE3
+	movdqu 0x20(INP), IN
+	pxor IN, STATE3
 	movdqu IV, 0x20(OUTP)
 
 	_aesni_gf128mul_x_ble()
 	movdqa IV, STATE4
-	movdqu 0x30(INP), INC
-	pxor INC, STATE4
+	movdqu 0x30(INP), IN
+	pxor IN, STATE4
 	movdqu IV, 0x30(OUTP)
 
 	call _aesni_enc4
 
-	movdqu 0x00(OUTP), INC
-	pxor INC, STATE1
+	movdqu 0x00(OUTP), IN
+	pxor IN, STATE1
 	movdqu STATE1, 0x00(OUTP)
 
-	movdqu 0x10(OUTP), INC
-	pxor INC, STATE2
+	movdqu 0x10(OUTP), IN
+	pxor IN, STATE2
 	movdqu STATE2, 0x10(OUTP)
 
-	movdqu 0x20(OUTP), INC
-	pxor INC, STATE3
+	movdqu 0x20(OUTP), IN
+	pxor IN, STATE3
 	movdqu STATE3, 0x20(OUTP)
 
-	movdqu 0x30(OUTP), INC
-	pxor INC, STATE4
+	movdqu 0x30(OUTP), IN
+	pxor IN, STATE4
 	movdqu STATE4, 0x30(OUTP)
 
 	_aesni_gf128mul_x_ble()
 
 	add $64, INP
 	add $64, OUTP
-	sub $64, LEN
-	ja .Lxts_enc_loop4
+	test LEN, LEN
+	jnz .Lxts_enc_loop4
 
+.Lxts_enc_ret_iv:
 	movups IV, (IVP)
 
+.Lxts_enc_ret:
+#ifndef __x86_64__
+	popl KLEN
+	popl KEYP
+	popl LEN
+	popl IVP
+#endif
 	FRAME_END
 	ret
+
+.Lxts_enc_1x:
+	add $64, LEN
+	jz .Lxts_enc_ret_iv
+	sub $16, LEN
+	jl .Lxts_enc_cts4
+
+.Lxts_enc_loop1:
+	movdqu (INP), STATE
+	pxor IV, STATE
+	call _aesni_enc1
+	pxor IV, STATE
+	_aesni_gf128mul_x_ble()
+
+	test LEN, LEN
+	jz .Lxts_enc_out
+
+	add $16, INP
+	sub $16, LEN
+	jl .Lxts_enc_cts1
+
+	movdqu STATE, (OUTP)
+	add $16, OUTP
+	jmp .Lxts_enc_loop1
+
+.Lxts_enc_out:
+	movdqu STATE, (OUTP)
+	jmp .Lxts_enc_ret_iv
+
+.Lxts_enc_cts4:
+	movdqa STATE4, STATE
+	sub $16, OUTP
+
+.Lxts_enc_cts1:
+#ifndef __x86_64__
+	lea .Lcts_permute_table, T1
+#else
+	lea .Lcts_permute_table(%rip), T1
+#endif
+	add LEN, INP		/* rewind input pointer */
+	add $16, LEN		/* # bytes in final block */
+	movups (INP), IN1
+
+	mov T1, IVP
+	add $32, IVP
+	add LEN, T1
+	sub LEN, IVP
+	add OUTP, LEN
+
+	movups (T1), %xmm4
+	movaps STATE, IN2
+	pshufb %xmm4, STATE
+	movups STATE, (LEN)
+
+	movups (IVP), %xmm0
+	pshufb %xmm0, IN1
+	pblendvb IN2, IN1
+	movaps IN1, STATE
+
+	pxor IV, STATE
+	call _aesni_enc1
+	pxor IV, STATE
+
+	movups STATE, (OUTP)
+	jmp .Lxts_enc_ret
 SYM_FUNC_END(aesni_xts_encrypt)
 
 /*
@@ -2914,66 +3006,158 @@  SYM_FUNC_END(aesni_xts_encrypt)
  */
 SYM_FUNC_START(aesni_xts_decrypt)
 	FRAME_BEGIN
-
+#ifndef __x86_64__
+	pushl IVP
+	pushl LEN
+	pushl KEYP
+	pushl KLEN
+	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
+	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
+	movl (FRAME_OFFSET+28)(%esp), INP	# src
+	movl (FRAME_OFFSET+32)(%esp), LEN	# len
+	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
 	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
+#else
+	movdqa .Lgf128mul_x_ble_mask(%rip), GF128MUL_MASK
+#endif
 	movups (IVP), IV
 
 	mov 480(KEYP), KLEN
 	add $240, KEYP
 
+	test $15, LEN
+	jz .Lxts_dec_loop4
+	sub $16, LEN
+
 .Lxts_dec_loop4:
+	sub $64, LEN
+	jl .Lxts_dec_1x
+
 	movdqa IV, STATE1
-	movdqu 0x00(INP), INC
-	pxor INC, STATE1
+	movdqu 0x00(INP), IN
+	pxor IN, STATE1
 	movdqu IV, 0x00(OUTP)
 
 	_aesni_gf128mul_x_ble()
 	movdqa IV, STATE2
-	movdqu 0x10(INP), INC
-	pxor INC, STATE2
+	movdqu 0x10(INP), IN
+	pxor IN, STATE2
 	movdqu IV, 0x10(OUTP)
 
 	_aesni_gf128mul_x_ble()
 	movdqa IV, STATE3
-	movdqu 0x20(INP), INC
-	pxor INC, STATE3
+	movdqu 0x20(INP), IN
+	pxor IN, STATE3
 	movdqu IV, 0x20(OUTP)
 
 	_aesni_gf128mul_x_ble()
 	movdqa IV, STATE4
-	movdqu 0x30(INP), INC
-	pxor INC, STATE4
+	movdqu 0x30(INP), IN
+	pxor IN, STATE4
 	movdqu IV, 0x30(OUTP)
 
 	call _aesni_dec4
 
-	movdqu 0x00(OUTP), INC
-	pxor INC, STATE1
+	movdqu 0x00(OUTP), IN
+	pxor IN, STATE1
 	movdqu STATE1, 0x00(OUTP)
 
-	movdqu 0x10(OUTP), INC
-	pxor INC, STATE2
+	movdqu 0x10(OUTP), IN
+	pxor IN, STATE2
 	movdqu STATE2, 0x10(OUTP)
 
-	movdqu 0x20(OUTP), INC
-	pxor INC, STATE3
+	movdqu 0x20(OUTP), IN
+	pxor IN, STATE3
 	movdqu STATE3, 0x20(OUTP)
 
-	movdqu 0x30(OUTP), INC
-	pxor INC, STATE4
+	movdqu 0x30(OUTP), IN
+	pxor IN, STATE4
 	movdqu STATE4, 0x30(OUTP)
 
 	_aesni_gf128mul_x_ble()
 
 	add $64, INP
 	add $64, OUTP
-	sub $64, LEN
-	ja .Lxts_dec_loop4
+	test LEN, LEN
+	jnz .Lxts_dec_loop4
 
+.Lxts_dec_ret_iv:
 	movups IV, (IVP)
 
+.Lxts_dec_ret:
+#ifndef __x86_64__
+	popl KLEN
+	popl KEYP
+	popl LEN
+	popl IVP
+#endif
 	FRAME_END
 	ret
-SYM_FUNC_END(aesni_xts_decrypt)
 
+.Lxts_dec_1x:
+	add $64, LEN
+	jz .Lxts_dec_ret_iv
+
+.Lxts_dec_loop1:
+	movdqu (INP), STATE
+
+	add $16, INP
+	sub $16, LEN
+	jl .Lxts_dec_cts1
+
+	pxor IV, STATE
+	call _aesni_dec1
+	pxor IV, STATE
+	_aesni_gf128mul_x_ble()
+
+	test LEN, LEN
+	jz .Lxts_dec_out
+
+	movdqu STATE, (OUTP)
+	add $16, OUTP
+	jmp .Lxts_dec_loop1
+
+.Lxts_dec_out:
+	movdqu STATE, (OUTP)
+	jmp .Lxts_dec_ret_iv
+
+.Lxts_dec_cts1:
+	movdqa IV, STATE4
+	_aesni_gf128mul_x_ble()
+
+	pxor IV, STATE
+	call _aesni_dec1
+	pxor IV, STATE
+
+#ifndef __x86_64__
+	lea .Lcts_permute_table, T1
+#else
+	lea .Lcts_permute_table(%rip), T1
 #endif
+	add LEN, INP		/* rewind input pointer */
+	add $16, LEN		/* # bytes in final block */
+	movups (INP), IN1
+
+	mov T1, IVP
+	add $32, IVP
+	add LEN, T1
+	sub LEN, IVP
+	add OUTP, LEN
+
+	movups (T1), %xmm4
+	movaps STATE, IN2
+	pshufb %xmm4, STATE
+	movups STATE, (LEN)
+
+	movups (IVP), %xmm0
+	pshufb %xmm0, IN1
+	pblendvb IN2, IN1
+	movaps IN1, STATE
+
+	pxor STATE4, STATE
+	call _aesni_dec1
+	pxor STATE4, STATE
+
+	movups STATE, (OUTP)
+	jmp .Lxts_dec_ret
+SYM_FUNC_END(aesni_xts_decrypt)
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 84e3ed49b35d..2116bc2b9507 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -33,9 +33,6 @@ 
 #include <crypto/internal/skcipher.h>
 #include <linux/workqueue.h>
 #include <linux/spinlock.h>
-#ifdef CONFIG_X86_64
-#include <asm/crypto/glue_helper.h>
-#endif
 
 
 #define AESNI_ALIGN	16
@@ -632,98 +629,6 @@  static int ctr_crypt(struct skcipher_request *req)
 	return err;
 }
 
-static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key,
-			    unsigned int keylen)
-{
-	struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-	int err;
-
-	err = xts_verify_key(tfm, key, keylen);
-	if (err)
-		return err;
-
-	keylen /= 2;
-
-	/* first half of xts-key is for crypt */
-	err = aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_crypt_ctx,
-				 key, keylen);
-	if (err)
-		return err;
-
-	/* second half of xts-key is for tweak */
-	return aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_tweak_ctx,
-				  key + keylen, keylen);
-}
-
-
-static void aesni_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
-{
-	glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_enc);
-}
-
-static void aesni_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
-{
-	glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_dec);
-}
-
-static void aesni_xts_enc32(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
-{
-	aesni_xts_encrypt(ctx, dst, src, 32 * AES_BLOCK_SIZE, (u8 *)iv);
-}
-
-static void aesni_xts_dec32(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
-{
-	aesni_xts_decrypt(ctx, dst, src, 32 * AES_BLOCK_SIZE, (u8 *)iv);
-}
-
-static const struct common_glue_ctx aesni_enc_xts = {
-	.num_funcs = 2,
-	.fpu_blocks_limit = 1,
-
-	.funcs = { {
-		.num_blocks = 32,
-		.fn_u = { .xts = aesni_xts_enc32 }
-	}, {
-		.num_blocks = 1,
-		.fn_u = { .xts = aesni_xts_enc }
-	} }
-};
-
-static const struct common_glue_ctx aesni_dec_xts = {
-	.num_funcs = 2,
-	.fpu_blocks_limit = 1,
-
-	.funcs = { {
-		.num_blocks = 32,
-		.fn_u = { .xts = aesni_xts_dec32 }
-	}, {
-		.num_blocks = 1,
-		.fn_u = { .xts = aesni_xts_dec }
-	} }
-};
-
-static int xts_encrypt(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-
-	return glue_xts_req_128bit(&aesni_enc_xts, req, aesni_enc,
-				   aes_ctx(ctx->raw_tweak_ctx),
-				   aes_ctx(ctx->raw_crypt_ctx),
-				   false);
-}
-
-static int xts_decrypt(struct skcipher_request *req)
-{
-	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
-	struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
-
-	return glue_xts_req_128bit(&aesni_dec_xts, req, aesni_enc,
-				   aes_ctx(ctx->raw_tweak_ctx),
-				   aes_ctx(ctx->raw_crypt_ctx),
-				   true);
-}
-
 static int
 rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
 {
@@ -996,6 +901,128 @@  static int helper_rfc4106_decrypt(struct aead_request *req)
 }
 #endif
 
+static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key,
+			    unsigned int keylen)
+{
+	struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+	int err;
+
+	err = xts_verify_key(tfm, key, keylen);
+	if (err)
+		return err;
+
+	keylen /= 2;
+
+	/* first half of xts-key is for crypt */
+	err = aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_crypt_ctx,
+				 key, keylen);
+	if (err)
+		return err;
+
+	/* second half of xts-key is for tweak */
+	return aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_tweak_ctx,
+				  key + keylen, keylen);
+}
+
+static int xts_crypt(struct skcipher_request *req, bool encrypt)
+{
+	struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+	struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+	int tail = req->cryptlen % AES_BLOCK_SIZE;
+	struct skcipher_request subreq;
+	struct skcipher_walk walk;
+	int err;
+
+	if (req->cryptlen < AES_BLOCK_SIZE)
+		return -EINVAL;
+
+	err = skcipher_walk_virt(&walk, req, false);
+
+	if (unlikely(tail > 0 && walk.nbytes < walk.total)) {
+		int blocks = DIV_ROUND_UP(req->cryptlen, AES_BLOCK_SIZE) - 2;
+
+		skcipher_walk_abort(&walk);
+
+		skcipher_request_set_tfm(&subreq, tfm);
+		skcipher_request_set_callback(&subreq,
+					      skcipher_request_flags(req),
+					      NULL, NULL);
+		skcipher_request_set_crypt(&subreq, req->src, req->dst,
+					   blocks * AES_BLOCK_SIZE, req->iv);
+		req = &subreq;
+		err = skcipher_walk_virt(&walk, req, false);
+	} else {
+		tail = 0;
+	}
+
+	kernel_fpu_begin();
+
+	/* calculate first value of T */
+	aesni_enc(aes_ctx(ctx->raw_tweak_ctx), walk.iv, walk.iv);
+
+	while (walk.nbytes > 0) {
+		int nbytes = walk.nbytes;
+
+		if (nbytes < walk.total)
+			nbytes &= ~(AES_BLOCK_SIZE - 1);
+
+		if (encrypt)
+			aesni_xts_encrypt(aes_ctx(ctx->raw_crypt_ctx),
+					  walk.dst.virt.addr, walk.src.virt.addr,
+					  nbytes, walk.iv);
+		else
+			aesni_xts_decrypt(aes_ctx(ctx->raw_crypt_ctx),
+					  walk.dst.virt.addr, walk.src.virt.addr,
+					  nbytes, walk.iv);
+		kernel_fpu_end();
+
+		err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+
+		if (walk.nbytes > 0)
+			kernel_fpu_begin();
+	}
+
+	if (unlikely(tail > 0 && !err)) {
+		struct scatterlist sg_src[2], sg_dst[2];
+		struct scatterlist *src, *dst;
+
+		dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen);
+		if (req->dst != req->src)
+			dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen);
+
+		skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail,
+					   req->iv);
+
+		err = skcipher_walk_virt(&walk, &subreq, false);
+		if (err)
+			return err;
+
+		kernel_fpu_begin();
+		if (encrypt)
+			aesni_xts_encrypt(aes_ctx(ctx->raw_crypt_ctx),
+					  walk.dst.virt.addr, walk.src.virt.addr,
+					  walk.nbytes, walk.iv);
+		else
+			aesni_xts_decrypt(aes_ctx(ctx->raw_crypt_ctx),
+					  walk.dst.virt.addr, walk.src.virt.addr,
+					  walk.nbytes, walk.iv);
+		kernel_fpu_end();
+
+		err = skcipher_walk_done(&walk, 0);
+	}
+	return err;
+}
+
+static int xts_encrypt(struct skcipher_request *req)
+{
+	return xts_crypt(req, true);
+}
+
+static int xts_decrypt(struct skcipher_request *req)
+{
+	return xts_crypt(req, false);
+}
+
 static struct crypto_alg aesni_cipher_alg = {
 	.cra_name		= "aes",
 	.cra_driver_name	= "aes-aesni",
@@ -1082,6 +1109,7 @@  static struct skcipher_alg aesni_skciphers[] = {
 		.setkey		= aesni_skcipher_setkey,
 		.encrypt	= ctr_crypt,
 		.decrypt	= ctr_crypt,
+#endif
 	}, {
 		.base = {
 			.cra_name		= "__xts(aes)",
@@ -1095,10 +1123,10 @@  static struct skcipher_alg aesni_skciphers[] = {
 		.min_keysize	= 2 * AES_MIN_KEY_SIZE,
 		.max_keysize	= 2 * AES_MAX_KEY_SIZE,
 		.ivsize		= AES_BLOCK_SIZE,
+		.walksize	= 2 * AES_BLOCK_SIZE,
 		.setkey		= xts_aesni_setkey,
 		.encrypt	= xts_encrypt,
 		.decrypt	= xts_decrypt,
-#endif
 	}
 };
 
diff --git a/crypto/Kconfig b/crypto/Kconfig
index a367fcfeb5d4..c48ca26e2169 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1133,7 +1133,6 @@  config CRYPTO_AES_NI_INTEL
 	select CRYPTO_LIB_AES
 	select CRYPTO_ALGAPI
 	select CRYPTO_SKCIPHER
-	select CRYPTO_GLUE_HELPER_X86 if 64BIT
 	select CRYPTO_SIMD
 	help
 	  Use Intel AES-NI instructions for AES algorithm.