
[V3] crypto: x86/sha1: Fix reads beyond the number of blocks passed

Message ID 1501694015-10203-1-git-send-email-megha.dey@linux.intel.com (mailing list archive)
State Superseded
Delegated to: Herbert Xu

Commit Message

Dey, Megha Aug. 2, 2017, 5:13 p.m. UTC
It was reported that the sha1 AVX2 function (sha1_transform_avx2) reads
ahead beyond its intended data, which can cause a crash if the next
block lies beyond a page boundary:
http://marc.info/?l=linux-crypto-vger&m=149373371023377

This patch makes sure that there is no out-of-bounds read for any buffer length.

It passes the tests written by Jan Stancek that revealed this problem:
https://github.com/jstancek/sha1-avx2-crash
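
For illustration, the crash only shows up when the hashed data ends right
at a page boundary with nothing mapped behind it. The following is a
minimal sketch of that kind of boundary test (it is not Jan's actual
harness; it assumes the assembly routine is linked into a user-space
program, and the name and type of the block-count parameter are assumed,
as the glue code hunk below only shows the first two arguments):

#define _GNU_SOURCE
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

/* Declaration borrowed from sha1_ssse3_glue.c; the block-count
 * parameter name and type are assumed. */
void sha1_transform_avx2(uint32_t *digest, const char *data,
			 unsigned int blocks);

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	uint32_t digest[5] = { 0x67452301, 0xefcdab89, 0x98badcfe,
			       0x10325476, 0xc3d2e1f0 };

	/* Map two pages, then unmap the second so that any read past
	 * the first page faults immediately. */
	char *buf = mmap(NULL, 2 * page, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED || munmap(buf + page, page))
		return 1;

	/* One 64-byte block placed flush against the end of the page:
	 * code that reads ahead into the "next" block walks into the
	 * unmapped page and crashes. */
	char *block = buf + page - 64;
	memset(block, 0xab, 64);
	sha1_transform_avx2(digest, block, 1);

	return 0;
}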

I have re-enabled sha1-avx2 by reverting commit
b82ce24426a4071da9529d726057e4e642948667

Originally-by: Ilya Albrekht <ilya.albrekht@intel.com>
Signed-off-by: Megha Dey <megha.dey@linux.intel.com>
Reported-by: Jan Stancek <jstancek@redhat.com>
---
 arch/x86/crypto/sha1_avx2_x86_64_asm.S | 67 ++++++++++++++++++----------------
 arch/x86/crypto/sha1_ssse3_glue.c      |  2 +-
 2 files changed, 37 insertions(+), 32 deletions(-)

Comments

Dey, Megha Aug. 2, 2017, 6:35 p.m. UTC | #1
On Wed, 2017-08-02 at 10:13 -0700, Megha Dey wrote:
> It was reported that the sha1 AVX2 function (sha1_transform_avx2) reads
> ahead beyond its intended data, which can cause a crash if the next
> block lies beyond a page boundary:
> http://marc.info/?l=linux-crypto-vger&m=149373371023377
> 
> This patch makes sure that there is no out-of-bounds read for any buffer length.
> 
> It passes the tests written by Jan Stancek that revealed this problem:
> https://github.com/jstancek/sha1-avx2-crash

Hi Jan,

Is it ok to add your Tested-by?

Jan Stancek Aug. 2, 2017, 7:18 p.m. UTC | #2
----- Original Message -----
> On Wed, 2017-08-02 at 10:13 -0700, Megha Dey wrote:
> > It was reported that the sha1 AVX2 function (sha1_transform_avx2) reads
> > ahead beyond its intended data, which can cause a crash if the next
> > block lies beyond a page boundary:
> > http://marc.info/?l=linux-crypto-vger&m=149373371023377
> > 
> > This patch makes sure that there is no out-of-bounds read for any buffer length.
> > 
> > It passes the tests written by Jan Stancek that revealed this problem:
> > https://github.com/jstancek/sha1-avx2-crash
> 
> Hi Jan,
> 
> Is it ok to add your Tested-by?

Yes, the v3 patch is exactly the diff I was testing.

Regards,
Jan

Patch

diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
index 1cd792d..1eab79c 100644
--- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -117,11 +117,10 @@ 
 	.set T1, REG_T1
 .endm
 
-#define K_BASE		%r8
 #define HASH_PTR	%r9
+#define BLOCKS_CTR	%r8
 #define BUFFER_PTR	%r10
 #define BUFFER_PTR2	%r13
-#define BUFFER_END	%r11
 
 #define PRECALC_BUF	%r14
 #define WK_BUF		%r15
@@ -205,14 +204,14 @@ 
 		 * blended AVX2 and ALU instruction scheduling
 		 * 1 vector iteration per 8 rounds
 		 */
-		vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP
+		vmovdqu (i * 2)(BUFFER_PTR), W_TMP
 	.elseif ((i & 7) == 1)
-		vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\
+		vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
 			 WY_TMP, WY_TMP
 	.elseif ((i & 7) == 2)
 		vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
 	.elseif ((i & 7) == 4)
-		vpaddd  K_XMM(K_BASE), WY, WY_TMP
+		vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
 	.elseif ((i & 7) == 7)
 		vmovdqu  WY_TMP, PRECALC_WK(i&~7)
 
@@ -255,7 +254,7 @@ 
 		vpxor	WY, WY_TMP, WY_TMP
 	.elseif ((i & 7) == 7)
 		vpxor	WY_TMP2, WY_TMP, WY
-		vpaddd	K_XMM(K_BASE), WY, WY_TMP
+		vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
 		vmovdqu	WY_TMP, PRECALC_WK(i&~7)
 
 		PRECALC_ROTATE_WY
@@ -291,7 +290,7 @@ 
 		vpsrld	$30, WY, WY
 		vpor	WY, WY_TMP, WY
 	.elseif ((i & 7) == 7)
-		vpaddd	K_XMM(K_BASE), WY, WY_TMP
+		vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
 		vmovdqu	WY_TMP, PRECALC_WK(i&~7)
 
 		PRECALC_ROTATE_WY
@@ -446,6 +445,16 @@ 
 
 .endm
 
+/* Add constant only if (%2 > %3) condition met (uses RTA as temp)
+ * %1 + %2 >= %3 ? %4 : 0
+ */
+.macro ADD_IF_GE a, b, c, d
+	mov     \a, RTA
+	add     $\d, RTA
+	cmp     $\c, \b
+	cmovge  RTA, \a
+.endm
+
 /*
  * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
  */
@@ -463,13 +472,16 @@ 
 	lea	(2*4*80+32)(%rsp), WK_BUF
 
 	# Precalc WK for first 2 blocks
-	PRECALC_OFFSET = 0
+	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
 	.set i, 0
 	.rept    160
 		PRECALC i
 		.set i, i + 1
 	.endr
-	PRECALC_OFFSET = 128
+
+	/* Go to next block if needed */
+	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
+	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
 	xchg	WK_BUF, PRECALC_BUF
 
 	.align 32
@@ -479,8 +491,8 @@  _loop:
 	 * we use K_BASE value as a signal of a last block,
 	 * it is set below by: cmovae BUFFER_PTR, K_BASE
 	 */
-	cmp	K_BASE, BUFFER_PTR
-	jne	_begin
+	test BLOCKS_CTR, BLOCKS_CTR
+	jnz _begin
 	.align 32
 	jmp	_end
 	.align 32
@@ -512,10 +524,10 @@  _loop0:
 		.set j, j+2
 	.endr
 
-	add	$(2*64), BUFFER_PTR       /* move to next odd-64-byte block */
-	cmp	BUFFER_END, BUFFER_PTR    /* is current block the last one? */
-	cmovae	K_BASE, BUFFER_PTR	/* signal the last iteration smartly */
-
+	/* Update Counter */
+	sub $1, BLOCKS_CTR
+	/* Move to the next block only if needed*/
+	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
 	/*
 	 * rounds
 	 * 60,62,64,66,68
@@ -532,8 +544,8 @@  _loop0:
 	UPDATE_HASH	12(HASH_PTR), D
 	UPDATE_HASH	16(HASH_PTR), E
 
-	cmp	K_BASE, BUFFER_PTR	/* is current block the last one? */
-	je	_loop
+	test	BLOCKS_CTR, BLOCKS_CTR
+	jz	_loop
 
 	mov	TB, B
 
@@ -575,10 +587,10 @@  _loop2:
 		.set j, j+2
 	.endr
 
-	add	$(2*64), BUFFER_PTR2      /* move to next even-64-byte block */
-
-	cmp	BUFFER_END, BUFFER_PTR2   /* is current block the last one */
-	cmovae	K_BASE, BUFFER_PTR       /* signal the last iteration smartly */
+	/* update counter */
+	sub     $1, BLOCKS_CTR
+	/* Move to the next block only if needed*/
+	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
 
 	jmp	_loop3
 _loop3:
@@ -641,19 +653,12 @@  _loop3:
 
 	avx2_zeroupper
 
-	lea	K_XMM_AR(%rip), K_BASE
-
+	/* Setup initial values */
 	mov	CTX, HASH_PTR
 	mov	BUF, BUFFER_PTR
-	lea	64(BUF), BUFFER_PTR2
-
-	shl	$6, CNT			/* mul by 64 */
-	add	BUF, CNT
-	add	$64, CNT
-	mov	CNT, BUFFER_END
 
-	cmp	BUFFER_END, BUFFER_PTR2
-	cmovae	K_BASE, BUFFER_PTR2
+	mov	BUF, BUFFER_PTR2
+	mov	CNT, BLOCKS_CTR
 
 	xmm_mov	BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
 
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index f960a04..fc61739 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -201,7 +201,7 @@  asmlinkage void sha1_transform_avx2(u32 *digest, const char *data,
 
 static bool avx2_usable(void)
 {
-	if (false && avx_usable() && boot_cpu_has(X86_FEATURE_AVX2)
+	if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2)
 		&& boot_cpu_has(X86_FEATURE_BMI1)
 		&& boot_cpu_has(X86_FEATURE_BMI2))
 		return true;
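
For readers less used to the assembler macro syntax, the following is a
rough C outline of the pointer and counter handling the patch introduces.
The lower-case names stand in for BLOCKS_CTR, BUFFER_PTR and BUFFER_PTR2,
the helper mirrors ADD_IF_GE, and the two software-pipelined round loops
of the real code are collapsed into one plain loop, so this is only a
sketch of the addressing logic, not of the generated code:

#include <stddef.h>

/* ADD_IF_GE a, b, c, d: advance 'a' by 'd' bytes only when 'b' >= 'c'. */
static const char *add_if_ge(const char *a, size_t b, size_t c, size_t d)
{
	return (b >= c) ? a + d : a;
}

static void sha1_avx2_block_walk(const char *data, size_t blocks_ctr)
{
	const char *buffer_ptr  = data;		/* odd 64-byte blocks  */
	const char *buffer_ptr2 = data;		/* even 64-byte blocks */

	/* Prologue: step to the second block only if it exists, so a
	 * one-block input is never read past its end. */
	buffer_ptr2 = add_if_ge(buffer_ptr2, blocks_ctr, 2, 64);
	/* ... precalc WK for the first two blocks ... */
	buffer_ptr  = add_if_ge(buffer_ptr,  blocks_ctr, 3, 128);
	buffer_ptr2 = add_if_ge(buffer_ptr2, blocks_ctr, 4, 128);

	while (blocks_ctr) {
		/* ... 80 rounds for the block read via buffer_ptr ... */
		blocks_ctr--;
		/* Advance by 128 bytes only while enough blocks remain,
		 * instead of always stepping ahead and comparing against
		 * BUFFER_END afterwards as the old code did. */
		buffer_ptr = add_if_ge(buffer_ptr, blocks_ctr, 4, 128);
		if (!blocks_ctr)
			break;

		/* ... 80 rounds for the block read via buffer_ptr2 ... */
		blocks_ctr--;
		buffer_ptr2 = add_if_ge(buffer_ptr2, blocks_ctr, 4, 128);
	}
}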