diff mbox series

[v2,1/8] arm64: Import latest version of Cortex Strings' memcmp

Message ID 2889de2d41054f3f508fb3addad784a3606ef383.1622128527.git.robin.murphy@arm.com (mailing list archive)
State New, archived
Headers show
Series arm64: String function updates | expand

Commit Message

Robin Murphy May 27, 2021, 3:34 p.m. UTC
From: Sam Tebbs <sam.tebbs@arm.com>

Import the latest version of the former Cortex Strings - now
Arm Optimized Routines - memcmp function based on the upstream
code of string/aarch64/memcmp.S at commit e823e3a from
https://github.com/ARM-software/optimized-routines

Note that for simplicity Arm have chosen to contribute this code
to Linux under GPLv2 rather than the original MIT license.

Signed-off-by: Sam Tebbs <sam.tebbs@arm.com>
[ rm: update attribution and commit message ]
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---
 arch/arm64/lib/memcmp.S | 330 ++++++++++++++--------------------------
 1 file changed, 111 insertions(+), 219 deletions(-)

Comments

Mark Rutland May 27, 2021, 4:52 p.m. UTC | #1
Hi Robin,

On Thu, May 27, 2021 at 04:34:41PM +0100, Robin Murphy wrote:
> From: Sam Tebbs <sam.tebbs@arm.com>
> 
> Import the latest version of the former Cortex Strings - now
> Arm Optimized Routines - memcmp function based on the upstream
> code of string/aarch64/memcmp.S at commit e823e3a from
> https://github.com/ARM-software/optimized-routines
> 
> Note that for simplicity Arm have chosen to contribute this code
> to Linux under GPLv2 rather than the original MIT license.

Thanks for adding this! It would be nice if we could make this slightly
more explicit, e.g.

| Arm hold the copyright on this code, and have chosen to contribute it
| to linux under GPL v2 rather than the original MIT license.

... but I don't think that requires a respin.

> 
> Signed-off-by: Sam Tebbs <sam.tebbs@arm.com>
> [ rm: update attribution and commit message ]
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> ---
>  arch/arm64/lib/memcmp.S | 330 ++++++++++++++--------------------------
>  1 file changed, 111 insertions(+), 219 deletions(-)
> 
> diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S
> index c0671e793ea9..498f0d9941d9 100644
> --- a/arch/arm64/lib/memcmp.S
> +++ b/arch/arm64/lib/memcmp.S
> @@ -1,247 +1,139 @@
>  /* SPDX-License-Identifier: GPL-2.0-only */
>  /*
> - * Copyright (C) 2013 ARM Ltd.
> - * Copyright (C) 2013 Linaro.
> + * Copyright (c) 2013-2020, Arm Limited.

2021 too, presumably. ;)

>   *
> - * This code is based on glibc cortex strings work originally authored by Linaro
> - * be found @
> - *
> - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
> - * files/head:/src/aarch64/
> + * Adapted from the original at:
> + * https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/memcmp.S
>   */

Could we please note the commit ID in the comment, too? e.g. we can
place that in the URL:

https://github.com/ARM-software/optimized-routines/blob/e823e3abf5f89ecb/string/aarch64/memcmp.S

I've checked that the below matches the code in the file referenced
above, and AFAICT, the only change I see in the body of the function is
dropping PTR_ARG() and SIZE_ARG(), which are not necessary as the kernel
itself is not built with ILP32. The local definition of L() also seems
fine to me.

On the assumption that the code was correct to begin with, this looks
good to me. With the minor fixups above:

Acked-by: Mark Rutland <mark.rutland@arm.com>

Thanks,
Mark.

>  
>  #include <linux/linkage.h>
>  #include <asm/assembler.h>
>  
> -/*
> -* compare memory areas(when two memory areas' offset are different,
> -* alignment handled by the hardware)
> -*
> -* Parameters:
> -*  x0 - const memory area 1 pointer
> -*  x1 - const memory area 2 pointer
> -*  x2 - the maximal compare byte length
> -* Returns:
> -*  x0 - a compare result, maybe less than, equal to, or greater than ZERO
> -*/
> +/* Assumptions:
> + *
> + * ARMv8-a, AArch64, unaligned accesses.
> + */
> +
> +#define L(label) .L ## label
>  
>  /* Parameters and result.  */
> -src1		.req	x0
> -src2		.req	x1
> -limit		.req	x2
> -result		.req	x0
> +#define src1		x0
> +#define src2		x1
> +#define limit		x2
> +#define result		w0
>  
>  /* Internal variables.  */
> -data1		.req	x3
> -data1w		.req	w3
> -data2		.req	x4
> -data2w		.req	w4
> -has_nul		.req	x5
> -diff		.req	x6
> -endloop		.req	x7
> -tmp1		.req	x8
> -tmp2		.req	x9
> -tmp3		.req	x10
> -pos		.req	x11
> -limit_wd	.req	x12
> -mask		.req	x13
> +#define data1		x3
> +#define data1w		w3
> +#define data1h		x4
> +#define data2		x5
> +#define data2w		w5
> +#define data2h		x6
> +#define tmp1		x7
> +#define tmp2		x8
>  
>  SYM_FUNC_START_WEAK_PI(memcmp)
> -	cbz	limit, .Lret0
> -	eor	tmp1, src1, src2
> -	tst	tmp1, #7
> -	b.ne	.Lmisaligned8
> -	ands	tmp1, src1, #7
> -	b.ne	.Lmutual_align
> -	sub	limit_wd, limit, #1 /* limit != 0, so no underflow.  */
> -	lsr	limit_wd, limit_wd, #3 /* Convert to Dwords.  */
> -	/*
> -	* The input source addresses are at alignment boundary.
> -	* Directly compare eight bytes each time.
> -	*/
> -.Lloop_aligned:
> -	ldr	data1, [src1], #8
> -	ldr	data2, [src2], #8
> -.Lstart_realigned:
> -	subs	limit_wd, limit_wd, #1
> -	eor	diff, data1, data2	/* Non-zero if differences found.  */
> -	csinv	endloop, diff, xzr, cs	/* Last Dword or differences.  */
> -	cbz	endloop, .Lloop_aligned
> +	subs	limit, limit, 8
> +	b.lo	L(less8)
>  
> -	/* Not reached the limit, must have found a diff.  */
> -	tbz	limit_wd, #63, .Lnot_limit
> +	ldr	data1, [src1], 8
> +	ldr	data2, [src2], 8
> +	cmp	data1, data2
> +	b.ne	L(return)
>  
> -	/* Limit % 8 == 0 => the diff is in the last 8 bytes. */
> -	ands	limit, limit, #7
> -	b.eq	.Lnot_limit
> -	/*
> -	* The remained bytes less than 8. It is needed to extract valid data
> -	* from last eight bytes of the intended memory range.
> -	*/
> -	lsl	limit, limit, #3	/* bytes-> bits.  */
> -	mov	mask, #~0
> -CPU_BE( lsr	mask, mask, limit )
> -CPU_LE( lsl	mask, mask, limit )
> -	bic	data1, data1, mask
> -	bic	data2, data2, mask
> +	subs	limit, limit, 8
> +	b.gt	L(more16)
>  
> -	orr	diff, diff, mask
> -	b	.Lnot_limit
> +	ldr	data1, [src1, limit]
> +	ldr	data2, [src2, limit]
> +	b	L(return)
>  
> -.Lmutual_align:
> -	/*
> -	* Sources are mutually aligned, but are not currently at an
> -	* alignment boundary. Round down the addresses and then mask off
> -	* the bytes that precede the start point.
> -	*/
> -	bic	src1, src1, #7
> -	bic	src2, src2, #7
> -	ldr	data1, [src1], #8
> -	ldr	data2, [src2], #8
> -	/*
> -	* We can not add limit with alignment offset(tmp1) here. Since the
> -	* addition probably make the limit overflown.
> -	*/
> -	sub	limit_wd, limit, #1/*limit != 0, so no underflow.*/
> -	and	tmp3, limit_wd, #7
> -	lsr	limit_wd, limit_wd, #3
> -	add	tmp3, tmp3, tmp1
> -	add	limit_wd, limit_wd, tmp3, lsr #3
> -	add	limit, limit, tmp1/* Adjust the limit for the extra.  */
> +L(more16):
> +	ldr	data1, [src1], 8
> +	ldr	data2, [src2], 8
> +	cmp	data1, data2
> +	bne	L(return)
>  
> -	lsl	tmp1, tmp1, #3/* Bytes beyond alignment -> bits.*/
> -	neg	tmp1, tmp1/* Bits to alignment -64.  */
> -	mov	tmp2, #~0
> -	/*mask off the non-intended bytes before the start address.*/
> -CPU_BE( lsl	tmp2, tmp2, tmp1 )/*Big-endian.Early bytes are at MSB*/
> -	/* Little-endian.  Early bytes are at LSB.  */
> -CPU_LE( lsr	tmp2, tmp2, tmp1 )
> +	/* Jump directly to comparing the last 16 bytes for 32 byte (or less)
> +	   strings.  */
> +	subs	limit, limit, 16
> +	b.ls	L(last_bytes)
>  
> -	orr	data1, data1, tmp2
> -	orr	data2, data2, tmp2
> -	b	.Lstart_realigned
> +	/* We overlap loads between 0-32 bytes at either side of SRC1 when we
> +	   try to align, so limit it only to strings larger than 128 bytes.  */
> +	cmp	limit, 96
> +	b.ls	L(loop16)
>  
> -	/*src1 and src2 have different alignment offset.*/
> -.Lmisaligned8:
> -	cmp	limit, #8
> -	b.lo	.Ltiny8proc /*limit < 8: compare byte by byte*/
> +	/* Align src1 and adjust src2 with bytes not yet done.  */
> +	and	tmp1, src1, 15
> +	add	limit, limit, tmp1
> +	sub	src1, src1, tmp1
> +	sub	src2, src2, tmp1
>  
> -	and	tmp1, src1, #7
> -	neg	tmp1, tmp1
> -	add	tmp1, tmp1, #8/*valid length in the first 8 bytes of src1*/
> -	and	tmp2, src2, #7
> -	neg	tmp2, tmp2
> -	add	tmp2, tmp2, #8/*valid length in the first 8 bytes of src2*/
> -	subs	tmp3, tmp1, tmp2
> -	csel	pos, tmp1, tmp2, hi /*Choose the maximum.*/
> +	/* Loop performing 16 bytes per iteration using aligned src1.
> +	   Limit is pre-decremented by 16 and must be larger than zero.
> +	   Exit if <= 16 bytes left to do or if the data is not equal.  */
> +	.p2align 4
> +L(loop16):
> +	ldp	data1, data1h, [src1], 16
> +	ldp	data2, data2h, [src2], 16
> +	subs	limit, limit, 16
> +	ccmp	data1, data2, 0, hi
> +	ccmp	data1h, data2h, 0, eq
> +	b.eq	L(loop16)
>  
> -	sub	limit, limit, pos
> -	/*compare the proceeding bytes in the first 8 byte segment.*/
> -.Ltinycmp:
> -	ldrb	data1w, [src1], #1
> -	ldrb	data2w, [src2], #1
> -	subs	pos, pos, #1
> -	ccmp	data1w, data2w, #0, ne  /* NZCV = 0b0000.  */
> -	b.eq	.Ltinycmp
> -	cbnz	pos, 1f /*diff occurred before the last byte.*/
> +	cmp	data1, data2
> +	bne	L(return)
> +	mov	data1, data1h
> +	mov	data2, data2h
> +	cmp	data1, data2
> +	bne	L(return)
> +
> +	/* Compare last 1-16 bytes using unaligned access.  */
> +L(last_bytes):
> +	add	src1, src1, limit
> +	add	src2, src2, limit
> +	ldp	data1, data1h, [src1]
> +	ldp	data2, data2h, [src2]
> +	cmp	data1, data2
> +	bne	L(return)
> +	mov	data1, data1h
> +	mov	data2, data2h
> +	cmp	data1, data2
> +
> +	/* Compare data bytes and set return value to 0, -1 or 1.  */
> +L(return):
> +#ifndef __AARCH64EB__
> +	rev	data1, data1
> +	rev	data2, data2
> +#endif
> +	cmp	data1, data2
> +L(ret_eq):
> +	cset	result, ne
> +	cneg	result, result, lo
> +	ret
> +
> +	.p2align 4
> +	/* Compare up to 8 bytes.  Limit is [-8..-1].  */
> +L(less8):
> +	adds	limit, limit, 4
> +	b.lo	L(less4)
> +	ldr	data1w, [src1], 4
> +	ldr	data2w, [src2], 4
>  	cmp	data1w, data2w
> -	b.eq	.Lstart_align
> -1:
> -	sub	result, data1, data2
> +	b.ne	L(return)
> +	sub	limit, limit, 4
> +L(less4):
> +	adds	limit, limit, 4
> +	beq	L(ret_eq)
> +L(byte_loop):
> +	ldrb	data1w, [src1], 1
> +	ldrb	data2w, [src2], 1
> +	subs	limit, limit, 1
> +	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
> +	b.eq	L(byte_loop)
> +	sub	result, data1w, data2w
>  	ret
>  
> -.Lstart_align:
> -	lsr	limit_wd, limit, #3
> -	cbz	limit_wd, .Lremain8
> -
> -	ands	xzr, src1, #7
> -	b.eq	.Lrecal_offset
> -	/*process more leading bytes to make src1 aligned...*/
> -	add	src1, src1, tmp3 /*backwards src1 to alignment boundary*/
> -	add	src2, src2, tmp3
> -	sub	limit, limit, tmp3
> -	lsr	limit_wd, limit, #3
> -	cbz	limit_wd, .Lremain8
> -	/*load 8 bytes from aligned SRC1..*/
> -	ldr	data1, [src1], #8
> -	ldr	data2, [src2], #8
> -
> -	subs	limit_wd, limit_wd, #1
> -	eor	diff, data1, data2  /*Non-zero if differences found.*/
> -	csinv	endloop, diff, xzr, ne
> -	cbnz	endloop, .Lunequal_proc
> -	/*How far is the current SRC2 from the alignment boundary...*/
> -	and	tmp3, tmp3, #7
> -
> -.Lrecal_offset:/*src1 is aligned now..*/
> -	neg	pos, tmp3
> -.Lloopcmp_proc:
> -	/*
> -	* Divide the eight bytes into two parts. First,backwards the src2
> -	* to an alignment boundary,load eight bytes and compare from
> -	* the SRC2 alignment boundary. If all 8 bytes are equal,then start
> -	* the second part's comparison. Otherwise finish the comparison.
> -	* This special handle can garantee all the accesses are in the
> -	* thread/task space in avoid to overrange access.
> -	*/
> -	ldr	data1, [src1,pos]
> -	ldr	data2, [src2,pos]
> -	eor	diff, data1, data2  /* Non-zero if differences found.  */
> -	cbnz	diff, .Lnot_limit
> -
> -	/*The second part process*/
> -	ldr	data1, [src1], #8
> -	ldr	data2, [src2], #8
> -	eor	diff, data1, data2  /* Non-zero if differences found.  */
> -	subs	limit_wd, limit_wd, #1
> -	csinv	endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/
> -	cbz	endloop, .Lloopcmp_proc
> -.Lunequal_proc:
> -	cbz	diff, .Lremain8
> -
> -/* There is difference occurred in the latest comparison. */
> -.Lnot_limit:
> -/*
> -* For little endian,reverse the low significant equal bits into MSB,then
> -* following CLZ can find how many equal bits exist.
> -*/
> -CPU_LE( rev	diff, diff )
> -CPU_LE( rev	data1, data1 )
> -CPU_LE( rev	data2, data2 )
> -
> -	/*
> -	* The MS-non-zero bit of DIFF marks either the first bit
> -	* that is different, or the end of the significant data.
> -	* Shifting left now will bring the critical information into the
> -	* top bits.
> -	*/
> -	clz	pos, diff
> -	lsl	data1, data1, pos
> -	lsl	data2, data2, pos
> -	/*
> -	* We need to zero-extend (char is unsigned) the value and then
> -	* perform a signed subtraction.
> -	*/
> -	lsr	data1, data1, #56
> -	sub	result, data1, data2, lsr #56
> -	ret
> -
> -.Lremain8:
> -	/* Limit % 8 == 0 =>. all data are equal.*/
> -	ands	limit, limit, #7
> -	b.eq	.Lret0
> -
> -.Ltiny8proc:
> -	ldrb	data1w, [src1], #1
> -	ldrb	data2w, [src2], #1
> -	subs	limit, limit, #1
> -
> -	ccmp	data1w, data2w, #0, ne  /* NZCV = 0b0000. */
> -	b.eq	.Ltiny8proc
> -	sub	result, data1, data2
> -	ret
> -.Lret0:
> -	mov	result, #0
> -	ret
>  SYM_FUNC_END_PI(memcmp)
>  EXPORT_SYMBOL_NOKASAN(memcmp)
> -- 
> 2.21.0.dirty
>
Will Deacon June 1, 2021, 6:26 p.m. UTC | #2
On Thu, May 27, 2021 at 05:52:14PM +0100, Mark Rutland wrote:
> On Thu, May 27, 2021 at 04:34:41PM +0100, Robin Murphy wrote:
> > From: Sam Tebbs <sam.tebbs@arm.com>
> > 
> > Import the latest version of the former Cortex Strings - now
> > Arm Optimized Routines - memcmp function based on the upstream
> > code of string/aarch64/memcmp.S at commit e823e3a from
> > https://github.com/ARM-software/optimized-routines
> > 
> > Note that for simplicity Arm have chosen to contribute this code
> > to Linux under GPLv2 rather than the original MIT license.
> 
> Thanks for adding this! It would be nice if we could make this slightly
> more explicit, e.g.
> 
> | Arm hold the copyright on this code, and have chosen to contribute it
> | to linux under GPL v2 rather than the original MIT license.
> 
> ... but I don't think that requires a respin.
> 
> > 
> > Signed-off-by: Sam Tebbs <sam.tebbs@arm.com>
> > [ rm: update attribution and commit message ]
> > Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> > ---
> >  arch/arm64/lib/memcmp.S | 330 ++++++++++++++--------------------------
> >  1 file changed, 111 insertions(+), 219 deletions(-)
> > 
> > diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S
> > index c0671e793ea9..498f0d9941d9 100644
> > --- a/arch/arm64/lib/memcmp.S
> > +++ b/arch/arm64/lib/memcmp.S
> > @@ -1,247 +1,139 @@
> >  /* SPDX-License-Identifier: GPL-2.0-only */
> >  /*
> > - * Copyright (C) 2013 ARM Ltd.
> > - * Copyright (C) 2013 Linaro.
> > + * Copyright (c) 2013-2020, Arm Limited.
> 
> 2021 too, presumably. ;)
> 
> >   *
> > - * This code is based on glibc cortex strings work originally authored by Linaro
> > - * be found @
> > - *
> > - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
> > - * files/head:/src/aarch64/
> > + * Adapted from the original at:
> > + * https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/memcmp.S
> >   */
> 
> Could we please note the commit ID in the comment, too? e.g. we can
> place that in the URL:
> 
> https://github.com/ARM-software/optimized-routines/blob/e823e3abf5f89ecb/string/aarch64/memcmp.S
> 
> I've checked that the below matches the code in the file referenced
> above, and AFAICT, the only change I see in the body of the function is
> dropping PTR_ARG() and SIZE_ARG(), which are not necessary as the kernel
> itself is not built with ILP32. The local definition of L() also seems
> fine to me.
> 
> On the assumption that the code was correct to begin with, this looks
> good to me. With the minor fixups above:
> 
> Acked-by: Mark Rutland <mark.rutland@arm.com>

I've applied this as-is with your Ack, but I haven't yet put it into
linux-next. Please can you (or Robin) send patches on top to address the
above?

Sorry for rushing a bit, but I'd like to get CKI chewing on this ASAP
and didn't want to wait for a respin if it's just cosmetic stuff pending.

Cheers,

Will
diff mbox series

Patch

diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S
index c0671e793ea9..498f0d9941d9 100644
--- a/arch/arm64/lib/memcmp.S
+++ b/arch/arm64/lib/memcmp.S
@@ -1,247 +1,139 @@ 
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Copyright (C) 2013 ARM Ltd.
- * Copyright (C) 2013 Linaro.
+ * Copyright (c) 2013-2020, Arm Limited.
  *
- * This code is based on glibc cortex strings work originally authored by Linaro
- * be found @
- *
- * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
- * files/head:/src/aarch64/
+ * Adapted from the original at:
+ * https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/memcmp.S
  */
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 
-/*
-* compare memory areas(when two memory areas' offset are different,
-* alignment handled by the hardware)
-*
-* Parameters:
-*  x0 - const memory area 1 pointer
-*  x1 - const memory area 2 pointer
-*  x2 - the maximal compare byte length
-* Returns:
-*  x0 - a compare result, maybe less than, equal to, or greater than ZERO
-*/
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ */
+
+#define L(label) .L ## label
 
 /* Parameters and result.  */
-src1		.req	x0
-src2		.req	x1
-limit		.req	x2
-result		.req	x0
+#define src1		x0
+#define src2		x1
+#define limit		x2
+#define result		w0
 
 /* Internal variables.  */
-data1		.req	x3
-data1w		.req	w3
-data2		.req	x4
-data2w		.req	w4
-has_nul		.req	x5
-diff		.req	x6
-endloop		.req	x7
-tmp1		.req	x8
-tmp2		.req	x9
-tmp3		.req	x10
-pos		.req	x11
-limit_wd	.req	x12
-mask		.req	x13
+#define data1		x3
+#define data1w		w3
+#define data1h		x4
+#define data2		x5
+#define data2w		w5
+#define data2h		x6
+#define tmp1		x7
+#define tmp2		x8
 
 SYM_FUNC_START_WEAK_PI(memcmp)
-	cbz	limit, .Lret0
-	eor	tmp1, src1, src2
-	tst	tmp1, #7
-	b.ne	.Lmisaligned8
-	ands	tmp1, src1, #7
-	b.ne	.Lmutual_align
-	sub	limit_wd, limit, #1 /* limit != 0, so no underflow.  */
-	lsr	limit_wd, limit_wd, #3 /* Convert to Dwords.  */
-	/*
-	* The input source addresses are at alignment boundary.
-	* Directly compare eight bytes each time.
-	*/
-.Lloop_aligned:
-	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
-.Lstart_realigned:
-	subs	limit_wd, limit_wd, #1
-	eor	diff, data1, data2	/* Non-zero if differences found.  */
-	csinv	endloop, diff, xzr, cs	/* Last Dword or differences.  */
-	cbz	endloop, .Lloop_aligned
+	subs	limit, limit, 8
+	b.lo	L(less8)
 
-	/* Not reached the limit, must have found a diff.  */
-	tbz	limit_wd, #63, .Lnot_limit
+	ldr	data1, [src1], 8
+	ldr	data2, [src2], 8
+	cmp	data1, data2
+	b.ne	L(return)
 
-	/* Limit % 8 == 0 => the diff is in the last 8 bytes. */
-	ands	limit, limit, #7
-	b.eq	.Lnot_limit
-	/*
-	* The remained bytes less than 8. It is needed to extract valid data
-	* from last eight bytes of the intended memory range.
-	*/
-	lsl	limit, limit, #3	/* bytes-> bits.  */
-	mov	mask, #~0
-CPU_BE( lsr	mask, mask, limit )
-CPU_LE( lsl	mask, mask, limit )
-	bic	data1, data1, mask
-	bic	data2, data2, mask
+	subs	limit, limit, 8
+	b.gt	L(more16)
 
-	orr	diff, diff, mask
-	b	.Lnot_limit
+	ldr	data1, [src1, limit]
+	ldr	data2, [src2, limit]
+	b	L(return)
 
-.Lmutual_align:
-	/*
-	* Sources are mutually aligned, but are not currently at an
-	* alignment boundary. Round down the addresses and then mask off
-	* the bytes that precede the start point.
-	*/
-	bic	src1, src1, #7
-	bic	src2, src2, #7
-	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
-	/*
-	* We can not add limit with alignment offset(tmp1) here. Since the
-	* addition probably make the limit overflown.
-	*/
-	sub	limit_wd, limit, #1/*limit != 0, so no underflow.*/
-	and	tmp3, limit_wd, #7
-	lsr	limit_wd, limit_wd, #3
-	add	tmp3, tmp3, tmp1
-	add	limit_wd, limit_wd, tmp3, lsr #3
-	add	limit, limit, tmp1/* Adjust the limit for the extra.  */
+L(more16):
+	ldr	data1, [src1], 8
+	ldr	data2, [src2], 8
+	cmp	data1, data2
+	bne	L(return)
 
-	lsl	tmp1, tmp1, #3/* Bytes beyond alignment -> bits.*/
-	neg	tmp1, tmp1/* Bits to alignment -64.  */
-	mov	tmp2, #~0
-	/*mask off the non-intended bytes before the start address.*/
-CPU_BE( lsl	tmp2, tmp2, tmp1 )/*Big-endian.Early bytes are at MSB*/
-	/* Little-endian.  Early bytes are at LSB.  */
-CPU_LE( lsr	tmp2, tmp2, tmp1 )
+	/* Jump directly to comparing the last 16 bytes for 32 byte (or less)
+	   strings.  */
+	subs	limit, limit, 16
+	b.ls	L(last_bytes)
 
-	orr	data1, data1, tmp2
-	orr	data2, data2, tmp2
-	b	.Lstart_realigned
+	/* We overlap loads between 0-32 bytes at either side of SRC1 when we
+	   try to align, so limit it only to strings larger than 128 bytes.  */
+	cmp	limit, 96
+	b.ls	L(loop16)
 
-	/*src1 and src2 have different alignment offset.*/
-.Lmisaligned8:
-	cmp	limit, #8
-	b.lo	.Ltiny8proc /*limit < 8: compare byte by byte*/
+	/* Align src1 and adjust src2 with bytes not yet done.  */
+	and	tmp1, src1, 15
+	add	limit, limit, tmp1
+	sub	src1, src1, tmp1
+	sub	src2, src2, tmp1
 
-	and	tmp1, src1, #7
-	neg	tmp1, tmp1
-	add	tmp1, tmp1, #8/*valid length in the first 8 bytes of src1*/
-	and	tmp2, src2, #7
-	neg	tmp2, tmp2
-	add	tmp2, tmp2, #8/*valid length in the first 8 bytes of src2*/
-	subs	tmp3, tmp1, tmp2
-	csel	pos, tmp1, tmp2, hi /*Choose the maximum.*/
+	/* Loop performing 16 bytes per iteration using aligned src1.
+	   Limit is pre-decremented by 16 and must be larger than zero.
+	   Exit if <= 16 bytes left to do or if the data is not equal.  */
+	.p2align 4
+L(loop16):
+	ldp	data1, data1h, [src1], 16
+	ldp	data2, data2h, [src2], 16
+	subs	limit, limit, 16
+	ccmp	data1, data2, 0, hi
+	ccmp	data1h, data2h, 0, eq
+	b.eq	L(loop16)
 
-	sub	limit, limit, pos
-	/*compare the proceeding bytes in the first 8 byte segment.*/
-.Ltinycmp:
-	ldrb	data1w, [src1], #1
-	ldrb	data2w, [src2], #1
-	subs	pos, pos, #1
-	ccmp	data1w, data2w, #0, ne  /* NZCV = 0b0000.  */
-	b.eq	.Ltinycmp
-	cbnz	pos, 1f /*diff occurred before the last byte.*/
+	cmp	data1, data2
+	bne	L(return)
+	mov	data1, data1h
+	mov	data2, data2h
+	cmp	data1, data2
+	bne	L(return)
+
+	/* Compare last 1-16 bytes using unaligned access.  */
+L(last_bytes):
+	add	src1, src1, limit
+	add	src2, src2, limit
+	ldp	data1, data1h, [src1]
+	ldp	data2, data2h, [src2]
+	cmp	data1, data2
+	bne	L(return)
+	mov	data1, data1h
+	mov	data2, data2h
+	cmp	data1, data2
+
+	/* Compare data bytes and set return value to 0, -1 or 1.  */
+L(return):
+#ifndef __AARCH64EB__
+	rev	data1, data1
+	rev	data2, data2
+#endif
+	cmp	data1, data2
+L(ret_eq):
+	cset	result, ne
+	cneg	result, result, lo
+	ret
+
+	.p2align 4
+	/* Compare up to 8 bytes.  Limit is [-8..-1].  */
+L(less8):
+	adds	limit, limit, 4
+	b.lo	L(less4)
+	ldr	data1w, [src1], 4
+	ldr	data2w, [src2], 4
 	cmp	data1w, data2w
-	b.eq	.Lstart_align
-1:
-	sub	result, data1, data2
+	b.ne	L(return)
+	sub	limit, limit, 4
+L(less4):
+	adds	limit, limit, 4
+	beq	L(ret_eq)
+L(byte_loop):
+	ldrb	data1w, [src1], 1
+	ldrb	data2w, [src2], 1
+	subs	limit, limit, 1
+	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
+	b.eq	L(byte_loop)
+	sub	result, data1w, data2w
 	ret
 
-.Lstart_align:
-	lsr	limit_wd, limit, #3
-	cbz	limit_wd, .Lremain8
-
-	ands	xzr, src1, #7
-	b.eq	.Lrecal_offset
-	/*process more leading bytes to make src1 aligned...*/
-	add	src1, src1, tmp3 /*backwards src1 to alignment boundary*/
-	add	src2, src2, tmp3
-	sub	limit, limit, tmp3
-	lsr	limit_wd, limit, #3
-	cbz	limit_wd, .Lremain8
-	/*load 8 bytes from aligned SRC1..*/
-	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
-
-	subs	limit_wd, limit_wd, #1
-	eor	diff, data1, data2  /*Non-zero if differences found.*/
-	csinv	endloop, diff, xzr, ne
-	cbnz	endloop, .Lunequal_proc
-	/*How far is the current SRC2 from the alignment boundary...*/
-	and	tmp3, tmp3, #7
-
-.Lrecal_offset:/*src1 is aligned now..*/
-	neg	pos, tmp3
-.Lloopcmp_proc:
-	/*
-	* Divide the eight bytes into two parts. First,backwards the src2
-	* to an alignment boundary,load eight bytes and compare from
-	* the SRC2 alignment boundary. If all 8 bytes are equal,then start
-	* the second part's comparison. Otherwise finish the comparison.
-	* This special handle can garantee all the accesses are in the
-	* thread/task space in avoid to overrange access.
-	*/
-	ldr	data1, [src1,pos]
-	ldr	data2, [src2,pos]
-	eor	diff, data1, data2  /* Non-zero if differences found.  */
-	cbnz	diff, .Lnot_limit
-
-	/*The second part process*/
-	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
-	eor	diff, data1, data2  /* Non-zero if differences found.  */
-	subs	limit_wd, limit_wd, #1
-	csinv	endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/
-	cbz	endloop, .Lloopcmp_proc
-.Lunequal_proc:
-	cbz	diff, .Lremain8
-
-/* There is difference occurred in the latest comparison. */
-.Lnot_limit:
-/*
-* For little endian,reverse the low significant equal bits into MSB,then
-* following CLZ can find how many equal bits exist.
-*/
-CPU_LE( rev	diff, diff )
-CPU_LE( rev	data1, data1 )
-CPU_LE( rev	data2, data2 )
-
-	/*
-	* The MS-non-zero bit of DIFF marks either the first bit
-	* that is different, or the end of the significant data.
-	* Shifting left now will bring the critical information into the
-	* top bits.
-	*/
-	clz	pos, diff
-	lsl	data1, data1, pos
-	lsl	data2, data2, pos
-	/*
-	* We need to zero-extend (char is unsigned) the value and then
-	* perform a signed subtraction.
-	*/
-	lsr	data1, data1, #56
-	sub	result, data1, data2, lsr #56
-	ret
-
-.Lremain8:
-	/* Limit % 8 == 0 =>. all data are equal.*/
-	ands	limit, limit, #7
-	b.eq	.Lret0
-
-.Ltiny8proc:
-	ldrb	data1w, [src1], #1
-	ldrb	data2w, [src2], #1
-	subs	limit, limit, #1
-
-	ccmp	data1w, data2w, #0, ne  /* NZCV = 0b0000. */
-	b.eq	.Ltiny8proc
-	sub	result, data1, data2
-	ret
-.Lret0:
-	mov	result, #0
-	ret
 SYM_FUNC_END_PI(memcmp)
 EXPORT_SYMBOL_NOKASAN(memcmp)