Message ID | cd902b7d46ce5fe127ce02a2d98dbbb0429b2461.1620738177.git.robin.murphy@arm.com |
---|---|
State | New, archived |
Series | arm64: String function updates |
Hi Robin, On Tue, May 11, 2021 at 05:12:31PM +0100, Robin Murphy wrote: > From: Sam Tebbs <sam.tebbs@arm.com> > > Import the latest version of the former Cortex Strings - now > Arm Optimized Routines - memcmp function based on the upstream > code of string/aarch64/memcmp.S at commit e823e3a from > https://github.com/ARM-software/optimized-routines What's the licensing/copyright situation here? Because below... > > Signed-off-by: Sam Tebbs <sam.tebbs@arm.com> > [ rm: update attribution and commit message ] > Signed-off-by: Robin Murphy <robin.murphy@arm.com> > --- > arch/arm64/lib/memcmp.S | 330 ++++++++++++++-------------------------- > 1 file changed, 111 insertions(+), 219 deletions(-) > > diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S > index c0671e793ea9..498f0d9941d9 100644 > --- a/arch/arm64/lib/memcmp.S > +++ b/arch/arm64/lib/memcmp.S > @@ -1,247 +1,139 @@ > /* SPDX-License-Identifier: GPL-2.0-only */ ... this says GPL-2.0-only .... > /* > - * Copyright (C) 2013 ARM Ltd. > - * Copyright (C) 2013 Linaro. > + * Copyright (c) 2013-2020, Arm Limited. > * > - * This code is based on glibc cortex strings work originally authored by Linaro > - * be found @ > - * > - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ > - * files/head:/src/aarch64/ > + * Adapted from the original at: > + * https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/memcmp.S > */ ... but this referenced file says "SPDX-License-Identifier: MIT", and I don't know when this relicensing is legitimate. Thanks, Mark. > > #include <linux/linkage.h> > #include <asm/assembler.h> > > -/* > -* compare memory areas(when two memory areas' offset are different, > -* alignment handled by the hardware) > -* > -* Parameters: > -* x0 - const memory area 1 pointer > -* x1 - const memory area 2 pointer > -* x2 - the maximal compare byte length > -* Returns: > -* x0 - a compare result, maybe less than, equal to, or greater than ZERO > -*/ > +/* Assumptions: > + * > + * ARMv8-a, AArch64, unaligned accesses. > + */ > + > +#define L(label) .L ## label > > /* Parameters and result. */ > -src1 .req x0 > -src2 .req x1 > -limit .req x2 > -result .req x0 > +#define src1 x0 > +#define src2 x1 > +#define limit x2 > +#define result w0 > > /* Internal variables. */ > -data1 .req x3 > -data1w .req w3 > -data2 .req x4 > -data2w .req w4 > -has_nul .req x5 > -diff .req x6 > -endloop .req x7 > -tmp1 .req x8 > -tmp2 .req x9 > -tmp3 .req x10 > -pos .req x11 > -limit_wd .req x12 > -mask .req x13 > +#define data1 x3 > +#define data1w w3 > +#define data1h x4 > +#define data2 x5 > +#define data2w w5 > +#define data2h x6 > +#define tmp1 x7 > +#define tmp2 x8 > > SYM_FUNC_START_WEAK_PI(memcmp) > - cbz limit, .Lret0 > - eor tmp1, src1, src2 > - tst tmp1, #7 > - b.ne .Lmisaligned8 > - ands tmp1, src1, #7 > - b.ne .Lmutual_align > - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ > - lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ > - /* > - * The input source addresses are at alignment boundary. > - * Directly compare eight bytes each time. > - */ > -.Lloop_aligned: > - ldr data1, [src1], #8 > - ldr data2, [src2], #8 > -.Lstart_realigned: > - subs limit_wd, limit_wd, #1 > - eor diff, data1, data2 /* Non-zero if differences found. */ > - csinv endloop, diff, xzr, cs /* Last Dword or differences. */ > - cbz endloop, .Lloop_aligned > + subs limit, limit, 8 > + b.lo L(less8) > > - /* Not reached the limit, must have found a diff. 
*/ > - tbz limit_wd, #63, .Lnot_limit > + ldr data1, [src1], 8 > + ldr data2, [src2], 8 > + cmp data1, data2 > + b.ne L(return) > > - /* Limit % 8 == 0 => the diff is in the last 8 bytes. */ > - ands limit, limit, #7 > - b.eq .Lnot_limit > - /* > - * The remained bytes less than 8. It is needed to extract valid data > - * from last eight bytes of the intended memory range. > - */ > - lsl limit, limit, #3 /* bytes-> bits. */ > - mov mask, #~0 > -CPU_BE( lsr mask, mask, limit ) > -CPU_LE( lsl mask, mask, limit ) > - bic data1, data1, mask > - bic data2, data2, mask > + subs limit, limit, 8 > + b.gt L(more16) > > - orr diff, diff, mask > - b .Lnot_limit > + ldr data1, [src1, limit] > + ldr data2, [src2, limit] > + b L(return) > > -.Lmutual_align: > - /* > - * Sources are mutually aligned, but are not currently at an > - * alignment boundary. Round down the addresses and then mask off > - * the bytes that precede the start point. > - */ > - bic src1, src1, #7 > - bic src2, src2, #7 > - ldr data1, [src1], #8 > - ldr data2, [src2], #8 > - /* > - * We can not add limit with alignment offset(tmp1) here. Since the > - * addition probably make the limit overflown. > - */ > - sub limit_wd, limit, #1/*limit != 0, so no underflow.*/ > - and tmp3, limit_wd, #7 > - lsr limit_wd, limit_wd, #3 > - add tmp3, tmp3, tmp1 > - add limit_wd, limit_wd, tmp3, lsr #3 > - add limit, limit, tmp1/* Adjust the limit for the extra. */ > +L(more16): > + ldr data1, [src1], 8 > + ldr data2, [src2], 8 > + cmp data1, data2 > + bne L(return) > > - lsl tmp1, tmp1, #3/* Bytes beyond alignment -> bits.*/ > - neg tmp1, tmp1/* Bits to alignment -64. */ > - mov tmp2, #~0 > - /*mask off the non-intended bytes before the start address.*/ > -CPU_BE( lsl tmp2, tmp2, tmp1 )/*Big-endian.Early bytes are at MSB*/ > - /* Little-endian. Early bytes are at LSB. */ > -CPU_LE( lsr tmp2, tmp2, tmp1 ) > + /* Jump directly to comparing the last 16 bytes for 32 byte (or less) > + strings. */ > + subs limit, limit, 16 > + b.ls L(last_bytes) > > - orr data1, data1, tmp2 > - orr data2, data2, tmp2 > - b .Lstart_realigned > + /* We overlap loads between 0-32 bytes at either side of SRC1 when we > + try to align, so limit it only to strings larger than 128 bytes. */ > + cmp limit, 96 > + b.ls L(loop16) > > - /*src1 and src2 have different alignment offset.*/ > -.Lmisaligned8: > - cmp limit, #8 > - b.lo .Ltiny8proc /*limit < 8: compare byte by byte*/ > + /* Align src1 and adjust src2 with bytes not yet done. */ > + and tmp1, src1, 15 > + add limit, limit, tmp1 > + sub src1, src1, tmp1 > + sub src2, src2, tmp1 > > - and tmp1, src1, #7 > - neg tmp1, tmp1 > - add tmp1, tmp1, #8/*valid length in the first 8 bytes of src1*/ > - and tmp2, src2, #7 > - neg tmp2, tmp2 > - add tmp2, tmp2, #8/*valid length in the first 8 bytes of src2*/ > - subs tmp3, tmp1, tmp2 > - csel pos, tmp1, tmp2, hi /*Choose the maximum.*/ > + /* Loop performing 16 bytes per iteration using aligned src1. > + Limit is pre-decremented by 16 and must be larger than zero. > + Exit if <= 16 bytes left to do or if the data is not equal. */ > + .p2align 4 > +L(loop16): > + ldp data1, data1h, [src1], 16 > + ldp data2, data2h, [src2], 16 > + subs limit, limit, 16 > + ccmp data1, data2, 0, hi > + ccmp data1h, data2h, 0, eq > + b.eq L(loop16) > > - sub limit, limit, pos > - /*compare the proceeding bytes in the first 8 byte segment.*/ > -.Ltinycmp: > - ldrb data1w, [src1], #1 > - ldrb data2w, [src2], #1 > - subs pos, pos, #1 > - ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. 
*/ > - b.eq .Ltinycmp > - cbnz pos, 1f /*diff occurred before the last byte.*/ > + cmp data1, data2 > + bne L(return) > + mov data1, data1h > + mov data2, data2h > + cmp data1, data2 > + bne L(return) > + > + /* Compare last 1-16 bytes using unaligned access. */ > +L(last_bytes): > + add src1, src1, limit > + add src2, src2, limit > + ldp data1, data1h, [src1] > + ldp data2, data2h, [src2] > + cmp data1, data2 > + bne L(return) > + mov data1, data1h > + mov data2, data2h > + cmp data1, data2 > + > + /* Compare data bytes and set return value to 0, -1 or 1. */ > +L(return): > +#ifndef __AARCH64EB__ > + rev data1, data1 > + rev data2, data2 > +#endif > + cmp data1, data2 > +L(ret_eq): > + cset result, ne > + cneg result, result, lo > + ret > + > + .p2align 4 > + /* Compare up to 8 bytes. Limit is [-8..-1]. */ > +L(less8): > + adds limit, limit, 4 > + b.lo L(less4) > + ldr data1w, [src1], 4 > + ldr data2w, [src2], 4 > cmp data1w, data2w > - b.eq .Lstart_align > -1: > - sub result, data1, data2 > + b.ne L(return) > + sub limit, limit, 4 > +L(less4): > + adds limit, limit, 4 > + beq L(ret_eq) > +L(byte_loop): > + ldrb data1w, [src1], 1 > + ldrb data2w, [src2], 1 > + subs limit, limit, 1 > + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ > + b.eq L(byte_loop) > + sub result, data1w, data2w > ret > > -.Lstart_align: > - lsr limit_wd, limit, #3 > - cbz limit_wd, .Lremain8 > - > - ands xzr, src1, #7 > - b.eq .Lrecal_offset > - /*process more leading bytes to make src1 aligned...*/ > - add src1, src1, tmp3 /*backwards src1 to alignment boundary*/ > - add src2, src2, tmp3 > - sub limit, limit, tmp3 > - lsr limit_wd, limit, #3 > - cbz limit_wd, .Lremain8 > - /*load 8 bytes from aligned SRC1..*/ > - ldr data1, [src1], #8 > - ldr data2, [src2], #8 > - > - subs limit_wd, limit_wd, #1 > - eor diff, data1, data2 /*Non-zero if differences found.*/ > - csinv endloop, diff, xzr, ne > - cbnz endloop, .Lunequal_proc > - /*How far is the current SRC2 from the alignment boundary...*/ > - and tmp3, tmp3, #7 > - > -.Lrecal_offset:/*src1 is aligned now..*/ > - neg pos, tmp3 > -.Lloopcmp_proc: > - /* > - * Divide the eight bytes into two parts. First,backwards the src2 > - * to an alignment boundary,load eight bytes and compare from > - * the SRC2 alignment boundary. If all 8 bytes are equal,then start > - * the second part's comparison. Otherwise finish the comparison. > - * This special handle can garantee all the accesses are in the > - * thread/task space in avoid to overrange access. > - */ > - ldr data1, [src1,pos] > - ldr data2, [src2,pos] > - eor diff, data1, data2 /* Non-zero if differences found. */ > - cbnz diff, .Lnot_limit > - > - /*The second part process*/ > - ldr data1, [src1], #8 > - ldr data2, [src2], #8 > - eor diff, data1, data2 /* Non-zero if differences found. */ > - subs limit_wd, limit_wd, #1 > - csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/ > - cbz endloop, .Lloopcmp_proc > -.Lunequal_proc: > - cbz diff, .Lremain8 > - > -/* There is difference occurred in the latest comparison. */ > -.Lnot_limit: > -/* > -* For little endian,reverse the low significant equal bits into MSB,then > -* following CLZ can find how many equal bits exist. > -*/ > -CPU_LE( rev diff, diff ) > -CPU_LE( rev data1, data1 ) > -CPU_LE( rev data2, data2 ) > - > - /* > - * The MS-non-zero bit of DIFF marks either the first bit > - * that is different, or the end of the significant data. > - * Shifting left now will bring the critical information into the > - * top bits. 
> - */ > - clz pos, diff > - lsl data1, data1, pos > - lsl data2, data2, pos > - /* > - * We need to zero-extend (char is unsigned) the value and then > - * perform a signed subtraction. > - */ > - lsr data1, data1, #56 > - sub result, data1, data2, lsr #56 > - ret > - > -.Lremain8: > - /* Limit % 8 == 0 =>. all data are equal.*/ > - ands limit, limit, #7 > - b.eq .Lret0 > - > -.Ltiny8proc: > - ldrb data1w, [src1], #1 > - ldrb data2w, [src2], #1 > - subs limit, limit, #1 > - > - ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */ > - b.eq .Ltiny8proc > - sub result, data1, data2 > - ret > -.Lret0: > - mov result, #0 > - ret > SYM_FUNC_END_PI(memcmp) > EXPORT_SYMBOL_NOKASAN(memcmp) > -- > 2.21.0.dirty > > > _______________________________________________ > linux-arm-kernel mailing list > linux-arm-kernel@lists.infradead.org > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
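As an aside on the imported routine quoted above: the result computation at L(return) byte-reverses both 64-bit chunks so that a single unsigned compare orders them the way a byte-wise unsigned memcmp would. The C sketch below is illustrative only, not part of the patch or this thread; chunk_result() is a hypothetical helper, and it assumes GCC/Clang's __builtin_bswap64 together with the same __AARCH64EB__ convention the assembly uses.

#include <stdint.h>

/*
 * Sketch of the L(return) logic: data1/data2 hold 8 bytes loaded from
 * the same offset of each buffer. On little-endian, the byte at the
 * lowest address sits in the least significant bits, so byte-swapping
 * both values first makes one unsigned 64-bit comparison equivalent to
 * an unsigned byte-by-byte comparison.
 */
static int chunk_result(uint64_t data1, uint64_t data2)
{
#ifndef __AARCH64EB__			/* the "rev" pair, LE builds only */
	data1 = __builtin_bswap64(data1);
	data2 = __builtin_bswap64(data2);
#endif
	if (data1 == data2)
		return 0;		/* cset result, ne */
	return data1 < data2 ? -1 : 1;	/* cneg result, result, lo */
}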
[ Dropping Kai Shen who is now bouncing, adding Szabolcs just in case ] On 2021-05-12 14:28, Mark Rutland wrote: > Hi Robin, > > On Tue, May 11, 2021 at 05:12:31PM +0100, Robin Murphy wrote: >> From: Sam Tebbs <sam.tebbs@arm.com> >> >> Import the latest version of the former Cortex Strings - now >> Arm Optimized Routines - memcmp function based on the upstream >> code of string/aarch64/memcmp.S at commit e823e3a from >> https://github.com/ARM-software/optimized-routines > > What's the licensing/copyright situation here? > > Because below... > >> >> Signed-off-by: Sam Tebbs <sam.tebbs@arm.com> >> [ rm: update attribution and commit message ] >> Signed-off-by: Robin Murphy <robin.murphy@arm.com> >> --- >> arch/arm64/lib/memcmp.S | 330 ++++++++++++++-------------------------- >> 1 file changed, 111 insertions(+), 219 deletions(-) >> >> diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S >> index c0671e793ea9..498f0d9941d9 100644 >> --- a/arch/arm64/lib/memcmp.S >> +++ b/arch/arm64/lib/memcmp.S >> @@ -1,247 +1,139 @@ >> /* SPDX-License-Identifier: GPL-2.0-only */ > > ... this says GPL-2.0-only .... > >> /* >> - * Copyright (C) 2013 ARM Ltd. >> - * Copyright (C) 2013 Linaro. >> + * Copyright (c) 2013-2020, Arm Limited. >> * >> - * This code is based on glibc cortex strings work originally authored by Linaro >> - * be found @ >> - * >> - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ >> - * files/head:/src/aarch64/ >> + * Adapted from the original at: >> + * https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/memcmp.S >> */ > > ... but this referenced file says "SPDX-License-Identifier: MIT", and I > don't know when this relicensing is legitimate. We were told that since the copyright was fully assigned back to Arm in the move from Cortex Strings to Arm Optimized Routines, we are free to relicense it as we see fit, so a GPLv2 submission to Linux was still fine. Robin. > Thanks, > Mark. > >> >> #include <linux/linkage.h> >> #include <asm/assembler.h> >> >> -/* >> -* compare memory areas(when two memory areas' offset are different, >> -* alignment handled by the hardware) >> -* >> -* Parameters: >> -* x0 - const memory area 1 pointer >> -* x1 - const memory area 2 pointer >> -* x2 - the maximal compare byte length >> -* Returns: >> -* x0 - a compare result, maybe less than, equal to, or greater than ZERO >> -*/ >> +/* Assumptions: >> + * >> + * ARMv8-a, AArch64, unaligned accesses. >> + */ >> + >> +#define L(label) .L ## label >> >> /* Parameters and result. */ >> -src1 .req x0 >> -src2 .req x1 >> -limit .req x2 >> -result .req x0 >> +#define src1 x0 >> +#define src2 x1 >> +#define limit x2 >> +#define result w0 >> >> /* Internal variables. */ >> -data1 .req x3 >> -data1w .req w3 >> -data2 .req x4 >> -data2w .req w4 >> -has_nul .req x5 >> -diff .req x6 >> -endloop .req x7 >> -tmp1 .req x8 >> -tmp2 .req x9 >> -tmp3 .req x10 >> -pos .req x11 >> -limit_wd .req x12 >> -mask .req x13 >> +#define data1 x3 >> +#define data1w w3 >> +#define data1h x4 >> +#define data2 x5 >> +#define data2w w5 >> +#define data2h x6 >> +#define tmp1 x7 >> +#define tmp2 x8 >> >> SYM_FUNC_START_WEAK_PI(memcmp) >> - cbz limit, .Lret0 >> - eor tmp1, src1, src2 >> - tst tmp1, #7 >> - b.ne .Lmisaligned8 >> - ands tmp1, src1, #7 >> - b.ne .Lmutual_align >> - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ >> - lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ >> - /* >> - * The input source addresses are at alignment boundary. 
>> - * Directly compare eight bytes each time. >> - */ >> -.Lloop_aligned: >> - ldr data1, [src1], #8 >> - ldr data2, [src2], #8 >> -.Lstart_realigned: >> - subs limit_wd, limit_wd, #1 >> - eor diff, data1, data2 /* Non-zero if differences found. */ >> - csinv endloop, diff, xzr, cs /* Last Dword or differences. */ >> - cbz endloop, .Lloop_aligned >> + subs limit, limit, 8 >> + b.lo L(less8) >> >> - /* Not reached the limit, must have found a diff. */ >> - tbz limit_wd, #63, .Lnot_limit >> + ldr data1, [src1], 8 >> + ldr data2, [src2], 8 >> + cmp data1, data2 >> + b.ne L(return) >> >> - /* Limit % 8 == 0 => the diff is in the last 8 bytes. */ >> - ands limit, limit, #7 >> - b.eq .Lnot_limit >> - /* >> - * The remained bytes less than 8. It is needed to extract valid data >> - * from last eight bytes of the intended memory range. >> - */ >> - lsl limit, limit, #3 /* bytes-> bits. */ >> - mov mask, #~0 >> -CPU_BE( lsr mask, mask, limit ) >> -CPU_LE( lsl mask, mask, limit ) >> - bic data1, data1, mask >> - bic data2, data2, mask >> + subs limit, limit, 8 >> + b.gt L(more16) >> >> - orr diff, diff, mask >> - b .Lnot_limit >> + ldr data1, [src1, limit] >> + ldr data2, [src2, limit] >> + b L(return) >> >> -.Lmutual_align: >> - /* >> - * Sources are mutually aligned, but are not currently at an >> - * alignment boundary. Round down the addresses and then mask off >> - * the bytes that precede the start point. >> - */ >> - bic src1, src1, #7 >> - bic src2, src2, #7 >> - ldr data1, [src1], #8 >> - ldr data2, [src2], #8 >> - /* >> - * We can not add limit with alignment offset(tmp1) here. Since the >> - * addition probably make the limit overflown. >> - */ >> - sub limit_wd, limit, #1/*limit != 0, so no underflow.*/ >> - and tmp3, limit_wd, #7 >> - lsr limit_wd, limit_wd, #3 >> - add tmp3, tmp3, tmp1 >> - add limit_wd, limit_wd, tmp3, lsr #3 >> - add limit, limit, tmp1/* Adjust the limit for the extra. */ >> +L(more16): >> + ldr data1, [src1], 8 >> + ldr data2, [src2], 8 >> + cmp data1, data2 >> + bne L(return) >> >> - lsl tmp1, tmp1, #3/* Bytes beyond alignment -> bits.*/ >> - neg tmp1, tmp1/* Bits to alignment -64. */ >> - mov tmp2, #~0 >> - /*mask off the non-intended bytes before the start address.*/ >> -CPU_BE( lsl tmp2, tmp2, tmp1 )/*Big-endian.Early bytes are at MSB*/ >> - /* Little-endian. Early bytes are at LSB. */ >> -CPU_LE( lsr tmp2, tmp2, tmp1 ) >> + /* Jump directly to comparing the last 16 bytes for 32 byte (or less) >> + strings. */ >> + subs limit, limit, 16 >> + b.ls L(last_bytes) >> >> - orr data1, data1, tmp2 >> - orr data2, data2, tmp2 >> - b .Lstart_realigned >> + /* We overlap loads between 0-32 bytes at either side of SRC1 when we >> + try to align, so limit it only to strings larger than 128 bytes. */ >> + cmp limit, 96 >> + b.ls L(loop16) >> >> - /*src1 and src2 have different alignment offset.*/ >> -.Lmisaligned8: >> - cmp limit, #8 >> - b.lo .Ltiny8proc /*limit < 8: compare byte by byte*/ >> + /* Align src1 and adjust src2 with bytes not yet done. */ >> + and tmp1, src1, 15 >> + add limit, limit, tmp1 >> + sub src1, src1, tmp1 >> + sub src2, src2, tmp1 >> >> - and tmp1, src1, #7 >> - neg tmp1, tmp1 >> - add tmp1, tmp1, #8/*valid length in the first 8 bytes of src1*/ >> - and tmp2, src2, #7 >> - neg tmp2, tmp2 >> - add tmp2, tmp2, #8/*valid length in the first 8 bytes of src2*/ >> - subs tmp3, tmp1, tmp2 >> - csel pos, tmp1, tmp2, hi /*Choose the maximum.*/ >> + /* Loop performing 16 bytes per iteration using aligned src1. 
>> + Limit is pre-decremented by 16 and must be larger than zero. >> + Exit if <= 16 bytes left to do or if the data is not equal. */ >> + .p2align 4 >> +L(loop16): >> + ldp data1, data1h, [src1], 16 >> + ldp data2, data2h, [src2], 16 >> + subs limit, limit, 16 >> + ccmp data1, data2, 0, hi >> + ccmp data1h, data2h, 0, eq >> + b.eq L(loop16) >> >> - sub limit, limit, pos >> - /*compare the proceeding bytes in the first 8 byte segment.*/ >> -.Ltinycmp: >> - ldrb data1w, [src1], #1 >> - ldrb data2w, [src2], #1 >> - subs pos, pos, #1 >> - ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */ >> - b.eq .Ltinycmp >> - cbnz pos, 1f /*diff occurred before the last byte.*/ >> + cmp data1, data2 >> + bne L(return) >> + mov data1, data1h >> + mov data2, data2h >> + cmp data1, data2 >> + bne L(return) >> + >> + /* Compare last 1-16 bytes using unaligned access. */ >> +L(last_bytes): >> + add src1, src1, limit >> + add src2, src2, limit >> + ldp data1, data1h, [src1] >> + ldp data2, data2h, [src2] >> + cmp data1, data2 >> + bne L(return) >> + mov data1, data1h >> + mov data2, data2h >> + cmp data1, data2 >> + >> + /* Compare data bytes and set return value to 0, -1 or 1. */ >> +L(return): >> +#ifndef __AARCH64EB__ >> + rev data1, data1 >> + rev data2, data2 >> +#endif >> + cmp data1, data2 >> +L(ret_eq): >> + cset result, ne >> + cneg result, result, lo >> + ret >> + >> + .p2align 4 >> + /* Compare up to 8 bytes. Limit is [-8..-1]. */ >> +L(less8): >> + adds limit, limit, 4 >> + b.lo L(less4) >> + ldr data1w, [src1], 4 >> + ldr data2w, [src2], 4 >> cmp data1w, data2w >> - b.eq .Lstart_align >> -1: >> - sub result, data1, data2 >> + b.ne L(return) >> + sub limit, limit, 4 >> +L(less4): >> + adds limit, limit, 4 >> + beq L(ret_eq) >> +L(byte_loop): >> + ldrb data1w, [src1], 1 >> + ldrb data2w, [src2], 1 >> + subs limit, limit, 1 >> + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */ >> + b.eq L(byte_loop) >> + sub result, data1w, data2w >> ret >> >> -.Lstart_align: >> - lsr limit_wd, limit, #3 >> - cbz limit_wd, .Lremain8 >> - >> - ands xzr, src1, #7 >> - b.eq .Lrecal_offset >> - /*process more leading bytes to make src1 aligned...*/ >> - add src1, src1, tmp3 /*backwards src1 to alignment boundary*/ >> - add src2, src2, tmp3 >> - sub limit, limit, tmp3 >> - lsr limit_wd, limit, #3 >> - cbz limit_wd, .Lremain8 >> - /*load 8 bytes from aligned SRC1..*/ >> - ldr data1, [src1], #8 >> - ldr data2, [src2], #8 >> - >> - subs limit_wd, limit_wd, #1 >> - eor diff, data1, data2 /*Non-zero if differences found.*/ >> - csinv endloop, diff, xzr, ne >> - cbnz endloop, .Lunequal_proc >> - /*How far is the current SRC2 from the alignment boundary...*/ >> - and tmp3, tmp3, #7 >> - >> -.Lrecal_offset:/*src1 is aligned now..*/ >> - neg pos, tmp3 >> -.Lloopcmp_proc: >> - /* >> - * Divide the eight bytes into two parts. First,backwards the src2 >> - * to an alignment boundary,load eight bytes and compare from >> - * the SRC2 alignment boundary. If all 8 bytes are equal,then start >> - * the second part's comparison. Otherwise finish the comparison. >> - * This special handle can garantee all the accesses are in the >> - * thread/task space in avoid to overrange access. >> - */ >> - ldr data1, [src1,pos] >> - ldr data2, [src2,pos] >> - eor diff, data1, data2 /* Non-zero if differences found. */ >> - cbnz diff, .Lnot_limit >> - >> - /*The second part process*/ >> - ldr data1, [src1], #8 >> - ldr data2, [src2], #8 >> - eor diff, data1, data2 /* Non-zero if differences found. 
*/ >> - subs limit_wd, limit_wd, #1 >> - csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/ >> - cbz endloop, .Lloopcmp_proc >> -.Lunequal_proc: >> - cbz diff, .Lremain8 >> - >> -/* There is difference occurred in the latest comparison. */ >> -.Lnot_limit: >> -/* >> -* For little endian,reverse the low significant equal bits into MSB,then >> -* following CLZ can find how many equal bits exist. >> -*/ >> -CPU_LE( rev diff, diff ) >> -CPU_LE( rev data1, data1 ) >> -CPU_LE( rev data2, data2 ) >> - >> - /* >> - * The MS-non-zero bit of DIFF marks either the first bit >> - * that is different, or the end of the significant data. >> - * Shifting left now will bring the critical information into the >> - * top bits. >> - */ >> - clz pos, diff >> - lsl data1, data1, pos >> - lsl data2, data2, pos >> - /* >> - * We need to zero-extend (char is unsigned) the value and then >> - * perform a signed subtraction. >> - */ >> - lsr data1, data1, #56 >> - sub result, data1, data2, lsr #56 >> - ret >> - >> -.Lremain8: >> - /* Limit % 8 == 0 =>. all data are equal.*/ >> - ands limit, limit, #7 >> - b.eq .Lret0 >> - >> -.Ltiny8proc: >> - ldrb data1w, [src1], #1 >> - ldrb data2w, [src2], #1 >> - subs limit, limit, #1 >> - >> - ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */ >> - b.eq .Ltiny8proc >> - sub result, data1, data2 >> - ret >> -.Lret0: >> - mov result, #0 >> - ret >> SYM_FUNC_END_PI(memcmp) >> EXPORT_SYMBOL_NOKASAN(memcmp) >> -- >> 2.21.0.dirty >> >> >> _______________________________________________ >> linux-arm-kernel mailing list >> linux-arm-kernel@lists.infradead.org >> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
The 05/12/2021 14:38, Robin Murphy wrote:
> [ Dropping Kai Shen who is now bouncing, adding Szabolcs just in case ]
> On 2021-05-12 14:28, Mark Rutland wrote:
> > On Tue, May 11, 2021 at 05:12:31PM +0100, Robin Murphy wrote:
> > > From: Sam Tebbs <sam.tebbs@arm.com>
> > >
> > > Import the latest version of the former Cortex Strings - now
> > > Arm Optimized Routines - memcmp function based on the upstream
> > > code of string/aarch64/memcmp.S at commit e823e3a from
> > > https://github.com/ARM-software/optimized-routines
> >
> > What's the licensing/copyright situation here?
> >
> > Because below...
> > >
> > > Signed-off-by: Sam Tebbs <sam.tebbs@arm.com>
> > > [ rm: update attribution and commit message ]
> > > Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> > > ---
> > >  arch/arm64/lib/memcmp.S | 330 ++++++++++++++--------------------------
> > >  1 file changed, 111 insertions(+), 219 deletions(-)
> > >
> > > diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S
> > > index c0671e793ea9..498f0d9941d9 100644
> > > --- a/arch/arm64/lib/memcmp.S
> > > +++ b/arch/arm64/lib/memcmp.S
> > > @@ -1,247 +1,139 @@
> > >  /* SPDX-License-Identifier: GPL-2.0-only */
> >
> > ... this says GPL-2.0-only ....
> >
> > >  /*
> > > - * Copyright (C) 2013 ARM Ltd.
> > > - * Copyright (C) 2013 Linaro.
> > > + * Copyright (c) 2013-2020, Arm Limited.
> > >   *
> > > - * This code is based on glibc cortex strings work originally authored by Linaro
> > > - * be found @
> > > - *
> > > - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
> > > - * files/head:/src/aarch64/
> > > + * Adapted from the original at:
> > > + * https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/memcmp.S
> > >  */
> >
> > ... but this referenced file says "SPDX-License-Identifier: MIT", and I
> > don't know when this relicensing is legitimate.
>
> We were told that since the copyright was fully assigned back to Arm in the
> move from Cortex Strings to Arm Optimized Routines, we are free to relicense
> it as we see fit, so a GPLv2 submission to Linux was still fine.

yes, this code is fully owned by arm now, but the
licensing is not clear to an outside observer.

arm optimized-routines uses MIT license since that
seemed to be widely accepted, but then we were asked
to contribute the code under BSD3, LLVM, GPLv2,...
which is a bit of a pain (since then arm has to get
involved, we wanted others to take the code without
us having to do anything).
On Wed, May 12, 2021 at 03:51:56PM +0100, Szabolcs Nagy wrote:
> The 05/12/2021 14:38, Robin Murphy wrote:
> > [ Dropping Kai Shen who is now bouncing, adding Szabolcs just in case ]
> > On 2021-05-12 14:28, Mark Rutland wrote:
> > > On Tue, May 11, 2021 at 05:12:31PM +0100, Robin Murphy wrote:
> > > > From: Sam Tebbs <sam.tebbs@arm.com>
> > > >
> > > > Import the latest version of the former Cortex Strings - now
> > > > Arm Optimized Routines - memcmp function based on the upstream
> > > > code of string/aarch64/memcmp.S at commit e823e3a from
> > > > https://github.com/ARM-software/optimized-routines
> > >
> > > What's the licensing/copyright situation here?
> > >
> > > Because below...
> > > > --- a/arch/arm64/lib/memcmp.S
> > > > +++ b/arch/arm64/lib/memcmp.S
> > > > @@ -1,247 +1,139 @@
> > > >  /* SPDX-License-Identifier: GPL-2.0-only */
> > >
> > > ... this says GPL-2.0-only ....
> > >
> > > >  /*
> > > > - * Copyright (C) 2013 ARM Ltd.
> > > > - * Copyright (C) 2013 Linaro.
> > > > + * Copyright (c) 2013-2020, Arm Limited.
> > > >   *
> > > > - * This code is based on glibc cortex strings work originally authored by Linaro
> > > > - * be found @
> > > > - *
> > > > - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
> > > > - * files/head:/src/aarch64/
> > > > + * Adapted from the original at:
> > > > + * https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/memcmp.S
> > > >  */
> > >
> > > ... but this referenced file says "SPDX-License-Identifier: MIT", and I
> > > don't know when this relicensing is legitimate.
> >
> > We were told that since the copyright was fully assigned back to Arm in the
> > move from Cortex Strings to Arm Optimized Routines, we are free to relicense
> > it as we see fit, so a GPLv2 submission to Linux was still fine.
>
> yes, this code is fully owned by arm now, but the
> licensing is not clear to an outside observer.

Thanks for confirming! It would be good if we could note this in the
comment and/or the commit message, since otherwise this isn't clear to
said outside observers.

> arm optimized-routines uses MIT license since that
> seemed to be widely accepted, but then we were asked
> to contribute the code under BSD3, LLVM, GPLv2,...
> which is a bit of a pain (since then arm has to get
> involved, we wanted others to take the code without
> us having to do anything).

Sure, that makes sense. My main concern here is that we have the right
to relicense this as GPLv2, since that avoids any pain points that come
from mixed licensing.

Thanks,
Mark.
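As a side note on that suggestion, one way such provenance could be recorded is an extra sentence in the file's header comment. The wording below is purely illustrative and was not proposed anywhere in this thread; it only restates what Robin and Szabolcs confirm above (Arm owns the code outright and has relicensed this copy for the kernel).

/*
 * Copyright (c) 2013-2020, Arm Limited.
 *
 * Adapted from the original at:
 * https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/memcmp.S
 *
 * The upstream copy is MIT-licensed; Arm, as the sole copyright holder,
 * has relicensed this adaptation under GPL-2.0-only for the kernel.
 */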
diff --git a/arch/arm64/lib/memcmp.S b/arch/arm64/lib/memcmp.S index c0671e793ea9..498f0d9941d9 100644 --- a/arch/arm64/lib/memcmp.S +++ b/arch/arm64/lib/memcmp.S @@ -1,247 +1,139 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2013 ARM Ltd. - * Copyright (C) 2013 Linaro. + * Copyright (c) 2013-2020, Arm Limited. * - * This code is based on glibc cortex strings work originally authored by Linaro - * be found @ - * - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ - * files/head:/src/aarch64/ + * Adapted from the original at: + * https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/memcmp.S */ #include <linux/linkage.h> #include <asm/assembler.h> -/* -* compare memory areas(when two memory areas' offset are different, -* alignment handled by the hardware) -* -* Parameters: -* x0 - const memory area 1 pointer -* x1 - const memory area 2 pointer -* x2 - the maximal compare byte length -* Returns: -* x0 - a compare result, maybe less than, equal to, or greater than ZERO -*/ +/* Assumptions: + * + * ARMv8-a, AArch64, unaligned accesses. + */ + +#define L(label) .L ## label /* Parameters and result. */ -src1 .req x0 -src2 .req x1 -limit .req x2 -result .req x0 +#define src1 x0 +#define src2 x1 +#define limit x2 +#define result w0 /* Internal variables. */ -data1 .req x3 -data1w .req w3 -data2 .req x4 -data2w .req w4 -has_nul .req x5 -diff .req x6 -endloop .req x7 -tmp1 .req x8 -tmp2 .req x9 -tmp3 .req x10 -pos .req x11 -limit_wd .req x12 -mask .req x13 +#define data1 x3 +#define data1w w3 +#define data1h x4 +#define data2 x5 +#define data2w w5 +#define data2h x6 +#define tmp1 x7 +#define tmp2 x8 SYM_FUNC_START_WEAK_PI(memcmp) - cbz limit, .Lret0 - eor tmp1, src1, src2 - tst tmp1, #7 - b.ne .Lmisaligned8 - ands tmp1, src1, #7 - b.ne .Lmutual_align - sub limit_wd, limit, #1 /* limit != 0, so no underflow. */ - lsr limit_wd, limit_wd, #3 /* Convert to Dwords. */ - /* - * The input source addresses are at alignment boundary. - * Directly compare eight bytes each time. - */ -.Lloop_aligned: - ldr data1, [src1], #8 - ldr data2, [src2], #8 -.Lstart_realigned: - subs limit_wd, limit_wd, #1 - eor diff, data1, data2 /* Non-zero if differences found. */ - csinv endloop, diff, xzr, cs /* Last Dword or differences. */ - cbz endloop, .Lloop_aligned + subs limit, limit, 8 + b.lo L(less8) - /* Not reached the limit, must have found a diff. */ - tbz limit_wd, #63, .Lnot_limit + ldr data1, [src1], 8 + ldr data2, [src2], 8 + cmp data1, data2 + b.ne L(return) - /* Limit % 8 == 0 => the diff is in the last 8 bytes. */ - ands limit, limit, #7 - b.eq .Lnot_limit - /* - * The remained bytes less than 8. It is needed to extract valid data - * from last eight bytes of the intended memory range. - */ - lsl limit, limit, #3 /* bytes-> bits. */ - mov mask, #~0 -CPU_BE( lsr mask, mask, limit ) -CPU_LE( lsl mask, mask, limit ) - bic data1, data1, mask - bic data2, data2, mask + subs limit, limit, 8 + b.gt L(more16) - orr diff, diff, mask - b .Lnot_limit + ldr data1, [src1, limit] + ldr data2, [src2, limit] + b L(return) -.Lmutual_align: - /* - * Sources are mutually aligned, but are not currently at an - * alignment boundary. Round down the addresses and then mask off - * the bytes that precede the start point. - */ - bic src1, src1, #7 - bic src2, src2, #7 - ldr data1, [src1], #8 - ldr data2, [src2], #8 - /* - * We can not add limit with alignment offset(tmp1) here. Since the - * addition probably make the limit overflown. 
- */ - sub limit_wd, limit, #1/*limit != 0, so no underflow.*/ - and tmp3, limit_wd, #7 - lsr limit_wd, limit_wd, #3 - add tmp3, tmp3, tmp1 - add limit_wd, limit_wd, tmp3, lsr #3 - add limit, limit, tmp1/* Adjust the limit for the extra. */ +L(more16): + ldr data1, [src1], 8 + ldr data2, [src2], 8 + cmp data1, data2 + bne L(return) - lsl tmp1, tmp1, #3/* Bytes beyond alignment -> bits.*/ - neg tmp1, tmp1/* Bits to alignment -64. */ - mov tmp2, #~0 - /*mask off the non-intended bytes before the start address.*/ -CPU_BE( lsl tmp2, tmp2, tmp1 )/*Big-endian.Early bytes are at MSB*/ - /* Little-endian. Early bytes are at LSB. */ -CPU_LE( lsr tmp2, tmp2, tmp1 ) + /* Jump directly to comparing the last 16 bytes for 32 byte (or less) + strings. */ + subs limit, limit, 16 + b.ls L(last_bytes) - orr data1, data1, tmp2 - orr data2, data2, tmp2 - b .Lstart_realigned + /* We overlap loads between 0-32 bytes at either side of SRC1 when we + try to align, so limit it only to strings larger than 128 bytes. */ + cmp limit, 96 + b.ls L(loop16) - /*src1 and src2 have different alignment offset.*/ -.Lmisaligned8: - cmp limit, #8 - b.lo .Ltiny8proc /*limit < 8: compare byte by byte*/ + /* Align src1 and adjust src2 with bytes not yet done. */ + and tmp1, src1, 15 + add limit, limit, tmp1 + sub src1, src1, tmp1 + sub src2, src2, tmp1 - and tmp1, src1, #7 - neg tmp1, tmp1 - add tmp1, tmp1, #8/*valid length in the first 8 bytes of src1*/ - and tmp2, src2, #7 - neg tmp2, tmp2 - add tmp2, tmp2, #8/*valid length in the first 8 bytes of src2*/ - subs tmp3, tmp1, tmp2 - csel pos, tmp1, tmp2, hi /*Choose the maximum.*/ + /* Loop performing 16 bytes per iteration using aligned src1. + Limit is pre-decremented by 16 and must be larger than zero. + Exit if <= 16 bytes left to do or if the data is not equal. */ + .p2align 4 +L(loop16): + ldp data1, data1h, [src1], 16 + ldp data2, data2h, [src2], 16 + subs limit, limit, 16 + ccmp data1, data2, 0, hi + ccmp data1h, data2h, 0, eq + b.eq L(loop16) - sub limit, limit, pos - /*compare the proceeding bytes in the first 8 byte segment.*/ -.Ltinycmp: - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - subs pos, pos, #1 - ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */ - b.eq .Ltinycmp - cbnz pos, 1f /*diff occurred before the last byte.*/ + cmp data1, data2 + bne L(return) + mov data1, data1h + mov data2, data2h + cmp data1, data2 + bne L(return) + + /* Compare last 1-16 bytes using unaligned access. */ +L(last_bytes): + add src1, src1, limit + add src2, src2, limit + ldp data1, data1h, [src1] + ldp data2, data2h, [src2] + cmp data1, data2 + bne L(return) + mov data1, data1h + mov data2, data2h + cmp data1, data2 + + /* Compare data bytes and set return value to 0, -1 or 1. */ +L(return): +#ifndef __AARCH64EB__ + rev data1, data1 + rev data2, data2 +#endif + cmp data1, data2 +L(ret_eq): + cset result, ne + cneg result, result, lo + ret + + .p2align 4 + /* Compare up to 8 bytes. Limit is [-8..-1]. */ +L(less8): + adds limit, limit, 4 + b.lo L(less4) + ldr data1w, [src1], 4 + ldr data2w, [src2], 4 cmp data1w, data2w - b.eq .Lstart_align -1: - sub result, data1, data2 + b.ne L(return) + sub limit, limit, 4 +L(less4): + adds limit, limit, 4 + beq L(ret_eq) +L(byte_loop): + ldrb data1w, [src1], 1 + ldrb data2w, [src2], 1 + subs limit, limit, 1 + ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. 
*/ + b.eq L(byte_loop) + sub result, data1w, data2w ret -.Lstart_align: - lsr limit_wd, limit, #3 - cbz limit_wd, .Lremain8 - - ands xzr, src1, #7 - b.eq .Lrecal_offset - /*process more leading bytes to make src1 aligned...*/ - add src1, src1, tmp3 /*backwards src1 to alignment boundary*/ - add src2, src2, tmp3 - sub limit, limit, tmp3 - lsr limit_wd, limit, #3 - cbz limit_wd, .Lremain8 - /*load 8 bytes from aligned SRC1..*/ - ldr data1, [src1], #8 - ldr data2, [src2], #8 - - subs limit_wd, limit_wd, #1 - eor diff, data1, data2 /*Non-zero if differences found.*/ - csinv endloop, diff, xzr, ne - cbnz endloop, .Lunequal_proc - /*How far is the current SRC2 from the alignment boundary...*/ - and tmp3, tmp3, #7 - -.Lrecal_offset:/*src1 is aligned now..*/ - neg pos, tmp3 -.Lloopcmp_proc: - /* - * Divide the eight bytes into two parts. First,backwards the src2 - * to an alignment boundary,load eight bytes and compare from - * the SRC2 alignment boundary. If all 8 bytes are equal,then start - * the second part's comparison. Otherwise finish the comparison. - * This special handle can garantee all the accesses are in the - * thread/task space in avoid to overrange access. - */ - ldr data1, [src1,pos] - ldr data2, [src2,pos] - eor diff, data1, data2 /* Non-zero if differences found. */ - cbnz diff, .Lnot_limit - - /*The second part process*/ - ldr data1, [src1], #8 - ldr data2, [src2], #8 - eor diff, data1, data2 /* Non-zero if differences found. */ - subs limit_wd, limit_wd, #1 - csinv endloop, diff, xzr, ne/*if limit_wd is 0,will finish the cmp*/ - cbz endloop, .Lloopcmp_proc -.Lunequal_proc: - cbz diff, .Lremain8 - -/* There is difference occurred in the latest comparison. */ -.Lnot_limit: -/* -* For little endian,reverse the low significant equal bits into MSB,then -* following CLZ can find how many equal bits exist. -*/ -CPU_LE( rev diff, diff ) -CPU_LE( rev data1, data1 ) -CPU_LE( rev data2, data2 ) - - /* - * The MS-non-zero bit of DIFF marks either the first bit - * that is different, or the end of the significant data. - * Shifting left now will bring the critical information into the - * top bits. - */ - clz pos, diff - lsl data1, data1, pos - lsl data2, data2, pos - /* - * We need to zero-extend (char is unsigned) the value and then - * perform a signed subtraction. - */ - lsr data1, data1, #56 - sub result, data1, data2, lsr #56 - ret - -.Lremain8: - /* Limit % 8 == 0 =>. all data are equal.*/ - ands limit, limit, #7 - b.eq .Lret0 - -.Ltiny8proc: - ldrb data1w, [src1], #1 - ldrb data2w, [src2], #1 - subs limit, limit, #1 - - ccmp data1w, data2w, #0, ne /* NZCV = 0b0000. */ - b.eq .Ltiny8proc - sub result, data1, data2 - ret -.Lret0: - mov result, #0 - ret SYM_FUNC_END_PI(memcmp) EXPORT_SYMBOL_NOKASAN(memcmp)
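To summarise the structure of the new implementation shown in the diff above: inputs shorter than 8 bytes take a 4-byte then per-byte path, everything else is compared with unaligned 8- or 16-byte loads, and the tail is handled by re-reading the last 8 or 16 bytes rather than looping per byte. The C sketch below is illustrative only and not part of the patch; memcmp_sketch(), load64() and chunk_cmp() are hypothetical helpers, a little-endian build and GCC/Clang builtins are assumed, and the src1-alignment step the assembly applies to inputs larger than 128 bytes is omitted.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Unaligned 64-bit load; the assembly simply assumes hardware support
 * for unaligned accesses, memcpy() is the portable C spelling. */
static uint64_t load64(const unsigned char *p)
{
	uint64_t v;

	memcpy(&v, p, sizeof(v));
	return v;
}

/* Same byte-reverse trick as at L(return); assumes little-endian. */
static int chunk_cmp(uint64_t a, uint64_t b)
{
	if (a == b)
		return 0;
	a = __builtin_bswap64(a);
	b = __builtin_bswap64(b);
	return a < b ? -1 : 1;
}

static int memcmp_sketch(const void *s1, const void *s2, size_t n)
{
	const unsigned char *p1 = s1, *p2 = s2;
	size_t i;
	int r;

	if (n < 8) {		/* L(less8)/L(less4)/L(byte_loop), simplified
				 * here to a plain byte loop */
		while (n--) {
			if (*p1 != *p2)
				return *p1 - *p2;
			p1++;
			p2++;
		}
		return 0;
	}

	if (n < 16) {		/* first 8 bytes, then the (overlapping) last 8 */
		r = chunk_cmp(load64(p1), load64(p2));
		if (r)
			return r;
		return chunk_cmp(load64(p1 + n - 8), load64(p2 + n - 8));
	}

	/* Bulk loop, 16 bytes per iteration (L(loop16)). */
	for (i = 0; i + 16 <= n; i += 16) {
		r = chunk_cmp(load64(p1 + i), load64(p2 + i));
		if (r)
			return r;
		r = chunk_cmp(load64(p1 + i + 8), load64(p2 + i + 8));
		if (r)
			return r;
	}

	/* L(last_bytes): finish by re-reading the final 16 bytes, which may
	 * overlap data the loop already proved equal. */
	r = chunk_cmp(load64(p1 + n - 16), load64(p2 + n - 16));
	if (r)
		return r;
	return chunk_cmp(load64(p1 + n - 8), load64(p2 + n - 8));
}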