Message ID | 20210617152754.17960-2-mcroce@linux.microsoft.com (mailing list archive) |
---|---|
State | New, archived |
Series | riscv: optimized mem* functions |
Hi Matteo,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on linux/master]
[also build test WARNING on linus/master v5.13-rc6 next-20210617]
[cannot apply to atish-riscv-linux/topo_v3]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Matteo-Croce/riscv-optimized-mem-functions/20210617-232934
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git dd860052c99b1e088352bdd4fb7aef46f8d2ef47
config: riscv-randconfig-r031-20210618 (attached as .config)
compiler: clang version 13.0.0 (https://github.com/llvm/llvm-project 64720f57bea6a6bf033feef4a5751ab9c0c3b401)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install riscv cross compiling tool for clang build
        # apt-get install binutils-riscv64-linux-gnu
        # https://github.com/0day-ci/linux/commit/c35a3474b6782645ff0695db1f30be0e8123c131
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Matteo-Croce/riscv-optimized-mem-functions/20210617-232934
        git checkout c35a3474b6782645ff0695db1f30be0e8123c131
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=riscv

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> arch/riscv/lib/string.c:90:57: warning: attribute declaration must precede definition [-Wignored-attributes]
   void *memcpy(void *dest, const void *src, size_t count) __weak __alias(__memcpy);
                                                            ^
   include/linux/compiler_attributes.h:291:56: note: expanded from macro '__weak'
   #define __weak                          __attribute__((__weak__))
                                                          ^
   include/linux/fortify-string.h:178:24: note: previous definition is here
   __FORTIFY_INLINE void *memcpy(void *p, const void *q, __kernel_size_t size)
                          ^
   1 warning generated.

Kconfig warnings: (for reference only)
   WARNING: unmet direct dependencies detected for LOCKDEP
   Depends on DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT && (FRAME_POINTER || MIPS || PPC || S390 || MICROBLAZE || ARM || ARC || X86)
   Selected by
   - PROVE_LOCKING && DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT
   - LOCK_STAT && DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT
   - DEBUG_LOCK_ALLOC && DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT

vim +90 arch/riscv/lib/string.c

    89	
  > 90	void *memcpy(void *dest, const void *src, size_t count) __weak __alias(__memcpy);

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
On Thu, Jun 17, 2021 at 05:27:52PM +0200, Matteo Croce wrote:
> +extern void *memcpy(void *dest, const void *src, size_t count);
> +extern void *__memcpy(void *dest, const void *src, size_t count);

No need for externs.

> +++ b/arch/riscv/lib/string.c

Nothing in here looks RISC-V specific.  Why doesn't this go into lib/ so
that other architectures can use it as well?

> +#include <linux/module.h>

I think you only need export.h.

> +void *__memcpy(void *dest, const void *src, size_t count)
> +{
> +	const int bytes_long = BITS_PER_LONG / 8;
> +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> +	const int mask = bytes_long - 1;
> +	const int distance = (src - dest) & mask;
> +#endif
> +	union const_types s = { .u8 = src };
> +	union types d = { .u8 = dest };
> +
> +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> +	if (count < MIN_THRESHOLD)

Using IS_ENABLED we can avoid a lot of the mess in this function.

	int distance = 0;

	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
		if (count < MIN_THRESHOLD)
			goto copy_remainder;

		/* copy a byte at time until destination is aligned */
		for (; count && d.uptr & mask; count--)
			*d.u8++ = *s.u8++;
		distance = (src - dest) & mask;
	}

	if (distance) {
		...

> +	/* 32/64 bit wide copy from s to d.
> +	 * d is aligned now but s is not, so read s alignment wise,
> +	 * and do proper shift to get the right value.
> +	 * Works only on Little Endian machines.
> +	 */

Normal kernel comment style always starts with a:

	/*

> +	for (next = s.ulong[0]; count >= bytes_long + mask; count -= bytes_long) {

Please avoid the pointlessly overlong line.  And (just as a matter of
personal preference) I find for loops that don't actually use a single
iterator rather confusing.  Why not simply:

	next = s.ulong[0];
	while (count >= bytes_long + mask) {
		...
		count -= bytes_long;
	}
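Pulling Christoph's fragments together, the reworked entry of __memcpy() would look roughly like the sketch below. This is only an illustration of the IS_ENABLED() restructuring, reusing the patch's union member names and MIN_THRESHOLD definition; it is not the code that was eventually merged:

	void *__memcpy(void *dest, const void *src, size_t count)
	{
		union const_types s = { .u8 = src };
		union types d = { .u8 = dest };
		const int bytes_long = BITS_PER_LONG / 8;
		const int mask = bytes_long - 1;
		int distance = 0;

		if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
			if (count < MIN_THRESHOLD)
				goto copy_remainder;

			/* copy a byte at a time until the destination is aligned */
			for (; count && d.uptr & mask; count--)
				*d.u8++ = *s.u8++;

			distance = (src - dest) & mask;
		}

		if (distance) {
			/* misaligned word-wise copy, as in the original #ifndef branch */
		} else {
			/* source and destination share alignment: plain word-wise copy */
			while (count >= bytes_long) {
				*d.ulong++ = *s.ulong++;
				count -= bytes_long;
			}
		}

	copy_remainder:
		while (count--)
			*d.u8++ = *s.u8++;

		return dest;
	}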
Hello Matteo and thanks for the patch,

On 2021-06-17 18:27, Matteo Croce wrote:
>
> @@ -0,0 +1,91 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * String functions optimized for hardware which doesn't
> + * handle unaligned memory accesses efficiently.
> + *
> + * Copyright (C) 2021 Matteo Croce
> + */
> +
> +#include <linux/types.h>
> +#include <linux/module.h>
> +
> +/* Minimum size for a word copy to be convenient */
> +#define MIN_THRESHOLD (BITS_PER_LONG / 8 * 2)
> +
> +/* convenience union to avoid cast between different pointer types */
> +union types {
> +	u8 *u8;

You are using a type as a name, I'd go with as_bytes/as_ulong/as_uptr
which makes it easier for the reader to understand what you are trying
to do.

> +	unsigned long *ulong;
> +	uintptr_t uptr;
> +};
> +
> +union const_types {
> +	const u8 *u8;
> +	unsigned long *ulong;
> +};
> +

I suggest you define those unions inside the function body, no one else
is using them.

> +void *__memcpy(void *dest, const void *src, size_t count)
> +{
> +	const int bytes_long = BITS_PER_LONG / 8;
> +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> +	const int mask = bytes_long - 1;
> +	const int distance = (src - dest) & mask;

Why not unsigned ints?

> +#endif
> +	union const_types s = { .u8 = src };
> +	union types d = { .u8 = dest };
> +
> +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS

If you want to be compliant with memcpy you should check for overlapping
regions here since "The memory areas must not overlap", and do nothing
about it because according to POSIX this leads to undefined behavior.
That's why recent libc implementations use memmove in any case (memcpy
is an alias to memmove), which is the suggested approach.

> +	if (count < MIN_THRESHOLD)
> +		goto copy_remainder;
> +
> +	/* copy a byte at time until destination is aligned */
> +	for (; count && d.uptr & mask; count--)
> +		*d.u8++ = *s.u8++;
> +

You should check for !IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) here.

> +	if (distance) {
> +		unsigned long last, next;
> +
> +		/* move s backward to the previous alignment boundary */
> +		s.u8 -= distance;

It'd help here to explain that since s is distance bytes ahead relative
to d, and d reached the alignment boundary above, s is now aligned but
the data needs to be shifted to compensate for distance, in order to do
word-by-word copy.

> +
> +		/* 32/64 bit wide copy from s to d.
> +		 * d is aligned now but s is not, so read s alignment wise,
> +		 * and do proper shift to get the right value.
> +		 * Works only on Little Endian machines.
> +		 */

This comment is misleading because s is aligned, or else s.ulong[0]/[1]
below would result in an unaligned access.

> +		for (next = s.ulong[0]; count >= bytes_long + mask; count -= bytes_long) {
> +			last = next;
> +			next = s.ulong[1];
> +
> +			d.ulong[0] = last >> (distance * 8) |
> +				     next << ((bytes_long - distance) * 8);
> +
> +			d.ulong++;
> +			s.ulong++;
> +		}
> +
> +		/* restore s with the original offset */
> +		s.u8 += distance;
> +	} else
> +#endif
> +	{
> +		/* if the source and dest lower bits are the same, do a simple
> +		 * 32/64 bit wide copy.
> +		 */

A while() loop would make more sense here.

> +		for (; count >= bytes_long; count -= bytes_long)
> +			*d.ulong++ = *s.ulong++;
> +	}
> +
> +	/* suppress warning when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y */
> +	goto copy_remainder;
> +
> +copy_remainder:
> +	while (count--)
> +		*d.u8++ = *s.u8++;
> +
> +	return dest;
> +}
> +EXPORT_SYMBOL(__memcpy);
> +
> +void *memcpy(void *dest, const void *src, size_t count) __weak __alias(__memcpy);
> +EXPORT_SYMBOL(memcpy);
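For readers following the distance/shift discussion above, here is the patch's inner loop again with added comments and a concrete example of what the shifts achieve. Only the comments and the example values (64-bit words, distance = 3) are new; they are for illustration, not part of the posted code:

	/*
	 * Example: BITS_PER_LONG = 64, distance = 3.
	 * After "s.u8 -= distance", s is word-aligned but points 3 bytes
	 * before the data still to be copied.  On little endian the useful
	 * part of the current source word is its top 5 bytes, and the top
	 * 3 bytes of each destination word come from the low 3 bytes of
	 * the following source word.
	 */
	for (next = s.ulong[0]; count >= bytes_long + mask; count -= bytes_long) {
		last = next;
		next = s.ulong[1];

		/* drop the 'distance' stale low bytes of 'last', then fill
		 * the top of the destination word with the first bytes of 'next'
		 */
		d.ulong[0] = last >> (distance * 8) |
			     next << ((bytes_long - distance) * 8);

		d.ulong++;
		s.ulong++;
	}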
From: Christoph Hellwig
> Sent: 21 June 2021 15:27
...
> > +	for (next = s.ulong[0]; count >= bytes_long + mask; count -= bytes_long) {
>
> Please avoid the pointlessly overlong line.  And (just as a matter of
> personal preference) I find for loops that don't actually use a single
> iterator rather confusing.  Why not simply:
>
> 	next = s.ulong[0];
> 	while (count >= bytes_long + mask) {
> 		...
> 		count -= bytes_long;
> 	}

My first attack on long 'for' statements is just to move the
initialisation to the previous line.
Then make sure there is nothing in the comparison that needs
to be calculated every iteration.
I suspect you can subtract 'mask' from 'count'.
Giving:
	count -= mask;
	next = s.ulong[0];
	for (; count > bytes_long; count -= bytes_long) {

Next is to shorten the variable names!

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
On Mon, Jun 21, 2021 at 4:26 PM Christoph Hellwig <hch@infradead.org> wrote:
>
> On Thu, Jun 17, 2021 at 05:27:52PM +0200, Matteo Croce wrote:
> > +extern void *memcpy(void *dest, const void *src, size_t count);
> > +extern void *__memcpy(void *dest, const void *src, size_t count);
>
> No need for externs.
>

Right.

> > +++ b/arch/riscv/lib/string.c
>
> Nothing in here looks RISC-V specific.  Why doesn't this go into lib/ so
> that other architectures can use it as well?
>

Technically it could go into lib/ and be generic.
If you think it's worth it, I just have to handle the different
left/right shift because of endianness.

> > +#include <linux/module.h>
>
> I think you only need export.h.
>

Nice.

> > +void *__memcpy(void *dest, const void *src, size_t count)
> > +{
> > +	const int bytes_long = BITS_PER_LONG / 8;
> > +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> > +	const int mask = bytes_long - 1;
> > +	const int distance = (src - dest) & mask;
> > +#endif
> > +	union const_types s = { .u8 = src };
> > +	union types d = { .u8 = dest };
> > +
> > +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> > +	if (count < MIN_THRESHOLD)
>
> Using IS_ENABLED we can avoid a lot of the mess in this
> function.
>
> 	int distance = 0;
>
> 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
> 		if (count < MIN_THRESHOLD)
> 			goto copy_remainder;
>
> 		/* copy a byte at time until destination is aligned */
> 		for (; count && d.uptr & mask; count--)
> 			*d.u8++ = *s.u8++;
> 		distance = (src - dest) & mask;
> 	}
>

Cool.

What about putting this check at the very start:

	if (count < MIN_THRESHOLD)
		goto copy_remainder;

And since count is at least twice bytes_long, remove count from the
check below?

Also, setting distance after d is aligned is as simple as getting the
lower bits of s:

	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
		/* Copy a byte at time until destination is aligned. */
		for (; d.uptr & mask; count--)
			*d.u8++ = *s.u8++;

		distance = s.uptr & mask;
	}

> 	if (distance) {
> 		...
>
> > +	/* 32/64 bit wide copy from s to d.
> > +	 * d is aligned now but s is not, so read s alignment wise,
> > +	 * and do proper shift to get the right value.
> > +	 * Works only on Little Endian machines.
> > +	 */
>
> Normal kernel comment style always starts with a:
>

Right, I was used to netdev ones :)

> 	/*
>
> > +	for (next = s.ulong[0]; count >= bytes_long + mask; count -= bytes_long) {
>
> Please avoid the pointlessly overlong line.  And (just as a matter of
> personal preference) I find for loops that don't actually use a single
> iterator rather confusing.  Why not simply:
>
> 	next = s.ulong[0];
> 	while (count >= bytes_long + mask) {
> 		...
> 		count -= bytes_long;
> 	}

My fault, in a previous version it was:

	next = s.ulong[0];
	for (; count >= bytes_long + mask; count -= bytes_long) {

So as to have a single `count` counter for the loop.

Regards,
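On the endianness point Matteo raises: a big-endian variant would mirror the shifts in the recombination step, roughly as below. This is an assumption about what a generic lib/ version could look like, not code from this series:

	if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
		/* byte 0 is the most significant one: shift the other way */
		d.ulong[0] = last << (distance * 8) |
			     next >> ((bytes_long - distance) * 8);
	else
		d.ulong[0] = last >> (distance * 8) |
			     next << ((bytes_long - distance) * 8);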
On Tue, Jun 22, 2021 at 10:19 AM David Laight <David.Laight@aculab.com> wrote:
>
> From: Christoph Hellwig
> > Sent: 21 June 2021 15:27
> ...
> > > +	for (next = s.ulong[0]; count >= bytes_long + mask; count -= bytes_long) {
> >
> > Please avoid the pointlessly overlong line.  And (just as a matter of
> > personal preference) I find for loops that don't actually use a single
> > iterator rather confusing.  Why not simply:
> >
> > 	next = s.ulong[0];
> > 	while (count >= bytes_long + mask) {
> > 		...
> > 		count -= bytes_long;
> > 	}
>
> My first attack on long 'for' statements is just to move the
> initialisation to the previous line.
> Then make sure there is nothing in the comparison that needs
> to be calculated every iteration.
> I suspect you can subtract 'mask' from 'count'.
> Giving:
> 	count -= mask;
> 	next = s.ulong[0];
> 	for (; count > bytes_long; count -= bytes_long) {
>

This way we'll lose the remainder, as count is used at the end to copy
the leftover.
Anyway, both bytes_long and mask are constant, I doubt they get summed
at every cycle.

> Next is to shorten the variable names!
>
> David
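If the only worry is recomputing the loop bound, a middle ground is to hoist it into a local while leaving count untouched for the remainder copy. This is just a sketch of that alternative, not code from the thread; min_left is a name introduced here for illustration:

	const size_t min_left = bytes_long + mask;

	next = s.ulong[0];
	while (count >= min_left) {
		...
		count -= bytes_long;
	}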
On Tue, Jun 22, 2021 at 2:15 AM Nick Kossifidis <mick@ics.forth.gr> wrote:
>
> Hello Matteo and thanks for the patch,
>
> On 2021-06-17 18:27, Matteo Croce wrote:
> >
> > @@ -0,0 +1,91 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * String functions optimized for hardware which doesn't
> > + * handle unaligned memory accesses efficiently.
> > + *
> > + * Copyright (C) 2021 Matteo Croce
> > + */
> > +
> > +#include <linux/types.h>
> > +#include <linux/module.h>
> > +
> > +/* Minimum size for a word copy to be convenient */
> > +#define MIN_THRESHOLD (BITS_PER_LONG / 8 * 2)
> > +
> > +/* convenience union to avoid cast between different pointer types */
> > +union types {
> > +	u8 *u8;
>
> You are using a type as a name, I'd go with as_bytes/as_ulong/as_uptr
> which makes it easier for the reader to understand what you are trying
> to do.
>

Makes sense.

> > +	unsigned long *ulong;
> > +	uintptr_t uptr;
> > +};
> > +
> > +union const_types {
> > +	const u8 *u8;
> > +	unsigned long *ulong;
> > +};
> > +
>
> I suggest you define those unions inside the function body, no one else
> is using them.
>

They will be used in memset(), in patch 3/3.

> > +void *__memcpy(void *dest, const void *src, size_t count)
> > +{
> > +	const int bytes_long = BITS_PER_LONG / 8;
> > +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> > +	const int mask = bytes_long - 1;
> > +	const int distance = (src - dest) & mask;
>
> Why not unsigned ints?
>

Ok.

> > +#endif
> > +	union const_types s = { .u8 = src };
> > +	union types d = { .u8 = dest };
> > +
> > +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
>
> If you want to be compliant with memcpy you should check for overlapping
> regions here since "The memory areas must not overlap", and do nothing
> about it because according to POSIX this leads to undefined behavior.
> That's why recent libc implementations use memmove in any case (memcpy
> is an alias to memmove), which is the suggested approach.
>

Mmm, which memcpy arch implementation does this check?
I guess that no one is currently doing it.

> > +	if (count < MIN_THRESHOLD)
> > +		goto copy_remainder;
> > +
> > +	/* copy a byte at time until destination is aligned */
> > +	for (; count && d.uptr & mask; count--)
> > +		*d.u8++ = *s.u8++;
> > +
>
> You should check for !IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) here.
>

I thought that only Little Endian RISC-V machines were supported in Linux.
Should I add a BUILD_BUG_ON()?
Anyway, if this is going in generic lib/, I'll take care of the endianness.

> > +	if (distance) {
> > +		unsigned long last, next;
> > +
> > +		/* move s backward to the previous alignment boundary */
> > +		s.u8 -= distance;
>
> It'd help here to explain that since s is distance bytes ahead relative
> to d, and d reached the alignment boundary above, s is now aligned but
> the data needs to be shifted to compensate for distance, in order to do
> word-by-word copy.
>
> > +
> > +		/* 32/64 bit wide copy from s to d.
> > +		 * d is aligned now but s is not, so read s alignment wise,
> > +		 * and do proper shift to get the right value.
> > +		 * Works only on Little Endian machines.
> > +		 */
>
> This comment is misleading because s is aligned, or else s.ulong[0]/[1]
> below would result in an unaligned access.
>

Yes, those two comments should be rephrased, merged and put above.

> > +		for (next = s.ulong[0]; count >= bytes_long + mask; count -= bytes_long) {
> > +			last = next;
> > +			next = s.ulong[1];
> > +
> > +			d.ulong[0] = last >> (distance * 8) |
> > +				     next << ((bytes_long - distance) * 8);
> > +
> > +			d.ulong++;
> > +			s.ulong++;
> > +		}
> > +
> > +		/* restore s with the original offset */
> > +		s.u8 += distance;
> > +	} else
> > +#endif
> > +	{
> > +		/* if the source and dest lower bits are the same, do a simple
> > +		 * 32/64 bit wide copy.
> > +		 */
>
> A while() loop would make more sense here.
>

Ok.

> > +		for (; count >= bytes_long; count -= bytes_long)
> > +			*d.ulong++ = *s.ulong++;
> > +	}
> > +
> > +	/* suppress warning when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y */
> > +	goto copy_remainder;
> > +
> > +copy_remainder:
> > +	while (count--)
> > +		*d.u8++ = *s.u8++;
> > +
> > +	return dest;
> > +}
> > +EXPORT_SYMBOL(__memcpy);
> > +
> > +void *memcpy(void *dest, const void *src, size_t count) __weak __alias(__memcpy);
> > +EXPORT_SYMBOL(memcpy);

Regards,
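The BUILD_BUG_ON() Matteo mentions could be as small as the following line at the top of __memcpy(), so a big-endian build fails at compile time instead of silently producing corrupted copies. This is an assumption for illustration only, not part of the posted patch:

	/* the shift-based copy assumes little endian; guard added here for illustration */
	BUILD_BUG_ON(IS_ENABLED(CONFIG_CPU_BIG_ENDIAN));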
On 2021-06-23 02:35, Matteo Croce wrote:
>>
>> If you want to be compliant with memcpy you should check for overlapping
>> regions here since "The memory areas must not overlap", and do nothing
>> about it because according to POSIX this leads to undefined behavior.
>> That's why recent libc implementations use memmove in any case (memcpy
>> is an alias to memmove), which is the suggested approach.
>>
>
> Mmm, which memcpy arch implementation does this check?
> I guess that no one is currently doing it.
>

Yup, because even if they did they wouldn't know what to do about it,
since POSIX leaves this as undefined behavior. So instead of using
memcpy it's suggested that people use memmove, which can handle
overlapping regions, and because we can't patch the rest of the kernel
to only use memmove (or the rest of the programs if we were a libc),
the idea is to just alias memcpy to memmove (BTW Torvalds also thinks
this is a good idea:
https://bugzilla.redhat.com/show_bug.cgi?id=638477#c132).
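A minimal sketch of what Nick describes — an overlap-safe memmove with memcpy as a weak alias to it — could look like this. It is illustrative only; it is neither the code from this series nor the kernel's generic implementation:

	void *__memmove(void *dest, const void *src, size_t count)
	{
		u8 *d = dest;
		const u8 *s = src;

		if (d < s || d >= s + count) {
			/* no harmful overlap: copy forward */
			while (count--)
				*d++ = *s++;
		} else {
			/* dest overlaps the tail of src: copy backward */
			d += count;
			s += count;
			while (count--)
				*--d = *--s;
		}

		return dest;
	}

	/* overlapping callers then get defined behaviour through memcpy as well */
	void *memcpy(void *dest, const void *src, size_t count) __weak __alias(__memmove);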
diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h
index 909049366555..6b5d6fc3eab4 100644
--- a/arch/riscv/include/asm/string.h
+++ b/arch/riscv/include/asm/string.h
@@ -12,9 +12,13 @@
 #define __HAVE_ARCH_MEMSET
 extern asmlinkage void *memset(void *, int, size_t);
 extern asmlinkage void *__memset(void *, int, size_t);
+
+#ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE
 #define __HAVE_ARCH_MEMCPY
-extern asmlinkage void *memcpy(void *, const void *, size_t);
-extern asmlinkage void *__memcpy(void *, const void *, size_t);
+extern void *memcpy(void *dest, const void *src, size_t count);
+extern void *__memcpy(void *dest, const void *src, size_t count);
+#endif
+
 #define __HAVE_ARCH_MEMMOVE
 extern asmlinkage void *memmove(void *, const void *, size_t);
 extern asmlinkage void *__memmove(void *, const void *, size_t);
diff --git a/arch/riscv/kernel/riscv_ksyms.c b/arch/riscv/kernel/riscv_ksyms.c
index 5ab1c7e1a6ed..3f6d512a5b97 100644
--- a/arch/riscv/kernel/riscv_ksyms.c
+++ b/arch/riscv/kernel/riscv_ksyms.c
@@ -10,8 +10,6 @@
  * Assembly functions that may be used (directly or indirectly) by modules
  */
 EXPORT_SYMBOL(memset);
-EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(memmove);
 EXPORT_SYMBOL(__memset);
-EXPORT_SYMBOL(__memcpy);
 EXPORT_SYMBOL(__memmove);
diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
index 25d5c9664e57..2ffe85d4baee 100644
--- a/arch/riscv/lib/Makefile
+++ b/arch/riscv/lib/Makefile
@@ -1,9 +1,9 @@
 # SPDX-License-Identifier: GPL-2.0-only
 lib-y			+= delay.o
-lib-y			+= memcpy.o
 lib-y			+= memset.o
 lib-y			+= memmove.o
 lib-$(CONFIG_MMU)	+= uaccess.o
 lib-$(CONFIG_64BIT)	+= tishift.o
 
+lib-$(CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE) += string.o
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
diff --git a/arch/riscv/lib/memcpy.S b/arch/riscv/lib/memcpy.S
deleted file mode 100644
index 51ab716253fa..000000000000
--- a/arch/riscv/lib/memcpy.S
+++ /dev/null
@@ -1,108 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2013 Regents of the University of California
- */
-
-#include <linux/linkage.h>
-#include <asm/asm.h>
-
-/* void *memcpy(void *, const void *, size_t) */
-ENTRY(__memcpy)
-WEAK(memcpy)
-	move t6, a0  /* Preserve return value */
-
-	/* Defer to byte-oriented copy for small sizes */
-	sltiu a3, a2, 128
-	bnez a3, 4f
-	/* Use word-oriented copy only if low-order bits match */
-	andi a3, t6, SZREG-1
-	andi a4, a1, SZREG-1
-	bne a3, a4, 4f
-
-	beqz a3, 2f  /* Skip if already aligned */
-	/*
-	 * Round to nearest double word-aligned address
-	 * greater than or equal to start address
-	 */
-	andi a3, a1, ~(SZREG-1)
-	addi a3, a3, SZREG
-	/* Handle initial misalignment */
-	sub a4, a3, a1
-1:
-	lb a5, 0(a1)
-	addi a1, a1, 1
-	sb a5, 0(t6)
-	addi t6, t6, 1
-	bltu a1, a3, 1b
-	sub a2, a2, a4  /* Update count */
-
-2:
-	andi a4, a2, ~((16*SZREG)-1)
-	beqz a4, 4f
-	add a3, a1, a4
-3:
-	REG_L a4,       0(a1)
-	REG_L a5,   SZREG(a1)
-	REG_L a6, 2*SZREG(a1)
-	REG_L a7, 3*SZREG(a1)
-	REG_L t0, 4*SZREG(a1)
-	REG_L t1, 5*SZREG(a1)
-	REG_L t2, 6*SZREG(a1)
-	REG_L t3, 7*SZREG(a1)
-	REG_L t4, 8*SZREG(a1)
-	REG_L t5, 9*SZREG(a1)
-	REG_S a4,       0(t6)
-	REG_S a5,   SZREG(t6)
-	REG_S a6, 2*SZREG(t6)
-	REG_S a7, 3*SZREG(t6)
-	REG_S t0, 4*SZREG(t6)
-	REG_S t1, 5*SZREG(t6)
-	REG_S t2, 6*SZREG(t6)
-	REG_S t3, 7*SZREG(t6)
-	REG_S t4, 8*SZREG(t6)
-	REG_S t5, 9*SZREG(t6)
-	REG_L a4, 10*SZREG(a1)
-	REG_L a5, 11*SZREG(a1)
-	REG_L a6, 12*SZREG(a1)
-	REG_L a7, 13*SZREG(a1)
-	REG_L t0, 14*SZREG(a1)
-	REG_L t1, 15*SZREG(a1)
-	addi a1, a1, 16*SZREG
-	REG_S a4, 10*SZREG(t6)
-	REG_S a5, 11*SZREG(t6)
-	REG_S a6, 12*SZREG(t6)
-	REG_S a7, 13*SZREG(t6)
-	REG_S t0, 14*SZREG(t6)
-	REG_S t1, 15*SZREG(t6)
-	addi t6, t6, 16*SZREG
-	bltu a1, a3, 3b
-	andi a2, a2, (16*SZREG)-1  /* Update count */
-
-4:
-	/* Handle trailing misalignment */
-	beqz a2, 6f
-	add a3, a1, a2
-
-	/* Use word-oriented copy if co-aligned to word boundary */
-	or a5, a1, t6
-	or a5, a5, a3
-	andi a5, a5, 3
-	bnez a5, 5f
-7:
-	lw a4, 0(a1)
-	addi a1, a1, 4
-	sw a4, 0(t6)
-	addi t6, t6, 4
-	bltu a1, a3, 7b
-
-	ret
-
-5:
-	lb a4, 0(a1)
-	addi a1, a1, 1
-	sb a4, 0(t6)
-	addi t6, t6, 1
-	bltu a1, a3, 5b
-6:
-	ret
-END(__memcpy)
diff --git a/arch/riscv/lib/string.c b/arch/riscv/lib/string.c
new file mode 100644
index 000000000000..e48a79a045d8
--- /dev/null
+++ b/arch/riscv/lib/string.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * String functions optimized for hardware which doesn't
+ * handle unaligned memory accesses efficiently.
+ *
+ * Copyright (C) 2021 Matteo Croce
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+
+/* Minimum size for a word copy to be convenient */
+#define MIN_THRESHOLD (BITS_PER_LONG / 8 * 2)
+
+/* convenience union to avoid cast between different pointer types */
+union types {
+	u8 *u8;
+	unsigned long *ulong;
+	uintptr_t uptr;
+};
+
+union const_types {
+	const u8 *u8;
+	unsigned long *ulong;
+};
+
+void *__memcpy(void *dest, const void *src, size_t count)
+{
+	const int bytes_long = BITS_PER_LONG / 8;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	const int mask = bytes_long - 1;
+	const int distance = (src - dest) & mask;
+#endif
+	union const_types s = { .u8 = src };
+	union types d = { .u8 = dest };
+
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	if (count < MIN_THRESHOLD)
+		goto copy_remainder;
+
+	/* copy a byte at time until destination is aligned */
+	for (; count && d.uptr & mask; count--)
+		*d.u8++ = *s.u8++;
+
+	if (distance) {
+		unsigned long last, next;
+
+		/* move s backward to the previous alignment boundary */
+		s.u8 -= distance;
+
+		/* 32/64 bit wide copy from s to d.
+		 * d is aligned now but s is not, so read s alignment wise,
+		 * and do proper shift to get the right value.
+		 * Works only on Little Endian machines.
+		 */
+		for (next = s.ulong[0]; count >= bytes_long + mask; count -= bytes_long) {
+			last = next;
+			next = s.ulong[1];
+
+			d.ulong[0] = last >> (distance * 8) |
+				     next << ((bytes_long - distance) * 8);
+
+			d.ulong++;
+			s.ulong++;
+		}
+
+		/* restore s with the original offset */
+		s.u8 += distance;
+	} else
+#endif
+	{
+		/* if the source and dest lower bits are the same, do a simple
+		 * 32/64 bit wide copy.
+		 */
+		for (; count >= bytes_long; count -= bytes_long)
+			*d.ulong++ = *s.ulong++;
+	}
+
+	/* suppress warning when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y */
+	goto copy_remainder;
+
+copy_remainder:
+	while (count--)
+		*d.u8++ = *s.u8++;
+
+	return dest;
+}
+EXPORT_SYMBOL(__memcpy);
+
+void *memcpy(void *dest, const void *src, size_t count) __weak __alias(__memcpy);
+EXPORT_SYMBOL(memcpy);