
[v3,1/3] riscv: optimized memcpy

Message ID 20210617152754.17960-2-mcroce@linux.microsoft.com (mailing list archive)
State New, archived
Series riscv: optimized mem* functions

Commit Message

Matteo Croce June 17, 2021, 3:27 p.m. UTC
From: Matteo Croce <mcroce@microsoft.com>

Write a C version of memcpy() which uses the biggest data size allowed,
without generating unaligned accesses.

The procedure is made of three steps:
First, copy data one byte at a time until the destination buffer is
aligned to a long boundary.
Then copy the data one long at a time, shifting the current and the next
source word to compose a long at every cycle.
Finally, copy the remainder one byte at a time.
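
The middle step, condensed here from the patch below (little-endian only),
combines two consecutive aligned source words to build each aligned
destination word:

	/*
	 * d is long-aligned and s has been moved back to a long boundary,
	 * so both reads are aligned; the low 'distance' bytes of 'last'
	 * were already copied by the byte loop.
	 */
	last = next;
	next = s.ulong[1];
	d.ulong[0] = last >> (distance * 8) |
		     next << ((bytes_long - distance) * 8);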

On a BeagleV, the TCP RX throughput increased by 45%:

before:

$ iperf3 -c beaglev
Connecting to host beaglev, port 5201
[  5] local 192.168.85.6 port 44840 connected to 192.168.85.48 port 5201
[ ID] Interval           Transfer     Bitrate         Retr  Cwnd
[  5]   0.00-1.00   sec  76.4 MBytes   641 Mbits/sec   27    624 KBytes
[  5]   1.00-2.00   sec  72.5 MBytes   608 Mbits/sec    0    708 KBytes
[  5]   2.00-3.00   sec  73.8 MBytes   619 Mbits/sec   10    451 KBytes
[  5]   3.00-4.00   sec  72.5 MBytes   608 Mbits/sec    0    564 KBytes
[  5]   4.00-5.00   sec  73.8 MBytes   619 Mbits/sec    0    658 KBytes
[  5]   5.00-6.00   sec  73.8 MBytes   619 Mbits/sec   14    522 KBytes
[  5]   6.00-7.00   sec  73.8 MBytes   619 Mbits/sec    0    621 KBytes
[  5]   7.00-8.00   sec  72.5 MBytes   608 Mbits/sec    0    706 KBytes
[  5]   8.00-9.00   sec  73.8 MBytes   619 Mbits/sec   20    580 KBytes
[  5]   9.00-10.00  sec  73.8 MBytes   619 Mbits/sec    0    672 KBytes
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval           Transfer     Bitrate         Retr
[  5]   0.00-10.00  sec   736 MBytes   618 Mbits/sec   71             sender
[  5]   0.00-10.01  sec   733 MBytes   615 Mbits/sec                  receiver

after:

$ iperf3 -c beaglev
Connecting to host beaglev, port 5201
[  5] local 192.168.85.6 port 44864 connected to 192.168.85.48 port 5201
[ ID] Interval           Transfer     Bitrate         Retr  Cwnd
[  5]   0.00-1.00   sec   109 MBytes   912 Mbits/sec   48    559 KBytes
[  5]   1.00-2.00   sec   108 MBytes   902 Mbits/sec    0    690 KBytes
[  5]   2.00-3.00   sec   106 MBytes   891 Mbits/sec   36    396 KBytes
[  5]   3.00-4.00   sec   108 MBytes   902 Mbits/sec    0    567 KBytes
[  5]   4.00-5.00   sec   106 MBytes   891 Mbits/sec    0    699 KBytes
[  5]   5.00-6.00   sec   106 MBytes   891 Mbits/sec   32    414 KBytes
[  5]   6.00-7.00   sec   106 MBytes   891 Mbits/sec    0    583 KBytes
[  5]   7.00-8.00   sec   106 MBytes   891 Mbits/sec    0    708 KBytes
[  5]   8.00-9.00   sec   106 MBytes   891 Mbits/sec   28    433 KBytes
[  5]   9.00-10.00  sec   108 MBytes   902 Mbits/sec    0    591 KBytes
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval           Transfer     Bitrate         Retr
[  5]   0.00-10.00  sec  1.04 GBytes   897 Mbits/sec  144             sender
[  5]   0.00-10.01  sec  1.04 GBytes   894 Mbits/sec                  receiver

The reduced CPU time spent in memcpy() is also visible with perf top.
This is the `perf top -Ue task-clock` output while running the test:

before:

Overhead  Shared O  Symbol
  42.22%  [kernel]  [k] memcpy
  35.00%  [kernel]  [k] __asm_copy_to_user
   3.50%  [kernel]  [k] sifive_l2_flush64_range
   2.30%  [kernel]  [k] stmmac_napi_poll_rx
   1.11%  [kernel]  [k] memset

after:

Overhead  Shared O  Symbol
  45.69%  [kernel]  [k] __asm_copy_to_user
  29.06%  [kernel]  [k] memcpy
   4.09%  [kernel]  [k] sifive_l2_flush64_range
   2.77%  [kernel]  [k] stmmac_napi_poll_rx
   1.24%  [kernel]  [k] memset

Signed-off-by: Matteo Croce <mcroce@microsoft.com>
---
 arch/riscv/include/asm/string.h |   8 ++-
 arch/riscv/kernel/riscv_ksyms.c |   2 -
 arch/riscv/lib/Makefile         |   2 +-
 arch/riscv/lib/memcpy.S         | 108 --------------------------------
 arch/riscv/lib/string.c         |  91 +++++++++++++++++++++++++++
 5 files changed, 98 insertions(+), 113 deletions(-)
 delete mode 100644 arch/riscv/lib/memcpy.S
 create mode 100644 arch/riscv/lib/string.c

Comments

kernel test robot June 18, 2021, 2:06 p.m. UTC | #1
Hi Matteo,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on linux/master]
[also build test WARNING on linus/master v5.13-rc6 next-20210617]
[cannot apply to atish-riscv-linux/topo_v3]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Matteo-Croce/riscv-optimized-mem-functions/20210617-232934
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git dd860052c99b1e088352bdd4fb7aef46f8d2ef47
config: riscv-randconfig-r031-20210618 (attached as .config)
compiler: clang version 13.0.0 (https://github.com/llvm/llvm-project 64720f57bea6a6bf033feef4a5751ab9c0c3b401)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install riscv cross compiling tool for clang build
        # apt-get install binutils-riscv64-linux-gnu
        # https://github.com/0day-ci/linux/commit/c35a3474b6782645ff0695db1f30be0e8123c131
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Matteo-Croce/riscv-optimized-mem-functions/20210617-232934
        git checkout c35a3474b6782645ff0695db1f30be0e8123c131
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=riscv 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> arch/riscv/lib/string.c:90:57: warning: attribute declaration must precede definition [-Wignored-attributes]
   void *memcpy(void *dest, const void *src, size_t count) __weak __alias(__memcpy);
                                                           ^
   include/linux/compiler_attributes.h:291:56: note: expanded from macro '__weak'
   #define __weak                          __attribute__((__weak__))
                                                          ^
   include/linux/fortify-string.h:178:24: note: previous definition is here
   __FORTIFY_INLINE void *memcpy(void *p, const void *q, __kernel_size_t size)
                          ^
   1 warning generated.

Kconfig warnings: (for reference only)
   WARNING: unmet direct dependencies detected for LOCKDEP
   Depends on DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT && (FRAME_POINTER || MIPS || PPC || S390 || MICROBLAZE || ARM || ARC || X86)
   Selected by
   - PROVE_LOCKING && DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT
   - LOCK_STAT && DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT
   - DEBUG_LOCK_ALLOC && DEBUG_KERNEL && LOCK_DEBUGGING_SUPPORT


vim +90 arch/riscv/lib/string.c

    89	
  > 90	void *memcpy(void *dest, const void *src, size_t count) __weak __alias(__memcpy);

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Christoph Hellwig June 21, 2021, 2:26 p.m. UTC | #2
On Thu, Jun 17, 2021 at 05:27:52PM +0200, Matteo Croce wrote:
> +extern void *memcpy(void *dest, const void *src, size_t count);
> +extern void *__memcpy(void *dest, const void *src, size_t count);

No need for externs.

> +++ b/arch/riscv/lib/string.c

Nothing in here looks RISC-V specific.  Why doesn't this go into lib/ so
that other architectures can use it as well?

> +#include <linux/module.h>

I think you only need export.h.

> +void *__memcpy(void *dest, const void *src, size_t count)
> +{
> +	const int bytes_long = BITS_PER_LONG / 8;
> +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> +	const int mask = bytes_long - 1;
> +	const int distance = (src - dest) & mask;
> +#endif
> +	union const_types s = { .u8 = src };
> +	union types d = { .u8 = dest };
> +
> +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> +	if (count < MIN_THRESHOLD)

Using IS_ENABLED we can avoid a lot of the mess in this
function.

	int distance = 0;

	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
		if (count < MIN_THRESHOLD)
			goto copy_remainder;

		/* copy a byte at time until destination is aligned */
		for (; count && d.uptr & mask; count--)
			*d.u8++ = *s.u8++;
		distance = (src - dest) & mask;
	}

	if (distance) {
		...

> +		/* 32/64 bit wide copy from s to d.
> +		 * d is aligned now but s is not, so read s alignment wise,
> +		 * and do proper shift to get the right value.
> +		 * Works only on Little Endian machines.
> +		 */

Normal kernel comment style always starts with a:

		/*


> +		for (next = s.ulong[0]; count >= bytes_long + mask; count -= bytes_long) {

Please avoid the pointlessly overlong line.  And (just as a matter of
personal preference) I find for loop that don't actually use a single
iterator rather confusing.  Why not simply:

		next = s.ulong[0];
		while (count >= bytes_long + mask) {
			...
			count -= bytes_long;
		}
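
For completeness, folding the patch's shift logic into that structure would
look roughly like this (a sketch only, little-endian assumed):

		next = s.ulong[0];
		while (count >= bytes_long + mask) {
			last = next;
			next = s.ulong[1];

			d.ulong[0] = last >> (distance * 8) |
				     next << ((bytes_long - distance) * 8);

			d.ulong++;
			s.ulong++;
			count -= bytes_long;
		}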
Nick Kossifidis June 22, 2021, 12:14 a.m. UTC | #3
Hello Matteo and thanks for the patch,

Στις 2021-06-17 18:27, Matteo Croce έγραψε:
> 
> @@ -0,0 +1,91 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * String functions optimized for hardware which doesn't
> + * handle unaligned memory accesses efficiently.
> + *
> + * Copyright (C) 2021 Matteo Croce
> + */
> +
> +#include <linux/types.h>
> +#include <linux/module.h>
> +
> +/* Minimum size for a word copy to be convenient */
> +#define MIN_THRESHOLD (BITS_PER_LONG / 8 * 2)
> +
> +/* convenience union to avoid cast between different pointer types */
> +union types {
> +	u8 *u8;

You are using a type as a name, I'd go with as_bytes/as_ulong/as_uptr 
which makes it easier for the reader to understand what you are trying 
to do.

> +	unsigned long *ulong;
> +	uintptr_t uptr;
> +};
> +
> +union const_types {
> +	const u8 *u8;
> +	unsigned long *ulong;
> +};
> +

I suggest you define those unions inside the function body, no one else 
is using them.

> +void *__memcpy(void *dest, const void *src, size_t count)
> +{
> +	const int bytes_long = BITS_PER_LONG / 8;
> +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> +	const int mask = bytes_long - 1;
> +	const int distance = (src - dest) & mask;

Why not unsigned ints ?

> +#endif
> +	union const_types s = { .u8 = src };
> +	union types d = { .u8 = dest };
> +
> +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS

If you want to be compliant with memcpy you should check for overlapping 
regions here since "The memory areas must not overlap", and do nothing 
about it because according to POSIX this leads to undefined behavior. 
That's why recent libc implementations use memmove in any case (memcpy 
is an alias to memmove), which is the suggested approach.

> +	if (count < MIN_THRESHOLD)
> +		goto copy_remainder;
> +
> +	/* copy a byte at time until destination is aligned */
> +	for (; count && d.uptr & mask; count--)
> +		*d.u8++ = *s.u8++;
> +

You should check for !IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) here.

> +	if (distance) {
> +		unsigned long last, next;
> +
> +		/* move s backward to the previous alignment boundary */
> +		s.u8 -= distance;

It'd help here to explain that since s is distance bytes ahead relative 
to d, and d reached the alignment boundary above, s is now aligned but 
the data needs to be shifted to compensate for distance, in order to do 
word-by-word copy.
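
For example, with BITS_PER_LONG == 64 and distance == 3, both s.ulong[0] and
s.ulong[1] are aligned reads after that adjustment, and each destination word
is assembled as last >> 24 | next << 40.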

> +
> +		/* 32/64 bit wide copy from s to d.
> +		 * d is aligned now but s is not, so read s alignment wise,
> +		 * and do proper shift to get the right value.
> +		 * Works only on Little Endian machines.
> +		 */

This comment is misleading because s is aligned, or else s.ulong[0]/[1]
below would result in an unaligned access.

> +		for (next = s.ulong[0]; count >= bytes_long + mask; count -= 
> bytes_long) {
> +			last = next;
> +			next = s.ulong[1];
> +
> +			d.ulong[0] = last >> (distance * 8) |
> +				     next << ((bytes_long - distance) * 8);
> +
> +			d.ulong++;
> +			s.ulong++;
> +		}
> +
> +		/* restore s with the original offset */
> +		s.u8 += distance;
> +	} else
> +#endif
> +	{
> +		/* if the source and dest lower bits are the same, do a simple
> +		 * 32/64 bit wide copy.
> +		 */

A while() loop would make more sense here.

> +		for (; count >= bytes_long; count -= bytes_long)
> +			*d.ulong++ = *s.ulong++;
> +	}
> +
> +	/* suppress warning when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y */
> +	goto copy_remainder;
> +
> +copy_remainder:
> +	while (count--)
> +		*d.u8++ = *s.u8++;
> +
> +	return dest;
> +}
> +EXPORT_SYMBOL(__memcpy);
> +
> +void *memcpy(void *dest, const void *src, size_t count) __weak
> __alias(__memcpy);
> +EXPORT_SYMBOL(memcpy);
David Laight June 22, 2021, 8:19 a.m. UTC | #4
From: Christoph Hellwig
> Sent: 21 June 2021 15:27
...
> > +		for (next = s.ulong[0]; count >= bytes_long + mask; count -= bytes_long) {
> 
> Please avoid the pointlessly overlong line.  And (just as a matter of
> personal preference) I find for loop that don't actually use a single
> iterator rather confusing.  Why not simply:
> 
> 		next = s.ulong[0];
> 		while (count >= bytes_long + mask) {
> 			...
> 			count -= bytes_long;
> 		}

My first attack on long 'for' statements is just to move the
initialisation to the previous line.
Then make sure there is nothing in the comparison that needs
to be calculated every iteration.
I suspect you can subtract 'mask' from 'count'.
Giving:
		count -= mask;
		next = s.ulong[0];
		for (; count > bytes_long; count -= bytes_long) {

Next is to shorten the variable names!

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Matteo Croce June 22, 2021, 10 p.m. UTC | #5
On Mon, Jun 21, 2021 at 4:26 PM Christoph Hellwig <hch@infradead.org> wrote:
>
> On Thu, Jun 17, 2021 at 05:27:52PM +0200, Matteo Croce wrote:
> > +extern void *memcpy(void *dest, const void *src, size_t count);
> > +extern void *__memcpy(void *dest, const void *src, size_t count);
>
> No need for externs.
>

Right.

> > +++ b/arch/riscv/lib/string.c
>
> Nothing in here looks RISC-V specific.  Why doesn't this go into lib/ so
> that other architectures can use it as well?
>

Technically it could go into lib/ and be generic.
If you think it's worth it, I have just to handle the different
left/right shift because of endianness.

> > +#include <linux/module.h>
>
> I think you only need export.h.
>

Nice.

> > +void *__memcpy(void *dest, const void *src, size_t count)
> > +{
> > +     const int bytes_long = BITS_PER_LONG / 8;
> > +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> > +     const int mask = bytes_long - 1;
> > +     const int distance = (src - dest) & mask;
> > +#endif
> > +     union const_types s = { .u8 = src };
> > +     union types d = { .u8 = dest };
> > +
> > +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> > +     if (count < MIN_THRESHOLD)
>
> Using IS_ENABLED we can avoid a lot of the mess in this
> function.
>
>         int distance = 0;
>
>         if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
>                 if (count < MIN_THRESHOLD)
>                         goto copy_remainder;
>
>                 /* copy a byte at time until destination is aligned */
>                 for (; count && d.uptr & mask; count--)
>                         *d.u8++ = *s.u8++;
>                 distance = (src - dest) & mask;
>         }
>

Cool. What about putting this check at the very start:

        if (count < MIN_THRESHOLD)
                goto copy_remainder;

And since count is at least twice bytes_long, remove count from the check below?
Also, setting distance after d is aligned is as simple as getting the
lower bits of s:

        if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
                /* Copy a byte at time until destination is aligned. */
                for (; d.uptr & mask; count--)
                        *d.u8++ = *s.u8++;

                distance = s.uptr & mask;
        }

>         if (distance) {
>                 ...
>
> > +             /* 32/64 bit wide copy from s to d.
> > +              * d is aligned now but s is not, so read s alignment wise,
> > +              * and do proper shift to get the right value.
> > +              * Works only on Little Endian machines.
> > +              */
>
> Normal kernel comment style always starts with a:
>

Right, I was used to netdev ones :)

>                 /*
>
>
> > +             for (next = s.ulong[0]; count >= bytes_long + mask; count -= bytes_long) {
>
> Please avoid the pointlessly overlong line.  And (just as a matter of
> personal preference) I find for loop that don't actually use a single
> iterator rather confusing.  Why not simply:
>
>                 next = s.ulong[0];
>                 while (count >= bytes_long + mask) {
>                         ...
>                         count -= bytes_long;
>                 }

My fault, in a previous version it was:

    next = s.ulong[0];
    for (; count >= bytes_long + mask; count -= bytes_long) {

So to have a single `count` counter for the loop.

Regards,
Matteo Croce June 22, 2021, 10:53 p.m. UTC | #6
On Tue, Jun 22, 2021 at 10:19 AM David Laight <David.Laight@aculab.com> wrote:
>
> From: Christoph Hellwig
> > Sent: 21 June 2021 15:27
> ...
> > > +           for (next = s.ulong[0]; count >= bytes_long + mask; count -= bytes_long) {
> >
> > Please avoid the pointlessly overlong line.  And (just as a matter of
> > personal preference) I find for loop that don't actually use a single
> > iterator rather confusing.  Why not simply:
> >
> >               next = s.ulong[0];
> >               while (count >= bytes_long + mask) {
> >                       ...
> >                       count -= bytes_long;
> >               }
>
> My first attack on long 'for' statements is just to move the
> initialisation to the previous line.
> Then make sure there is nothing in the comparison that needs
> to be calculated every iteration.
> I suspect you can subtract 'mask' from 'count'.
> Giving:
>                 count -= mask;
>                 next = s.ulong[0];
>                 for (; count > bytes_long; count -= bytes_long) {
>

This way we'll lose the remainder, as count is used at the end to copy
the leftover.
Anyway, both bytes_long and mask are constant, so I doubt they get summed
at every cycle.

> Next is to shorten the variable names!
>
>         David
>
Matteo Croce June 22, 2021, 11:35 p.m. UTC | #7
On Tue, Jun 22, 2021 at 2:15 AM Nick Kossifidis <mick@ics.forth.gr> wrote:
>
> Hello Matteo and thanks for the patch,
>
> Στις 2021-06-17 18:27, Matteo Croce έγραψε:
> >
> > @@ -0,0 +1,91 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * String functions optimized for hardware which doesn't
> > + * handle unaligned memory accesses efficiently.
> > + *
> > + * Copyright (C) 2021 Matteo Croce
> > + */
> > +
> > +#include <linux/types.h>
> > +#include <linux/module.h>
> > +
> > +/* Minimum size for a word copy to be convenient */
> > +#define MIN_THRESHOLD (BITS_PER_LONG / 8 * 2)
> > +
> > +/* convenience union to avoid cast between different pointer types */
> > +union types {
> > +     u8 *u8;
>
> You are using a type as a name, I'd go with as_bytes/as_ulong/as_uptr
> which makes it easier for the reader to understand what you are trying
> to do.
>

Makes sense

> > +     unsigned long *ulong;
> > +     uintptr_t uptr;
> > +};
> > +
> > +union const_types {
> > +     const u8 *u8;
> > +     unsigned long *ulong;
> > +};
> > +
>
> I suggest you define those unions inside the function body, no one else
> is using them.
>

They will be used in memset(), in patch 3/3

> > +void *__memcpy(void *dest, const void *src, size_t count)
> > +{
> > +     const int bytes_long = BITS_PER_LONG / 8;
> > +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> > +     const int mask = bytes_long - 1;
> > +     const int distance = (src - dest) & mask;
>
> Why not unsigned ints ?
>

Ok.

> > +#endif
> > +     union const_types s = { .u8 = src };
> > +     union types d = { .u8 = dest };
> > +
> > +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
>
> If you want to be compliant with memcpy you should check for overlapping
> regions here since "The memory areas must not overlap", and do nothing
> about it because according to POSIX this leads to undefined behavior.
> That's why recent libc implementations use memmove in any case (memcpy
> is an alias to memmove), which is the suggested approach.
>

Mmm, which memcpy arch implementation does this check?
I guess that no one is currently doing it.

> > +     if (count < MIN_THRESHOLD)
> > +             goto copy_remainder;
> > +
> > +     /* copy a byte at time until destination is aligned */
> > +     for (; count && d.uptr & mask; count--)
> > +             *d.u8++ = *s.u8++;
> > +
>
> You should check for !IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) here.
>

I thought that only Little Endian RISC-V machines were supported in Linux.
Should I add a BUILD_BUG_ON()?
Anyway, if this is going in generic lib/, I'll take care of the endianness.
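
A minimal guard could look something like this (hypothetical, not part of
this series):

	/* the shift-based copy below assumes a little-endian byte order */
	BUILD_BUG_ON(IS_ENABLED(CONFIG_CPU_BIG_ENDIAN));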

> > +     if (distance) {
> > +             unsigned long last, next;
> > +
> > +             /* move s backward to the previous alignment boundary */
> > +             s.u8 -= distance;
>
> It'd help here to explain that since s is distance bytes ahead relative
> to d, and d reached the alignment boundary above, s is now aligned but
> the data needs to be shifted to compensate for distance, in order to do
> word-by-word copy.
>
> > +
> > +             /* 32/64 bit wide copy from s to d.
> > +              * d is aligned now but s is not, so read s alignment wise,
> > +              * and do proper shift to get the right value.
> > +              * Works only on Little Endian machines.
> > +              */
>
> This comment is misleading because s is aligned, or else s.ulong[0]/[1]
> below would result in an unaligned access.
>

Yes, those two comments should be rephrased, merged and put above.

> > +             for (next = s.ulong[0]; count >= bytes_long + mask; count -=
> > bytes_long) {
> > +                     last = next;
> > +                     next = s.ulong[1];
> > +
> > +                     d.ulong[0] = last >> (distance * 8) |
> > +                                  next << ((bytes_long - distance) * 8);
> > +
> > +                     d.ulong++;
> > +                     s.ulong++;
> > +             }
> > +
> > +             /* restore s with the original offset */
> > +             s.u8 += distance;
> > +     } else
> > +#endif
> > +     {
> > +             /* if the source and dest lower bits are the same, do a simple
> > +              * 32/64 bit wide copy.
> > +              */
>
> A while() loop would make more sense here.
>

Ok.

> > +             for (; count >= bytes_long; count -= bytes_long)
> > +                     *d.ulong++ = *s.ulong++;
> > +     }
> > +
> > +     /* suppress warning when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y */
> > +     goto copy_remainder;
> > +
> > +copy_remainder:
> > +     while (count--)
> > +             *d.u8++ = *s.u8++;
> > +
> > +     return dest;
> > +}
> > +EXPORT_SYMBOL(__memcpy);
> > +
> > +void *memcpy(void *dest, const void *src, size_t count) __weak
> > __alias(__memcpy);
> > +EXPORT_SYMBOL(memcpy);

Regards,
Nick Kossifidis June 23, 2021, 9:48 a.m. UTC | #8
Στις 2021-06-23 02:35, Matteo Croce έγραψε:
>> 
>> If you want to be compliant with memcpy you should check for 
>> overlapping
>> regions here since "The memory areas must not overlap", and do nothing
>> about it because according to POSIX this leads to undefined behavior.
>> That's why recent libc implementations use memmove in any case (memcpy
>> is an alias to memmove), which is the suggested approach.
>> 
> 
> Mmm, which memcpy arch implementation does this check?
> I guess that no one is currently doing it.
> 

Yup, because even if they did they wouldn't know what to do about it, since
POSIX leaves this as undefined behavior. So instead of using memcpy it's 
suggested that people use memmove that can handle overlapping regions, 
and because we can't patch the rest of the kernel to only use memmove 
(or the rest of the programs if we were a libc), the idea is to just 
alias memcpy to memmove (BTW Torvalds also thinks this is a good idea: 
https://bugzilla.redhat.com/show_bug.cgi?id=638477#c132).
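
In kernel terms the simplest form of that is a plain delegation (hypothetical
sketch, not part of this series; a __weak __alias would also work, but only
if memmove were defined in the same translation unit):

	void *memcpy(void *dest, const void *src, size_t count)
	{
		return memmove(dest, src, count);
	}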

Patch

diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h
index 909049366555..6b5d6fc3eab4 100644
--- a/arch/riscv/include/asm/string.h
+++ b/arch/riscv/include/asm/string.h
@@ -12,9 +12,13 @@ 
 #define __HAVE_ARCH_MEMSET
 extern asmlinkage void *memset(void *, int, size_t);
 extern asmlinkage void *__memset(void *, int, size_t);
+
+#ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE
 #define __HAVE_ARCH_MEMCPY
-extern asmlinkage void *memcpy(void *, const void *, size_t);
-extern asmlinkage void *__memcpy(void *, const void *, size_t);
+extern void *memcpy(void *dest, const void *src, size_t count);
+extern void *__memcpy(void *dest, const void *src, size_t count);
+#endif
+
 #define __HAVE_ARCH_MEMMOVE
 extern asmlinkage void *memmove(void *, const void *, size_t);
 extern asmlinkage void *__memmove(void *, const void *, size_t);
diff --git a/arch/riscv/kernel/riscv_ksyms.c b/arch/riscv/kernel/riscv_ksyms.c
index 5ab1c7e1a6ed..3f6d512a5b97 100644
--- a/arch/riscv/kernel/riscv_ksyms.c
+++ b/arch/riscv/kernel/riscv_ksyms.c
@@ -10,8 +10,6 @@ 
  * Assembly functions that may be used (directly or indirectly) by modules
  */
 EXPORT_SYMBOL(memset);
-EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(memmove);
 EXPORT_SYMBOL(__memset);
-EXPORT_SYMBOL(__memcpy);
 EXPORT_SYMBOL(__memmove);
diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
index 25d5c9664e57..2ffe85d4baee 100644
--- a/arch/riscv/lib/Makefile
+++ b/arch/riscv/lib/Makefile
@@ -1,9 +1,9 @@ 
 # SPDX-License-Identifier: GPL-2.0-only
 lib-y			+= delay.o
-lib-y			+= memcpy.o
 lib-y			+= memset.o
 lib-y			+= memmove.o
 lib-$(CONFIG_MMU)	+= uaccess.o
 lib-$(CONFIG_64BIT)	+= tishift.o
+lib-$(CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE) += string.o
 
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
diff --git a/arch/riscv/lib/memcpy.S b/arch/riscv/lib/memcpy.S
deleted file mode 100644
index 51ab716253fa..000000000000
--- a/arch/riscv/lib/memcpy.S
+++ /dev/null
@@ -1,108 +0,0 @@ 
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2013 Regents of the University of California
- */
-
-#include <linux/linkage.h>
-#include <asm/asm.h>
-
-/* void *memcpy(void *, const void *, size_t) */
-ENTRY(__memcpy)
-WEAK(memcpy)
-	move t6, a0  /* Preserve return value */
-
-	/* Defer to byte-oriented copy for small sizes */
-	sltiu a3, a2, 128
-	bnez a3, 4f
-	/* Use word-oriented copy only if low-order bits match */
-	andi a3, t6, SZREG-1
-	andi a4, a1, SZREG-1
-	bne a3, a4, 4f
-
-	beqz a3, 2f  /* Skip if already aligned */
-	/*
-	 * Round to nearest double word-aligned address
-	 * greater than or equal to start address
-	 */
-	andi a3, a1, ~(SZREG-1)
-	addi a3, a3, SZREG
-	/* Handle initial misalignment */
-	sub a4, a3, a1
-1:
-	lb a5, 0(a1)
-	addi a1, a1, 1
-	sb a5, 0(t6)
-	addi t6, t6, 1
-	bltu a1, a3, 1b
-	sub a2, a2, a4  /* Update count */
-
-2:
-	andi a4, a2, ~((16*SZREG)-1)
-	beqz a4, 4f
-	add a3, a1, a4
-3:
-	REG_L a4,       0(a1)
-	REG_L a5,   SZREG(a1)
-	REG_L a6, 2*SZREG(a1)
-	REG_L a7, 3*SZREG(a1)
-	REG_L t0, 4*SZREG(a1)
-	REG_L t1, 5*SZREG(a1)
-	REG_L t2, 6*SZREG(a1)
-	REG_L t3, 7*SZREG(a1)
-	REG_L t4, 8*SZREG(a1)
-	REG_L t5, 9*SZREG(a1)
-	REG_S a4,       0(t6)
-	REG_S a5,   SZREG(t6)
-	REG_S a6, 2*SZREG(t6)
-	REG_S a7, 3*SZREG(t6)
-	REG_S t0, 4*SZREG(t6)
-	REG_S t1, 5*SZREG(t6)
-	REG_S t2, 6*SZREG(t6)
-	REG_S t3, 7*SZREG(t6)
-	REG_S t4, 8*SZREG(t6)
-	REG_S t5, 9*SZREG(t6)
-	REG_L a4, 10*SZREG(a1)
-	REG_L a5, 11*SZREG(a1)
-	REG_L a6, 12*SZREG(a1)
-	REG_L a7, 13*SZREG(a1)
-	REG_L t0, 14*SZREG(a1)
-	REG_L t1, 15*SZREG(a1)
-	addi a1, a1, 16*SZREG
-	REG_S a4, 10*SZREG(t6)
-	REG_S a5, 11*SZREG(t6)
-	REG_S a6, 12*SZREG(t6)
-	REG_S a7, 13*SZREG(t6)
-	REG_S t0, 14*SZREG(t6)
-	REG_S t1, 15*SZREG(t6)
-	addi t6, t6, 16*SZREG
-	bltu a1, a3, 3b
-	andi a2, a2, (16*SZREG)-1  /* Update count */
-
-4:
-	/* Handle trailing misalignment */
-	beqz a2, 6f
-	add a3, a1, a2
-
-	/* Use word-oriented copy if co-aligned to word boundary */
-	or a5, a1, t6
-	or a5, a5, a3
-	andi a5, a5, 3
-	bnez a5, 5f
-7:
-	lw a4, 0(a1)
-	addi a1, a1, 4
-	sw a4, 0(t6)
-	addi t6, t6, 4
-	bltu a1, a3, 7b
-
-	ret
-
-5:
-	lb a4, 0(a1)
-	addi a1, a1, 1
-	sb a4, 0(t6)
-	addi t6, t6, 1
-	bltu a1, a3, 5b
-6:
-	ret
-END(__memcpy)
diff --git a/arch/riscv/lib/string.c b/arch/riscv/lib/string.c
new file mode 100644
index 000000000000..e48a79a045d8
--- /dev/null
+++ b/arch/riscv/lib/string.c
@@ -0,0 +1,91 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * String functions optimized for hardware which doesn't
+ * handle unaligned memory accesses efficiently.
+ *
+ * Copyright (C) 2021 Matteo Croce
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+
+/* Minimum size for a word copy to be convenient */
+#define MIN_THRESHOLD (BITS_PER_LONG / 8 * 2)
+
+/* convenience union to avoid cast between different pointer types */
+union types {
+	u8 *u8;
+	unsigned long *ulong;
+	uintptr_t uptr;
+};
+
+union const_types {
+	const u8 *u8;
+	unsigned long *ulong;
+};
+
+void *__memcpy(void *dest, const void *src, size_t count)
+{
+	const int bytes_long = BITS_PER_LONG / 8;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	const int mask = bytes_long - 1;
+	const int distance = (src - dest) & mask;
+#endif
+	union const_types s = { .u8 = src };
+	union types d = { .u8 = dest };
+
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	if (count < MIN_THRESHOLD)
+		goto copy_remainder;
+
+	/* copy a byte at time until destination is aligned */
+	for (; count && d.uptr & mask; count--)
+		*d.u8++ = *s.u8++;
+
+	if (distance) {
+		unsigned long last, next;
+
+		/* move s backward to the previous alignment boundary */
+		s.u8 -= distance;
+
+		/* 32/64 bit wide copy from s to d.
+		 * d is aligned now but s is not, so read s alignment wise,
+		 * and do proper shift to get the right value.
+		 * Works only on Little Endian machines.
+		 */
+		for (next = s.ulong[0]; count >= bytes_long + mask; count -= bytes_long) {
+			last = next;
+			next = s.ulong[1];
+
+			d.ulong[0] = last >> (distance * 8) |
+				     next << ((bytes_long - distance) * 8);
+
+			d.ulong++;
+			s.ulong++;
+		}
+
+		/* restore s with the original offset */
+		s.u8 += distance;
+	} else
+#endif
+	{
+		/* if the source and dest lower bits are the same, do a simple
+		 * 32/64 bit wide copy.
+		 */
+		for (; count >= bytes_long; count -= bytes_long)
+			*d.ulong++ = *s.ulong++;
+	}
+
+	/* suppress warning when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y */
+	goto copy_remainder;
+
+copy_remainder:
+	while (count--)
+		*d.u8++ = *s.u8++;
+
+	return dest;
+}
+EXPORT_SYMBOL(__memcpy);
+
+void *memcpy(void *dest, const void *src, size_t count) __weak __alias(__memcpy);
+EXPORT_SYMBOL(memcpy);