diff mbox series

[v3,1/1] riscv: __asm_copy_to-from_user: Optimize unaligned memory access and pipeline stall

Message ID 60c1f087-1e8b-8f22-7d25-86f5f3dcee3f@gmail.com (mailing list archive)
State New, archived
Headers show
Series riscv: improving uaccess with logs from network bench | expand

Commit Message

Akira Tsukamoto June 23, 2021, 12:40 p.m. UTC
This patch will reduce cpu usage dramatically in kernel space, especially
for applications which use system calls with large buffer sizes, such as
network applications. The main reason behind this is that every unaligned
memory access will raise an exception and switch between s-mode and m-mode,
causing large overhead.

First, copy byte by byte until the destination memory address reaches the
first word-aligned boundary. This is the preparation before the bulk
aligned word copy.

The destination address is aligned now, but oftentimes the source address
is not on an aligned boundary. To reduce the unaligned memory access, it
reads the data from source on aligned boundaries, which will cause the
data to have an offset, and then combines the data in the next iteration
by fixing the offset with shifting before writing to destination. The
majority of the copy speed improvement comes from this shift copy.

In the lucky situation that both the source and destination addresses are
on an aligned boundary, perform loads and stores with register size to copy
the data. The loop is unrolled because, without unrolling, the speed drops:
a store that immediately uses the register written by the preceding load
will stall the pipeline.

Finally, copy the remainder one byte at a time.

Signed-off-by: Akira Tsukamoto <akira.tsukamoto@gmail.com>
---
 arch/riscv/lib/uaccess.S | 181 +++++++++++++++++++++++++++++++--------
 1 file changed, 146 insertions(+), 35 deletions(-)

Comments

Palmer Dabbelt July 6, 2021, 11:16 p.m. UTC | #1
On Wed, 23 Jun 2021 05:40:39 PDT (-0700), akira.tsukamoto@gmail.com wrote:
>
> This patch will reduce cpu usage dramatically in kernel space especially
> for application which use sys-call with large buffer size, such as network
> applications. The main reason behind this is that every unaligned memory
> access will raise exceptions and switch between s-mode and m-mode causing
> large overhead.
>
> First copy in bytes until reaches the first word aligned boundary in
> destination memory address. This is the preparation before the bulk
> aligned word copy.
>
> The destination address is aligned now, but oftentimes the source address
> is not in an aligned boundary. To reduce the unaligned memory access, it
> reads the data from source in aligned boundaries, which will cause the
> data to have an offset, and then combines the data in the next iteration
> by fixing offset with shifting before writing to destination. The majority
> of the improving copy speed comes from this shift copy.
>
> In the lucky situation that the both source and destination address are on
> the aligned boundary, perform load and store with register size to copy the
> data. Without the unrolling, it will reduce the speed since the next store
> instruction for the same register using from the load will stall the
> pipeline.
>
> At last, copying the remainder in one byte at a time.
>
> Signed-off-by: Akira Tsukamoto <akira.tsukamoto@gmail.com>
> ---
>  arch/riscv/lib/uaccess.S | 181 +++++++++++++++++++++++++++++++--------
>  1 file changed, 146 insertions(+), 35 deletions(-)
>
> diff --git a/arch/riscv/lib/uaccess.S b/arch/riscv/lib/uaccess.S
> index fceaeb18cc64..bceb0629e440 100644
> --- a/arch/riscv/lib/uaccess.S
> +++ b/arch/riscv/lib/uaccess.S
> @@ -19,50 +19,161 @@ ENTRY(__asm_copy_from_user)
>  	li t6, SR_SUM
>  	csrs CSR_STATUS, t6
>
> -	add a3, a1, a2
> -	/* Use word-oriented copy only if low-order bits match */
> -	andi t0, a0, SZREG-1
> -	andi t1, a1, SZREG-1
> -	bne t0, t1, 2f
> +	/* Save for return value */
> +	mv	t5, a2
>
> -	addi t0, a1, SZREG-1
> -	andi t1, a3, ~(SZREG-1)
> -	andi t0, t0, ~(SZREG-1)
>  	/*
> -	 * a3: terminal address of source region
> -	 * t0: lowest XLEN-aligned address in source
> -	 * t1: highest XLEN-aligned address in source
> +	 * Register allocation for code below:
> +	 * a0 - start of uncopied dst
> +	 * a1 - start of uncopied src
> +	 * a2 - size
> +	 * t0 - end of uncopied dst
>  	 */
> -	bgeu t0, t1, 2f
> -	bltu a1, t0, 4f
> +	add	t0, a0, a2
> +	bgtu	a0, t0, 5f
> +
> +	/*
> +	 * Use byte copy only if too small.
> +	 */
> +	li	a3, 8*SZREG /* size must be larger than size in word_copy */
> +	bltu	a2, a3, .Lbyte_copy_tail
> +
> +	/*
> +	 * Copy first bytes until dst is align to word boundary.
> +	 * a0 - start of dst
> +	 * t1 - start of aligned dst
> +	 */
> +	addi	t1, a0, SZREG-1
> +	andi	t1, t1, ~(SZREG-1)
> +	/* dst is already aligned, skip */
> +	beq	a0, t1, .Lskip_first_bytes
>  1:
> -	fixup REG_L, t2, (a1), 10f
> -	fixup REG_S, t2, (a0), 10f
> -	addi a1, a1, SZREG
> -	addi a0, a0, SZREG
> -	bltu a1, t1, 1b
> +	/* a5 - one byte for copying data */
> +	fixup lb      a5, 0(a1), 10f
> +	addi	a1, a1, 1	/* src */
> +	fixup sb      a5, 0(a0), 10f
> +	addi	a0, a0, 1	/* dst */
> +	bltu	a0, t1, 1b	/* t1 - start of aligned dst */
> +
> +.Lskip_first_bytes:
> +	/*
> +	 * Now dst is aligned.
> +	 * Use shift-copy if src is misaligned.
> +	 * Use word-copy if both src and dst are aligned because
> +	 * can not use shift-copy which do not require shifting
> +	 */
> +	/* a1 - start of src */
> +	andi	a3, a1, SZREG-1
> +	bnez	a3, .Lshift_copy
> +
> +.Lword_copy:
> +        /*
> +	 * Both src and dst are aligned, unrolled word copy
> +	 *
> +	 * a0 - start of aligned dst
> +	 * a1 - start of aligned src
> +	 * a3 - a1 & mask:(SZREG-1)
> +	 * t0 - end of aligned dst
> +	 */
> +	addi	t0, t0, -(8*SZREG-1) /* not to over run */
>  2:
> -	bltu a1, a3, 5f
> +	fixup REG_L   a4,        0(a1), 10f
> +	fixup REG_L   a5,    SZREG(a1), 10f
> +	fixup REG_L   a6,  2*SZREG(a1), 10f
> +	fixup REG_L   a7,  3*SZREG(a1), 10f
> +	fixup REG_L   t1,  4*SZREG(a1), 10f
> +	fixup REG_L   t2,  5*SZREG(a1), 10f
> +	fixup REG_L   t3,  6*SZREG(a1), 10f
> +	fixup REG_L   t4,  7*SZREG(a1), 10f
> +	fixup REG_S   a4,        0(a0), 10f
> +	fixup REG_S   a5,    SZREG(a0), 10f
> +	fixup REG_S   a6,  2*SZREG(a0), 10f
> +	fixup REG_S   a7,  3*SZREG(a0), 10f
> +	fixup REG_S   t1,  4*SZREG(a0), 10f
> +	fixup REG_S   t2,  5*SZREG(a0), 10f
> +	fixup REG_S   t3,  6*SZREG(a0), 10f
> +	fixup REG_S   t4,  7*SZREG(a0), 10f

This seems like a suspiciously large unrolling factor, at least without 
a fallback.  My guess is that some workloads will want some smaller 
unrolling factors, but given that we run on these single-issue in-order 
processors it's probably best to have some big unrolling factors as well 
since they're pretty limited WRT integer bandwidth.

> +	addi	a0, a0, 8*SZREG
> +	addi	a1, a1, 8*SZREG
> +	bltu	a0, t0, 2b
> +
> +	addi	t0, t0, 8*SZREG-1 /* revert to original value */
> +	j	.Lbyte_copy_tail
> +
> +.Lshift_copy:
> +
> +	/*
> +	 * Word copy with shifting.
> +	 * For misaligned copy we still perform aligned word copy, but
> +	 * we need to use the value fetched from the previous iteration and
> +	 * do some shifts.
> +	 * This is safe because reading less than a word size.
> +	 *
> +	 * a0 - start of aligned dst
> +	 * a1 - start of src
> +	 * a3 - a1 & mask:(SZREG-1)
> +	 * t0 - end of uncopied dst
> +	 * t1 - end of aligned dst
> +	 */
> +	/* calculating aligned word boundary for dst */
> +	andi	t1, t0, ~(SZREG-1)
> +	/* Converting unaligned src to aligned arc */
> +	andi	a1, a1, ~(SZREG-1)
> +
> +	/*
> +	 * Calculate shifts
> +	 * t3 - prev shift
> +	 * t4 - current shift
> +	 */
> +	slli	t3, a3, LGREG
> +	li	a5, SZREG*8
> +	sub	t4, a5, t3
> +
> +	/* Load the first word to combine with seceond word */
> +	fixup REG_L   a5, 0(a1), 10f
>
>  3:
> +	/* Main shifting copy
> +	 *
> +	 * a0 - start of aligned dst
> +	 * a1 - start of aligned src
> +	 * t1 - end of aligned dst
> +	 */
> +
> +	/* At least one iteration will be executed */
> +	srl	a4, a5, t3
> +	fixup REG_L   a5, SZREG(a1), 10f
> +	addi	a1, a1, SZREG
> +	sll	a2, a5, t4
> +	or	a2, a2, a4
> +	fixup REG_S   a2, 0(a0), 10f
> +	addi	a0, a0, SZREG
> +	bltu	a0, t1, 3b
> +
> +	/* Revert src to original unaligned value  */
> +	add	a1, a1, a3
> +
> +.Lbyte_copy_tail:
> +	/*
> +	 * Byte copy anything left.
> +	 *
> +	 * a0 - start of remaining dst
> +	 * a1 - start of remaining src
> +	 * t0 - end of remaining dst
> +	 */
> +	bgeu	a0, t0, 5f
> +4:
> +	fixup lb      a5, 0(a1), 10f
> +	addi	a1, a1, 1	/* src */
> +	fixup sb      a5, 0(a0), 10f
> +	addi	a0, a0, 1	/* dst */
> +	bltu	a0, t0, 4b	/* t0 - end of dst */
> +
> +5:
>  	/* Disable access to user memory */
>  	csrc CSR_STATUS, t6
> -	li a0, 0
> +	li	a0, 0
>  	ret
> -4: /* Edge case: unalignment */
> -	fixup lbu, t2, (a1), 10f
> -	fixup sb, t2, (a0), 10f
> -	addi a1, a1, 1
> -	addi a0, a0, 1
> -	bltu a1, t0, 4b
> -	j 1b
> -5: /* Edge case: remainder */
> -	fixup lbu, t2, (a1), 10f
> -	fixup sb, t2, (a0), 10f
> -	addi a1, a1, 1
> -	addi a0, a0, 1
> -	bltu a1, a3, 5b
> -	j 3b
>  ENDPROC(__asm_copy_to_user)
>  ENDPROC(__asm_copy_from_user)
>  EXPORT_SYMBOL(__asm_copy_to_user)
> @@ -117,7 +228,7 @@ EXPORT_SYMBOL(__clear_user)
>  10:
>  	/* Disable access to user memory */
>  	csrs CSR_STATUS, t6
> -	mv a0, a2
> +	mv a0, t5
>  	ret
>  11:
>  	csrs CSR_STATUS, t6

That said, this is good enough for me.  If someone comes up with a case 
where the extra unrolling is an issue I'm happy to take something to fix 
it, but until then I'm fine with this as-is.  Like the string functions,
it's probably best to eventually put this in C, but IIRC last time I
tried it was kind of a headache.

This is on for-next.

Thanks!
David Laight July 7, 2021, 10:07 a.m. UTC | #2
...
> > +	fixup REG_L   a4,        0(a1), 10f
> > +	fixup REG_L   a5,    SZREG(a1), 10f
> > +	fixup REG_L   a6,  2*SZREG(a1), 10f
> > +	fixup REG_L   a7,  3*SZREG(a1), 10f
> > +	fixup REG_L   t1,  4*SZREG(a1), 10f
> > +	fixup REG_L   t2,  5*SZREG(a1), 10f
> > +	fixup REG_L   t3,  6*SZREG(a1), 10f
> > +	fixup REG_L   t4,  7*SZREG(a1), 10f
> > +	fixup REG_S   a4,        0(a0), 10f
> > +	fixup REG_S   a5,    SZREG(a0), 10f
> > +	fixup REG_S   a6,  2*SZREG(a0), 10f
> > +	fixup REG_S   a7,  3*SZREG(a0), 10f
> > +	fixup REG_S   t1,  4*SZREG(a0), 10f
> > +	fixup REG_S   t2,  5*SZREG(a0), 10f
> > +	fixup REG_S   t3,  6*SZREG(a0), 10f
> > +	fixup REG_S   t4,  7*SZREG(a0), 10f
> 
> This seems like a suspiciously large unrolling factor, at least without
> a fallback.  My guess is that some workloads will want some smaller
> unrolling factors, but given that we run on these single-issue in-order
> processors it's probably best to have some big unrolling factors as well
> since they're pretty limited WRT integer bandwidth.

But a single-issue cpu is unlikely to have an 8 clock data delay.
OTOH a cpu that can do concurrent memory read and write might
not have enough 'out of order' capability to do so with the above loop.

You may want to interleave the reads and writes - starting with
two or three reads (possibly with the extra ones outside the loop).

I don't know the microarchitectures well enough (well at all)
to know the exact pitfalls.

The very simple cpu might have the same 'issue' the Nios2 has
(another MIPS clone fpga soft cpu) where there can be a pipeline
stall between a write and read.
I doubt the non-trivial riscv have that issue though.

> > +	addi	a0, a0, 8*SZREG
> > +	addi	a1, a1, 8*SZREG
> > +	bltu	a0, t0, 2b

For a dual-issue cpu you want to move the two 'addi' higher
up the loop so that they are 'free'.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Guenter Roeck July 10, 2021, 1:49 a.m. UTC | #3
Hi,

On Wed, Jun 23, 2021 at 09:40:39PM +0900, Akira Tsukamoto wrote:
> This patch will reduce cpu usage dramatically in kernel space especially
> for application which use sys-call with large buffer size, such as network
> applications. The main reason behind this is that every unaligned memory
> access will raise exceptions and switch between s-mode and m-mode causing
> large overhead.
> 
> First copy in bytes until reaches the first word aligned boundary in
> destination memory address. This is the preparation before the bulk
> aligned word copy.
> 
> The destination address is aligned now, but oftentimes the source address
> is not in an aligned boundary. To reduce the unaligned memory access, it
> reads the data from source in aligned boundaries, which will cause the
> data to have an offset, and then combines the data in the next iteration
> by fixing offset with shifting before writing to destination. The majority
> of the improving copy speed comes from this shift copy.
> 
> In the lucky situation that the both source and destination address are on
> the aligned boundary, perform load and store with register size to copy the
> data. Without the unrolling, it will reduce the speed since the next store
> instruction for the same register using from the load will stall the
> pipeline.
> 
> At last, copying the remainder in one byte at a time.
> 
> Signed-off-by: Akira Tsukamoto <akira.tsukamoto@gmail.com>

This patch causes all riscv32 qemu emulations to stall during boot.
The log suggests that something in kernel/user communication may be wrong.

Bad case:

Starting syslogd: OK
Starting klogd: OK
/etc/init.d/S02sysctl: line 68: syntax error: EOF in backquote substitution
/etc/init.d/S20urandom: line 1: syntax error: unterminated quoted string
Starting network: /bin/sh: syntax error: unterminated quoted string
sed: unmatched '/'
/bin/sh: syntax error: unterminated quoted string
FAIL
/etc/init.d/S55runtest: line 48: syntax error: EOF in backquote substitution

Good case (this patch reverted):

Starting syslogd: OK
Starting klogd: OK
Running sysctl: OK
Saving random seed: [   12.277714] random: dd: uninitialized urandom read (512 bytes read)
OK
Starting network: [   12.949529] e1000: eth0 NIC Link is Up 1000 Mbps Full Duplex, Flow Control: RX
[   12.951170] IPv6: ADDRCONF(NETDEV_CHANGE): eth0: link becomes ready
udhcpc: started, v1.33.0
udhcpc: sending discover
udhcpc: sending select for 10.0.2.15
udhcpc: lease of 10.0.2.15 obtained, lease time 86400
deleting routers
adding dns 10.0.2.3
OK
Found console ttyS0

Reverting this patch fixes the problem. Bisect log attached.

Guenter

---
# bad: [50be9417e23af5a8ac860d998e1e3f06b8fd79d7] Merge tag 'io_uring-5.14-2021-07-09' of git://git.kernel.dk/linux-block
# good: [f55966571d5eb2876a11e48e798b4592fa1ffbb7] Merge tag 'drm-next-2021-07-08-1' of git://anongit.freedesktop.org/drm/drm
git bisect start 'HEAD' 'f55966571d5e'
# good: [7a400bf28334fc7734639db3566394e1fc80670c] Merge tag 'for-linus-5.14-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/rw/ubifs
git bisect good 7a400bf28334fc7734639db3566394e1fc80670c
# bad: [d8dc121eeab9abfbc510097f8db83e87560f753b] Merge branch 'linus' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
git bisect bad d8dc121eeab9abfbc510097f8db83e87560f753b
# bad: [7761e36bc7222d1221242c5f195ee0fd40caea40] riscv: Fix PTDUMP output now BPF region moved back to module region
git bisect bad 7761e36bc7222d1221242c5f195ee0fd40caea40
# good: [5def4429aefe65b494816d9ba8ae7f971d522251] riscv: mm: Use better bitmap_zalloc()
git bisect good 5def4429aefe65b494816d9ba8ae7f971d522251
# good: [47513f243b452a5e21180dcf3d6ac1c57e1781a6] riscv: Enable KFENCE for riscv64
git bisect good 47513f243b452a5e21180dcf3d6ac1c57e1781a6
# good: [01112e5e20f5298a81639806cd0a3c587aade467] Merge branch 'riscv-wx-mappings' into for-next
git bisect good 01112e5e20f5298a81639806cd0a3c587aade467
# good: [70eee556b678d1e4cd4ea6742a577b596963fa25] riscv: ptrace: add argn syntax
git bisect good 70eee556b678d1e4cd4ea6742a577b596963fa25
# bad: [ca6eaaa210deec0e41cbfc380bf89cf079203569] riscv: __asm_copy_to-from_user: Optimize unaligned memory access and pipeline stall
git bisect bad ca6eaaa210deec0e41cbfc380bf89cf079203569
# good: [31da94c25aea835ceac00575a9fd206c5a833fed] riscv: add VMAP_STACK overflow detection
git bisect good 31da94c25aea835ceac00575a9fd206c5a833fed
# first bad commit: [ca6eaaa210deec0e41cbfc380bf89cf079203569] riscv: __asm_copy_to-from_user: Optimize unaligned memory access and pipeline stall
Geert Uytterhoeven July 13, 2021, 6:10 p.m. UTC | #4
Hi Günter, Tsukamoto-san,

On Sat, Jul 10, 2021 at 3:50 AM Guenter Roeck <linux@roeck-us.net> wrote:
> On Wed, Jun 23, 2021 at 09:40:39PM +0900, Akira Tsukamoto wrote:
> > This patch will reduce cpu usage dramatically in kernel space especially
> > for application which use sys-call with large buffer size, such as network
> > applications. The main reason behind this is that every unaligned memory
> > access will raise exceptions and switch between s-mode and m-mode causing
> > large overhead.
> >
> > First copy in bytes until reaches the first word aligned boundary in
> > destination memory address. This is the preparation before the bulk
> > aligned word copy.
> >
> > The destination address is aligned now, but oftentimes the source address
> > is not in an aligned boundary. To reduce the unaligned memory access, it
> > reads the data from source in aligned boundaries, which will cause the
> > data to have an offset, and then combines the data in the next iteration
> > by fixing offset with shifting before writing to destination. The majority
> > of the improving copy speed comes from this shift copy.
> >
> > In the lucky situation that the both source and destination address are on
> > the aligned boundary, perform load and store with register size to copy the
> > data. Without the unrolling, it will reduce the speed since the next store
> > instruction for the same register using from the load will stall the
> > pipeline.
> >
> > At last, copying the remainder in one byte at a time.
> >
> > Signed-off-by: Akira Tsukamoto <akira.tsukamoto@gmail.com>
>
> This patch causes all riscv32 qemu emulations to stall during boot.
> The log suggests that something in kernel/user communication may be wrong.
>
> Bad case:
>
> Starting syslogd: OK
> Starting klogd: OK
> /etc/init.d/S02sysctl: line 68: syntax error: EOF in backquote substitution
> /etc/init.d/S20urandom: line 1: syntax error: unterminated quoted string
> Starting network: /bin/sh: syntax error: unterminated quoted string

> # first bad commit: [ca6eaaa210deec0e41cbfc380bf89cf079203569] riscv: __asm_copy_to-from_user: Optimize unaligned memory access and pipeline stall

Same here on vexriscv. Bisected to the same commit.

The actual scripts look fine when using "cat", but contain some garbage
when executing them using "sh -v".

Tsukamoto-san: glancing at the patch:

+       addi    a0, a0, 8*SZREG
+       addi    a1, a1, 8*SZREG

I think you forgot about rv32, where registers cover only 4
bytes each?

Gr{oetje,eeting}s,

                        Geert
Akira Tsukamoto July 15, 2021, 6:20 a.m. UTC | #5
On 7/14/2021 3:10 AM, Geert Uytterhoeven wrote:
> Hi Günter, Tsukamoto-san,
> 
> On Sat, Jul 10, 2021 at 3:50 AM Guenter Roeck <linux@roeck-us.net> wrote:
>> On Wed, Jun 23, 2021 at 09:40:39PM +0900, Akira Tsukamoto wrote:
>>> This patch will reduce cpu usage dramatically in kernel space especially
>>> for application which use sys-call with large buffer size, such as network
>>> applications. The main reason behind this is that every unaligned memory
>>> access will raise exceptions and switch between s-mode and m-mode causing
>>> large overhead.
>>>
>>> First copy in bytes until reaches the first word aligned boundary in
>>> destination memory address. This is the preparation before the bulk
>>> aligned word copy.
>>>
>>> The destination address is aligned now, but oftentimes the source address
>>> is not in an aligned boundary. To reduce the unaligned memory access, it
>>> reads the data from source in aligned boundaries, which will cause the
>>> data to have an offset, and then combines the data in the next iteration
>>> by fixing offset with shifting before writing to destination. The majority
>>> of the improving copy speed comes from this shift copy.
>>>
>>> In the lucky situation that the both source and destination address are on
>>> the aligned boundary, perform load and store with register size to copy the
>>> data. Without the unrolling, it will reduce the speed since the next store
>>> instruction for the same register using from the load will stall the
>>> pipeline.
>>>
>>> At last, copying the remainder in one byte at a time.
>>>
>>> Signed-off-by: Akira Tsukamoto <akira.tsukamoto@gmail.com>
>>
>> This patch causes all riscv32 qemu emulations to stall during boot.
>> The log suggests that something in kernel/user communication may be wrong.
>>
>> Bad case:
>>
>> Starting syslogd: OK
>> Starting klogd: OK
>> /etc/init.d/S02sysctl: line 68: syntax error: EOF in backquote substitution
>> /etc/init.d/S20urandom: line 1: syntax error: unterminated quoted string
>> Starting network: /bin/sh: syntax error: unterminated quoted string
> 
>> # first bad commit: [ca6eaaa210deec0e41cbfc380bf89cf079203569] riscv: __asm_copy_to-from_user: Optimize unaligned memory access and pipeline stall
> 
> Same here on vexriscv. Bisected to the same commit.
> 
> The actual scripts look fine when using "cat", but contain some garbage
> when executing them using "sh -v".
> 
> Tsukamoto-san: glancing at the patch:
> 
> +       addi    a0, a0, 8*SZREG
> +       addi    a1, a1, 8*SZREG
> 
> I think you forgot about rv32, where registers cover only 4
> bytes each?

Thanks Günter and Geert for pointing out the errors.
I will send the fixes, probably this weekend.

Akira
diff mbox series

Patch

diff --git a/arch/riscv/lib/uaccess.S b/arch/riscv/lib/uaccess.S
index fceaeb18cc64..bceb0629e440 100644
--- a/arch/riscv/lib/uaccess.S
+++ b/arch/riscv/lib/uaccess.S
@@ -19,50 +19,161 @@  ENTRY(__asm_copy_from_user)
 	li t6, SR_SUM
 	csrs CSR_STATUS, t6
 
-	add a3, a1, a2
-	/* Use word-oriented copy only if low-order bits match */
-	andi t0, a0, SZREG-1
-	andi t1, a1, SZREG-1
-	bne t0, t1, 2f
+	/* Save for return value */
+	mv	t5, a2
 
-	addi t0, a1, SZREG-1
-	andi t1, a3, ~(SZREG-1)
-	andi t0, t0, ~(SZREG-1)
 	/*
-	 * a3: terminal address of source region
-	 * t0: lowest XLEN-aligned address in source
-	 * t1: highest XLEN-aligned address in source
+	 * Register allocation for code below:
+	 * a0 - start of uncopied dst
+	 * a1 - start of uncopied src
+	 * a2 - size
+	 * t0 - end of uncopied dst
 	 */
-	bgeu t0, t1, 2f
-	bltu a1, t0, 4f
+	add	t0, a0, a2
+	bgtu	a0, t0, 5f
+
+	/*
+	 * Use byte copy only if too small.
+	 */
+	li	a3, 8*SZREG /* size must be larger than size in word_copy */
+	bltu	a2, a3, .Lbyte_copy_tail
+
+	/*
+	 * Copy first bytes until dst is align to word boundary.
+	 * a0 - start of dst
+	 * t1 - start of aligned dst
+	 */
+	addi	t1, a0, SZREG-1
+	andi	t1, t1, ~(SZREG-1)
+	/* dst is already aligned, skip */
+	beq	a0, t1, .Lskip_first_bytes
 1:
-	fixup REG_L, t2, (a1), 10f
-	fixup REG_S, t2, (a0), 10f
-	addi a1, a1, SZREG
-	addi a0, a0, SZREG
-	bltu a1, t1, 1b
+	/* a5 - one byte for copying data */
+	fixup lb      a5, 0(a1), 10f
+	addi	a1, a1, 1	/* src */
+	fixup sb      a5, 0(a0), 10f
+	addi	a0, a0, 1	/* dst */
+	bltu	a0, t1, 1b	/* t1 - start of aligned dst */
+
+.Lskip_first_bytes:
+	/*
+	 * Now dst is aligned.
+	 * Use shift-copy if src is misaligned.
+	 * Use word-copy if both src and dst are aligned because
+	 * can not use shift-copy which do not require shifting
+	 */
+	/* a1 - start of src */
+	andi	a3, a1, SZREG-1
+	bnez	a3, .Lshift_copy
+
+.Lword_copy:
+        /*
+	 * Both src and dst are aligned, unrolled word copy
+	 *
+	 * a0 - start of aligned dst
+	 * a1 - start of aligned src
+	 * a3 - a1 & mask:(SZREG-1)
+	 * t0 - end of aligned dst
+	 */
+	addi	t0, t0, -(8*SZREG-1) /* not to over run */
 2:
-	bltu a1, a3, 5f
+	fixup REG_L   a4,        0(a1), 10f
+	fixup REG_L   a5,    SZREG(a1), 10f
+	fixup REG_L   a6,  2*SZREG(a1), 10f
+	fixup REG_L   a7,  3*SZREG(a1), 10f
+	fixup REG_L   t1,  4*SZREG(a1), 10f
+	fixup REG_L   t2,  5*SZREG(a1), 10f
+	fixup REG_L   t3,  6*SZREG(a1), 10f
+	fixup REG_L   t4,  7*SZREG(a1), 10f
+	fixup REG_S   a4,        0(a0), 10f
+	fixup REG_S   a5,    SZREG(a0), 10f
+	fixup REG_S   a6,  2*SZREG(a0), 10f
+	fixup REG_S   a7,  3*SZREG(a0), 10f
+	fixup REG_S   t1,  4*SZREG(a0), 10f
+	fixup REG_S   t2,  5*SZREG(a0), 10f
+	fixup REG_S   t3,  6*SZREG(a0), 10f
+	fixup REG_S   t4,  7*SZREG(a0), 10f
+	addi	a0, a0, 8*SZREG
+	addi	a1, a1, 8*SZREG
+	bltu	a0, t0, 2b
+
+	addi	t0, t0, 8*SZREG-1 /* revert to original value */
+	j	.Lbyte_copy_tail
+
+.Lshift_copy:
+
+	/*
+	 * Word copy with shifting.
+	 * For misaligned copy we still perform aligned word copy, but
+	 * we need to use the value fetched from the previous iteration and
+	 * do some shifts.
+	 * This is safe because reading less than a word size.
+	 *
+	 * a0 - start of aligned dst
+	 * a1 - start of src
+	 * a3 - a1 & mask:(SZREG-1)
+	 * t0 - end of uncopied dst
+	 * t1 - end of aligned dst
+	 */
+	/* calculating aligned word boundary for dst */
+	andi	t1, t0, ~(SZREG-1)
+	/* Converting unaligned src to aligned arc */
+	andi	a1, a1, ~(SZREG-1)
+
+	/*
+	 * Calculate shifts
+	 * t3 - prev shift
+	 * t4 - current shift
+	 */
+	slli	t3, a3, LGREG
+	li	a5, SZREG*8
+	sub	t4, a5, t3
+
+	/* Load the first word to combine with seceond word */
+	fixup REG_L   a5, 0(a1), 10f
 
 3:
+	/* Main shifting copy
+	 *
+	 * a0 - start of aligned dst
+	 * a1 - start of aligned src
+	 * t1 - end of aligned dst
+	 */
+
+	/* At least one iteration will be executed */
+	srl	a4, a5, t3
+	fixup REG_L   a5, SZREG(a1), 10f
+	addi	a1, a1, SZREG
+	sll	a2, a5, t4
+	or	a2, a2, a4
+	fixup REG_S   a2, 0(a0), 10f
+	addi	a0, a0, SZREG
+	bltu	a0, t1, 3b
+
+	/* Revert src to original unaligned value  */
+	add	a1, a1, a3
+
+.Lbyte_copy_tail:
+	/*
+	 * Byte copy anything left.
+	 *
+	 * a0 - start of remaining dst
+	 * a1 - start of remaining src
+	 * t0 - end of remaining dst
+	 */
+	bgeu	a0, t0, 5f
+4:
+	fixup lb      a5, 0(a1), 10f
+	addi	a1, a1, 1	/* src */
+	fixup sb      a5, 0(a0), 10f
+	addi	a0, a0, 1	/* dst */
+	bltu	a0, t0, 4b	/* t0 - end of dst */
+
+5:
 	/* Disable access to user memory */
 	csrc CSR_STATUS, t6
-	li a0, 0
+	li	a0, 0
 	ret
-4: /* Edge case: unalignment */
-	fixup lbu, t2, (a1), 10f
-	fixup sb, t2, (a0), 10f
-	addi a1, a1, 1
-	addi a0, a0, 1
-	bltu a1, t0, 4b
-	j 1b
-5: /* Edge case: remainder */
-	fixup lbu, t2, (a1), 10f
-	fixup sb, t2, (a0), 10f
-	addi a1, a1, 1
-	addi a0, a0, 1
-	bltu a1, a3, 5b
-	j 3b
 ENDPROC(__asm_copy_to_user)
 ENDPROC(__asm_copy_from_user)
 EXPORT_SYMBOL(__asm_copy_to_user)
@@ -117,7 +228,7 @@  EXPORT_SYMBOL(__clear_user)
 10:
 	/* Disable access to user memory */
 	csrs CSR_STATUS, t6
-	mv a0, a2
+	mv a0, t5
 	ret
 11:
 	csrs CSR_STATUS, t6