
[1/1] riscv: prevent pipeline stall in __asm_to/copy_from_user

Message ID CACuRN0Pd8VFTz55qzXvJeqOEt2ZGi--j1wDyqnAt=q_42ES++w@mail.gmail.com (mailing list archive)
State New, archived
Series riscv: better network performance with memcpy, uaccess

Commit Message

Akira Tsukamoto June 4, 2021, 9:56 a.m. UTC
Reducing pipeline stalls caused by read-after-write (RAW) hazards.

These are the results from the combination of this speedup with
Gary's misalignment fix. It speeds up from 680 Mbps to 900 Mbps.
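
The main copy loop is unrolled so that eight register-sized loads are
issued before any of the stores that consume them, which hides the
load-use latency on an in-order core. A rough C-level sketch of the idea
(function and variable names are illustrative only; the real
implementation is the assembly in the patch below, and the tail handling
is omitted here):

	/* Illustrative only: 8-word unroll, all loads grouped before the
	 * stores so each loaded value has time to arrive before it is used. */
	static void copy_words(unsigned long *dst, const unsigned long *src,
			       unsigned long nwords)
	{
		unsigned long i;

		for (i = 0; i + 8 <= nwords; i += 8) {
			unsigned long v0 = src[i],     v1 = src[i + 1];
			unsigned long v2 = src[i + 2], v3 = src[i + 3];
			unsigned long v4 = src[i + 4], v5 = src[i + 5];
			unsigned long v6 = src[i + 6], v7 = src[i + 7];

			dst[i]     = v0; dst[i + 1] = v1;
			dst[i + 2] = v2; dst[i + 3] = v3;
			dst[i + 4] = v4; dst[i + 5] = v5;
			dst[i + 6] = v6; dst[i + 7] = v7;
		}
		/* remaining words and bytes are handled by the byte copy path */
	}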

Before applying these two patches.
---
-----------------------------------------------------------
Server listening on 5201
-----------------------------------------------------------
Accepted connection from 192.168.1.153, port 45972
[  5] local 192.168.1.112 port 5201 connected to 192.168.1.153 port 45974
[ ID] Interval           Transfer     Bitrate
[  5]   0.00-1.00   sec  80.8 MBytes   678 Mbits/sec
[  5]   1.00-2.00   sec  82.1 MBytes   689 Mbits/sec
[  5]   2.00-3.00   sec  82.1 MBytes   688 Mbits/sec
[  5]   3.00-4.00   sec  81.7 MBytes   685 Mbits/sec
[  5]   4.00-5.00   sec  82.1 MBytes   689 Mbits/sec
[  5]   5.00-6.00   sec  82.0 MBytes   687 Mbits/sec
[  5]   6.00-7.00   sec  82.4 MBytes   691 Mbits/sec
[  5]   7.00-8.00   sec  82.2 MBytes   689 Mbits/sec
[  5]   8.00-9.00   sec  82.2 MBytes   690 Mbits/sec
[  5]   9.00-10.00  sec  82.2 MBytes   690 Mbits/sec
[  5]  10.00-10.01  sec   486 KBytes   682 Mbits/sec
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval           Transfer     Bitrate
[  5]   0.00-10.01  sec   820 MBytes   688 Mbits/sec                  receiver
-----------------------------------------------------------
---

After.
---
-----------------------------------------------------------
Server listening on 5201
-----------------------------------------------------------
Accepted connection from 192.168.1.153, port 44612
[  5] local 192.168.1.112 port 5201 connected to 192.168.1.153 port 44614
[ ID] Interval           Transfer     Bitrate
[  5]   0.00-1.00   sec   105 MBytes   879 Mbits/sec
[  5]   1.00-2.00   sec   108 MBytes   904 Mbits/sec
[  5]   2.00-3.00   sec   107 MBytes   901 Mbits/sec
[  5]   3.00-4.00   sec   108 MBytes   902 Mbits/sec
[  5]   4.00-5.00   sec   108 MBytes   906 Mbits/sec
[  5]   5.00-6.00   sec   107 MBytes   900 Mbits/sec
[  5]   6.00-7.00   sec   108 MBytes   906 Mbits/sec
[  5]   7.00-8.00   sec   108 MBytes   904 Mbits/sec
[  5]   8.00-9.00   sec   108 MBytes   902 Mbits/sec
[  5]   9.00-10.00  sec   108 MBytes   905 Mbits/sec
[  5]  10.00-10.01  sec   612 KBytes   902 Mbits/sec
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval           Transfer     Bitrate
[  5]   0.00-10.01  sec  1.05 GBytes   901 Mbits/sec                  receiver
-----------------------------------------------------------
---

Signed-off-by: Akira Tsukamoto <akira.tsukamoto@gmail.com>
---
 arch/riscv/lib/uaccess.S | 106 +++++++++++++++++++++++++++------------
 1 file changed, 73 insertions(+), 33 deletions(-)

Comments

David Laight June 8, 2021, 11:31 a.m. UTC | #1
From: Akira Tsukamoto
> Sent: 04 June 2021 10:57
> 
> Reducing pipeline stalls caused by read-after-write (RAW) hazards.
> 
> These are the results from the combination of this speedup with
> Gary's misalignment fix. It speeds up from 680 Mbps to 900 Mbps.
> 
> Before applying these two patches.

I think the changes should be in separate patches.
Otherwise it is difficult to see what is relevant.
It also looks as if there is a register rename.
Maybe that should be a precursor patch?
...

I think this is the old main copy loop:
>  1:
> -    fixup REG_L, t2, (a1), 10f
> -    fixup REG_S, t2, (a0), 10f
> -    addi a1, a1, SZREG
> -    addi a0, a0, SZREG
> -    bltu a1, t1, 1b
and this is the new one:
>  3:
> +    fixup REG_L a4,       0(a1), 10f
> +    fixup REG_L a5,   SZREG(a1), 10f
> +    fixup REG_L a6, 2*SZREG(a1), 10f
> +    fixup REG_L a7, 3*SZREG(a1), 10f
> +    fixup REG_L t0, 4*SZREG(a1), 10f
> +    fixup REG_L t1, 5*SZREG(a1), 10f
> +    fixup REG_L t2, 6*SZREG(a1), 10f
> +    fixup REG_L t3, 7*SZREG(a1), 10f
> +    fixup REG_S a4,       0(t5), 10f
> +    fixup REG_S a5,   SZREG(t5), 10f
> +    fixup REG_S a6, 2*SZREG(t5), 10f
> +    fixup REG_S a7, 3*SZREG(t5), 10f
> +    fixup REG_S t0, 4*SZREG(t5), 10f
> +    fixup REG_S t1, 5*SZREG(t5), 10f
> +    fixup REG_S t2, 6*SZREG(t5), 10f
> +    fixup REG_S t3, 7*SZREG(t5), 10f
> +    addi a1, a1, 8*SZREG
> +    addi t5, t5, 8*SZREG
> +    bltu a1, a3, 3b

I don't know the architecture, but unless there is a stunning
pipeline delay for memory reads a simple interleaved copy
may be fast enough.
So something like:
	a = src[0];
	do {
		b = src[1];
		src += 2;
		dst[0] = a;
		dst += 2;
		a = src[0];
		dst[-1] = b;
	} while (src != src_end);
	dst[0] = a;

It is probably worth doing benchmarks of the copy loop
in userspace.
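
A minimal harness along these lines could be used for that; the buffer
size, iteration count and clock_gettime() timing are only illustrative
assumptions, and copy_under_test() is a stand-in for whichever loop
variant is being measured:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <time.h>

	#define BUF_SIZE (1UL << 20)	/* 1 MiB working set (assumed) */
	#define ITERS    1000

	/* Stand-in: replace the body with the copy loop under test. */
	static void copy_under_test(unsigned long *dst, const unsigned long *src,
				    unsigned long nwords)
	{
		unsigned long i;

		for (i = 0; i < nwords; i++)
			dst[i] = src[i];
	}

	int main(void)
	{
		unsigned long nwords = BUF_SIZE / sizeof(unsigned long);
		unsigned long *src = malloc(BUF_SIZE);
		unsigned long *dst = malloc(BUF_SIZE);
		struct timespec t0, t1;
		double secs;
		int i;

		if (!src || !dst)
			return 1;
		memset(src, 0x5a, BUF_SIZE);

		clock_gettime(CLOCK_MONOTONIC, &t0);
		for (i = 0; i < ITERS; i++)
			copy_under_test(dst, src, nwords);
		clock_gettime(CLOCK_MONOTONIC, &t1);

		/* verify the copy and keep dst live */
		if (memcmp(dst, src, BUF_SIZE) != 0)
			return 1;

		secs = (t1.tv_sec - t0.tv_sec) +
		       (t1.tv_nsec - t0.tv_nsec) / 1e9;
		printf("%.1f MB/s copied\n",
		       (double)BUF_SIZE * ITERS / secs / 1e6);

		free(src);
		free(dst);
		return 0;
	}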

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Palmer Dabbelt June 12, 2021, 4:05 a.m. UTC | #2
On Tue, 08 Jun 2021 04:31:40 PDT (-0700), David.Laight@ACULAB.COM wrote:
> From: Akira Tsukamoto
>> Sent: 04 June 2021 10:57
>> 
>> Reducing pipeline stalls caused by read-after-write (RAW) hazards.
>> 
>> These are the results from the combination of this speedup with
>> Gary's misalignment fix. It speeds up from 680 Mbps to 900 Mbps.
>> 
>> Before applying these two patches.
> 
> I think the changes should be in separate patches.
> Otherwise it is difficult to see what is relevant.
> It also looks as if there is a register rename.
> Maybe that should be a precursor patch?

Yes, and I'd also prefer the original patches.  This also doesn't apply.

> ...
> 
> I think this is the old main copy loop:
>>  1:
>> -    fixup REG_L, t2, (a1), 10f
>> -    fixup REG_S, t2, (a0), 10f
>> -    addi a1, a1, SZREG
>> -    addi a0, a0, SZREG
>> -    bltu a1, t1, 1b
> and this is the new one:
>>  3:
>> +    fixup REG_L a4,       0(a1), 10f
>> +    fixup REG_L a5,   SZREG(a1), 10f
>> +    fixup REG_L a6, 2*SZREG(a1), 10f
>> +    fixup REG_L a7, 3*SZREG(a1), 10f
>> +    fixup REG_L t0, 4*SZREG(a1), 10f
>> +    fixup REG_L t1, 5*SZREG(a1), 10f
>> +    fixup REG_L t2, 6*SZREG(a1), 10f
>> +    fixup REG_L t3, 7*SZREG(a1), 10f
>> +    fixup REG_S a4,       0(t5), 10f
>> +    fixup REG_S a5,   SZREG(t5), 10f
>> +    fixup REG_S a6, 2*SZREG(t5), 10f
>> +    fixup REG_S a7, 3*SZREG(t5), 10f
>> +    fixup REG_S t0, 4*SZREG(t5), 10f
>> +    fixup REG_S t1, 5*SZREG(t5), 10f
>> +    fixup REG_S t2, 6*SZREG(t5), 10f
>> +    fixup REG_S t3, 7*SZREG(t5), 10f
>> +    addi a1, a1, 8*SZREG
>> +    addi t5, t5, 8*SZREG
>> +    bltu a1, a3, 3b
> 
> I don't know the architecture, but unless there is a stunning
> pipeline delay for memory reads a simple interleaved copy
> may be fast enough.
> So something like:
> 	a = src[0];
> 	do {
> 		b = src[1];
> 		src += 2;
> 		dst[0] = a;
> 		dst += 2;
> 		a = src[0];
> 		dst[-1] = b;
> 	} while (src != src_end);
> 	dst[0] = a;
> 
> It is probably worth doing benchmarks of the copy loop
> in userspace.

I also don't know this microarchitecture, but this seems like a pretty 
wacky load-use delay.

Can we split out the misaligned handling fix to get that in sooner?
That's likely the more urgent issue.

> 
> 	David
> 
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
> Registration No: 1397386 (Wales)
David Laight June 12, 2021, 12:17 p.m. UTC | #3
From: Palmer Dabbelt
> Sent: 12 June 2021 05:05
...
> > I don't know the architecture, but unless there is a stunning
> > pipeline delay for memory reads a simple interleaved copy
> > may be fast enough.
> > So something like:
> > 	a = src[0];
> > 	do {
> > 		b = src[1];
> > 		src += 2;
> > 		dst[0] = a;
> > 		dst += 2;
> > 		a = src[0];
> > 		dst[-1] = b;
> > 	} while (src != src_end);
> > 	dst[0] = a;
> >
> > It is probably worth doing benchmarks of the copy loop
> > in userspace.
> 
> I also don't know this microarchitecture, but this seems like a pretty
> wacky load-use delay.

It is quite sane really.

While many cpus can use the result of the ALU in the next clock
(there is typically special logic to bypass the write to the
register file) this isn't always true for memory (cache) reads.
It may even be that the read itself takes more than one cycle
(probably pipelined so they can happen every cycle).

So a simple '*dest = *src' copy loop suffers the 'memory read'
penalty every iteration.
An out-of-order execution unit that uses register renames
(like most x86) will just defer the writes until the data
is available - so it isn't impacted.

Interleaving the reads and writes means you issue a read
while waiting for the value from the previous read to
get to the register file - and be available for the
write instruction.

Moving the 'src/dst += 2' into the loop gives a reasonable
chance that they are executed in parallel with a memory
access (on an in-order superscalar cpu) rather than bunching
them up at the end where they start adding clocks.

If your cpu can only do one memory read or one memory write
per clock then you only need it to execute two instructions
per clock for the loop above to run at maximum speed.
Even with a 'read latency' of two clocks.
(Especially since riscv has 'mips like' 'compare and branch'
instructions that probably execute in 1 clock when predicted
taken.)

If the cpu can do a read and a write in one clock then the
loop may still run at the maximum speed.
For this to happen you do need the read data to be available
next clock and to run load, store, add and compare instructions
in a single clock.
Without that much parallelism it might be necessary to add
an extra read/write interleave (and maybe a 4th to avoid a
divide by three).

The 'elephant in the room' is a potential additional stall
on reads if the previous cycle is a write to the same cache area.
For instance the nios2 (a soft cpu for altera fpga) can do
back to back reads or back to back writes, but since the reads
are done speculatively (regardless of the opcode!) they have to
be deferred when a write is using the memory block.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Akira Tsukamoto June 16, 2021, 10:08 a.m. UTC | #4
On Sat, Jun 12, 2021 at 1:05 PM Palmer Dabbelt <palmer@dabbelt.com> wrote:
>
> On Tue, 08 Jun 2021 04:31:40 PDT (-0700), David.Laight@ACULAB.COM wrote:
> > From: Akira Tsukamoto
> >> Sent: 04 June 2021 10:57
> >>
> >> Reducing pipeline stalls caused by read-after-write (RAW) hazards.
> >>
> >> These are the results from the combination of this speedup with
> >> Gary's misalignment fix. It speeds up from 680 Mbps to 900 Mbps.
> >>
> >> Before applying these two patches.
> >
> > I think the changes should be in separate patches.
> > Otherwise it is difficult to see what is relevant.
> > It also looks as if there is a register rename.
> > Maybe that should be a precursor patch?
>
> Yes, and I'd also prefer the original patches.  This also doesn't apply.
>
> > ...
> >
> > I think this is the old main copy loop:
> >>  1:
> >> -    fixup REG_L, t2, (a1), 10f
> >> -    fixup REG_S, t2, (a0), 10f
> >> -    addi a1, a1, SZREG
> >> -    addi a0, a0, SZREG
> >> -    bltu a1, t1, 1b
> > and this is the new one:
> >>  3:
> >> +    fixup REG_L a4,       0(a1), 10f
> >> +    fixup REG_L a5,   SZREG(a1), 10f
> >> +    fixup REG_L a6, 2*SZREG(a1), 10f
> >> +    fixup REG_L a7, 3*SZREG(a1), 10f
> >> +    fixup REG_L t0, 4*SZREG(a1), 10f
> >> +    fixup REG_L t1, 5*SZREG(a1), 10f
> >> +    fixup REG_L t2, 6*SZREG(a1), 10f
> >> +    fixup REG_L t3, 7*SZREG(a1), 10f
> >> +    fixup REG_S a4,       0(t5), 10f
> >> +    fixup REG_S a5,   SZREG(t5), 10f
> >> +    fixup REG_S a6, 2*SZREG(t5), 10f
> >> +    fixup REG_S a7, 3*SZREG(t5), 10f
> >> +    fixup REG_S t0, 4*SZREG(t5), 10f
> >> +    fixup REG_S t1, 5*SZREG(t5), 10f
> >> +    fixup REG_S t2, 6*SZREG(t5), 10f
> >> +    fixup REG_S t3, 7*SZREG(t5), 10f
> >> +    addi a1, a1, 8*SZREG
> >> +    addi t5, t5, 8*SZREG
> >> +    bltu a1, a3, 3b
> >
> > I don't know the architecture, but unless there is a stunning
> > pipeline delay for memory reads a simple interleaved copy
> > may be fast enough.
> > So something like:
> >       a = src[0];
> >       do {
> >               b = src[1];
> >               src += 2;
> >               dst[0] = a;
> >               dst += 2;
> >               a = src[0];
> >               dst[-1] = b;
> >       } while (src != src_end);
> >       dst[0] = a;
> >
> > It is probably worth doing benchmarks of the copy loop
> > in userspace.
>
> I also don't know this microarchitecture, but this seems like a pretty
> wacky load-use delay.
>
> Can we split out the misaligned handling fix to get that in sooner?
> That's likely the more urgent issue.

Sure, just give me a few days.

Akira

>
> >
> >       David
> >
> > -
> > Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
> > Registration No: 1397386 (Wales)
Akira Tsukamoto June 16, 2021, 10:24 a.m. UTC | #5
On Sat, Jun 12, 2021 at 9:17 PM David Laight <David.Laight@aculab.com> wrote:
>
> From: Palmer Dabbelt
> > Sent: 12 June 2021 05:05
> ...
> > > I don't know the architecture, but unless there is a stunning
> > > pipeline delay for memory reads a simple interleaved copy
> > > may be fast enough.
> > > So something like:
> > >     a = src[0];
> > >     do {
> > >             b = src[1];
> > >             src += 2;
> > >             dst[0] = a;
> > >             dst += 2;
> > >             a = src[0];
> > >             dst[-1] = b;
> > >     } while (src != src_end);
> > >     dst[0] = a;
> > >
> > > It is probably worth doing benchmarks of the copy loop
> > > in userspace.
> >
> > I also don't know this microarchitecture, but this seems like a pretty
> > wacky load-use delay.
>
> It is quite sane really.
>
> While many cpus can use the result of the ALU in the next clock
> (there is typically special logic to bypass the write to the
> register file) this isn't always true for memory (cache) reads.
> It may even be that the read itself takes more than one cycle
> (probably pipelined so they can happen every cycle).
>
> So a simple '*dest = *src' copy loop suffers the 'memory read'
> penalty every iteration.
> An out-of-order execution unit that uses register renames
> (like most x86) will just defer the writes until the data
> is available - so it isn't impacted.
>
> Interleaving the reads and writes means you issue a read
> while waiting for the value from the previous read to
> get to the register file - and be available for the
> write instruction.
>
> Moving the 'src/dst += 2' into the loop gives a reasonable
> chance that they are executed in parallel with a memory
> access (on an in-order superscalar cpu) rather than bunching
> them up at the end where they start adding clocks.
>
> If your cpu can only do one memory read or one memory write
> per clock then you only need it to execute two instructions
> per clock for the loop above to run at maximum speed.
> Even with a 'read latency' of two clocks.
> (Especially since riscv has 'mips like' 'compare and branch'
> instructions that probably execute in 1 clock when predicted
> taken.)
>
> If the cpu can do a read and a write in one clock then the
> loop may still run at the maximum speed.
> For this to happen you do need the read data to be available
> next clock and to run load, store, add and compare instructions
> in a single clock.
> Without that much parallelism it might be necessary to add
> an extra read/write interleave (and maybe a 4th to avoid a
> divide by three).

This is becoming a computer architecture discussion; I agree that David's
simple interleaved copy would speed things up, for the same hardware
reasons.
I used to get this kind of confirmation from cpu designers when they were
working on the same floor.
I am fine either way. I used the simple unrolling just because all other
existing copy functions for riscv and other cpus do the same.

I am reluctant to port the C version of the interleaved memcpy to assembly.
As I wrote in the cover letter, assembler is used inside uaccess.S because
the page fault handling in __asm_to/copy_from_user() must be done manually
inside the functions.

Akira


>
> The 'elephant in the room' is a potential additional stall
> on reads if the previous cycle is a write to the same cache area.
> For instance the nios2 (a soft cpu for altera fpga) can do
> back to back reads or back to back writes, but since the reads
> are done speculatively (regardless of the opcode!) they have to
> be deferred when a write is using the memory block.
>
>         David
>
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
> Registration No: 1397386 (Wales)

Patch

diff --git a/arch/riscv/lib/uaccess.S b/arch/riscv/lib/uaccess.S
index fceaeb18cc64..2528a77709e1 100644
--- a/arch/riscv/lib/uaccess.S
+++ b/arch/riscv/lib/uaccess.S
@@ -19,50 +19,90 @@  ENTRY(__asm_copy_from_user)
     li t6, SR_SUM
     csrs CSR_STATUS, t6

-    add a3, a1, a2
+    move t5, a0  /* Preserve return value */
+
+    /* Defer to byte-oriented copy for small sizes */
+    sltiu a3, a2, 64
+    bnez a3, 4f
     /* Use word-oriented copy only if low-order bits match */
-    andi t0, a0, SZREG-1
-    andi t1, a1, SZREG-1
-    bne t0, t1, 2f
+    andi a3, t5, SZREG-1
+    andi a4, a1, SZREG-1
+    bne a3, a4, 4f

-    addi t0, a1, SZREG-1
-    andi t1, a3, ~(SZREG-1)
-    andi t0, t0, ~(SZREG-1)
+    beqz a3, 2f  /* Skip if already aligned */
     /*
-     * a3: terminal address of source region
-     * t0: lowest XLEN-aligned address in source
-     * t1: highest XLEN-aligned address in source
+     * Round to nearest double word-aligned address
+     * greater than or equal to start address
      */
-    bgeu t0, t1, 2f
-    bltu a1, t0, 4f
+    andi a3, a1, ~(SZREG-1)
+    addi a3, a3, SZREG
+    /* Handle initial misalignment */
+    sub a4, a3, a1
 1:
-    fixup REG_L, t2, (a1), 10f
-    fixup REG_S, t2, (a0), 10f
-    addi a1, a1, SZREG
-    addi a0, a0, SZREG
-    bltu a1, t1, 1b
-2:
-    bltu a1, a3, 5f
+    lb a5, 0(a1)
+    addi a1, a1, 1
+    sb a5, 0(t5)
+    addi t5, t5, 1
+    bltu a1, a3, 1b
+    sub a2, a2, a4  /* Update count */

+2:
+    andi a4, a2, ~((8*SZREG)-1)
+    beqz a4, 4f
+    add a3, a1, a4
 3:
+    fixup REG_L a4,       0(a1), 10f
+    fixup REG_L a5,   SZREG(a1), 10f
+    fixup REG_L a6, 2*SZREG(a1), 10f
+    fixup REG_L a7, 3*SZREG(a1), 10f
+    fixup REG_L t0, 4*SZREG(a1), 10f
+    fixup REG_L t1, 5*SZREG(a1), 10f
+    fixup REG_L t2, 6*SZREG(a1), 10f
+    fixup REG_L t3, 7*SZREG(a1), 10f
+    fixup REG_S a4,       0(t5), 10f
+    fixup REG_S a5,   SZREG(t5), 10f
+    fixup REG_S a6, 2*SZREG(t5), 10f
+    fixup REG_S a7, 3*SZREG(t5), 10f
+    fixup REG_S t0, 4*SZREG(t5), 10f
+    fixup REG_S t1, 5*SZREG(t5), 10f
+    fixup REG_S t2, 6*SZREG(t5), 10f
+    fixup REG_S t3, 7*SZREG(t5), 10f
+    addi a1, a1, 8*SZREG
+    addi t5, t5, 8*SZREG
+    bltu a1, a3, 3b
+    andi a2, a2, (8*SZREG)-1  /* Update count */
+
+4:
+    /* Handle trailing misalignment */
+    beqz a2, 6f
+    add a3, a1, a2
+
+    /* Use word-oriented copy if co-aligned to word boundary */
+    or a5, a1, t5
+    or a5, a5, a3
+    andi a5, a5, 3
+    bnez a5, 5f
+7:
+    fixup lw a4, 0(a1), 10f
+    addi a1, a1, 4
+    fixup sw a4, 0(t5), 10f
+    addi t5, t5, 4
+    bltu a1, a3, 7b
+
+    j 6f
+
+5:
+    fixup lb a4, 0(a1), 10f
+    addi a1, a1, 1
+    fixup sb a4, 0(t5), 10f
+    addi t5, t5, 1
+    bltu a1, a3, 5b
+
+6:
     /* Disable access to user memory */
     csrc CSR_STATUS, t6
     li a0, 0
     ret
-4: /* Edge case: unalignment */
-    fixup lbu, t2, (a1), 10f
-    fixup sb, t2, (a0), 10f
-    addi a1, a1, 1
-    addi a0, a0, 1
-    bltu a1, t0, 4b
-    j 1b
-5: /* Edge case: remainder */
-    fixup lbu, t2, (a1), 10f
-    fixup sb, t2, (a0), 10f
-    addi a1, a1, 1
-    addi a0, a0, 1
-    bltu a1, a3, 5b
-    j 3b
 ENDPROC(__asm_copy_to_user)
 ENDPROC(__asm_copy_from_user)
 EXPORT_SYMBOL(__asm_copy_to_user)