diff mbox series

arm64: enable GENERIC_FIND_FIRST_BIT

Message ID 20201205165406.108990-1-yury.norov@gmail.com (mailing list archive)
State New, archived
Headers show
Series arm64: enable GENERIC_FIND_FIRST_BIT | expand

Commit Message

Yury Norov Dec. 5, 2020, 4:54 p.m. UTC
ARM64 doesn't implement find_first_{zero}_bit in arch code and doesn't
enable it in config. It leads to using find_next_bit() which is less
efficient:

0000000000000000 <find_first_bit>:
   0:	aa0003e4 	mov	x4, x0
   4:	aa0103e0 	mov	x0, x1
   8:	b4000181 	cbz	x1, 38 <find_first_bit+0x38>
   c:	f9400083 	ldr	x3, [x4]
  10:	d2800802 	mov	x2, #0x40                  	// #64
  14:	91002084 	add	x4, x4, #0x8
  18:	b40000c3 	cbz	x3, 30 <find_first_bit+0x30>
  1c:	14000008 	b	3c <find_first_bit+0x3c>
  20:	f8408483 	ldr	x3, [x4], #8
  24:	91010045 	add	x5, x2, #0x40
  28:	b50000c3 	cbnz	x3, 40 <find_first_bit+0x40>
  2c:	aa0503e2 	mov	x2, x5
  30:	eb02001f 	cmp	x0, x2
  34:	54ffff68 	b.hi	20 <find_first_bit+0x20>  // b.pmore
  38:	d65f03c0 	ret
  3c:	d2800002 	mov	x2, #0x0                   	// #0
  40:	dac00063 	rbit	x3, x3
  44:	dac01063 	clz	x3, x3
  48:	8b020062 	add	x2, x3, x2
  4c:	eb02001f 	cmp	x0, x2
  50:	9a829000 	csel	x0, x0, x2, ls  // ls = plast
  54:	d65f03c0 	ret

  ...

0000000000000118 <_find_next_bit.constprop.1>:
 118:	eb02007f 	cmp	x3, x2
 11c:	540002e2 	b.cs	178 <_find_next_bit.constprop.1+0x60>  // b.hs, b.nlast
 120:	d346fc66 	lsr	x6, x3, #6
 124:	f8667805 	ldr	x5, [x0, x6, lsl #3]
 128:	b4000061 	cbz	x1, 134 <_find_next_bit.constprop.1+0x1c>
 12c:	f8667826 	ldr	x6, [x1, x6, lsl #3]
 130:	8a0600a5 	and	x5, x5, x6
 134:	ca0400a6 	eor	x6, x5, x4
 138:	92800005 	mov	x5, #0xffffffffffffffff    	// #-1
 13c:	9ac320a5 	lsl	x5, x5, x3
 140:	927ae463 	and	x3, x3, #0xffffffffffffffc0
 144:	ea0600a5 	ands	x5, x5, x6
 148:	54000120 	b.eq	16c <_find_next_bit.constprop.1+0x54>  // b.none
 14c:	1400000e 	b	184 <_find_next_bit.constprop.1+0x6c>
 150:	d346fc66 	lsr	x6, x3, #6
 154:	f8667805 	ldr	x5, [x0, x6, lsl #3]
 158:	b4000061 	cbz	x1, 164 <_find_next_bit.constprop.1+0x4c>
 15c:	f8667826 	ldr	x6, [x1, x6, lsl #3]
 160:	8a0600a5 	and	x5, x5, x6
 164:	eb05009f 	cmp	x4, x5
 168:	540000c1 	b.ne	180 <_find_next_bit.constprop.1+0x68>  // b.any
 16c:	91010063 	add	x3, x3, #0x40
 170:	eb03005f 	cmp	x2, x3
 174:	54fffee8 	b.hi	150 <_find_next_bit.constprop.1+0x38>  // b.pmore
 178:	aa0203e0 	mov	x0, x2
 17c:	d65f03c0 	ret
 180:	ca050085 	eor	x5, x4, x5
 184:	dac000a5 	rbit	x5, x5
 188:	dac010a5 	clz	x5, x5
 18c:	8b0300a3 	add	x3, x5, x3
 190:	eb03005f 	cmp	x2, x3
 194:	9a839042 	csel	x2, x2, x3, ls  // ls = plast
 198:	aa0203e0 	mov	x0, x2
 19c:	d65f03c0 	ret

 ...

0000000000000238 <find_next_bit>:
 238:	a9bf7bfd 	stp	x29, x30, [sp, #-16]!
 23c:	aa0203e3 	mov	x3, x2
 240:	d2800004 	mov	x4, #0x0                   	// #0
 244:	aa0103e2 	mov	x2, x1
 248:	910003fd 	mov	x29, sp
 24c:	d2800001 	mov	x1, #0x0                   	// #0
 250:	97ffffb2 	bl	118 <_find_next_bit.constprop.1>
 254:	a8c17bfd 	ldp	x29, x30, [sp], #16
 258:	d65f03c0 	ret

Enabling this functions would also benefit for_each_{set,clear}_bit().
Would it make sense to enable this config for all such architectures by
default?

Signed-off-by: Yury Norov <yury.norov@gmail.com>

---
 arch/arm64/Kconfig | 1 +
 1 file changed, 1 insertion(+)

Comments

Will Deacon Dec. 7, 2020, 11:25 a.m. UTC | #1
On Sat, Dec 05, 2020 at 08:54:06AM -0800, Yury Norov wrote:
> ARM64 doesn't implement find_first_{zero}_bit in arch code and doesn't
> enable it in config. It leads to using find_next_bit() which is less
> efficient:

[...]

> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 1515f6f153a0..2b90ef1f548e 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -106,6 +106,7 @@ config ARM64
>  	select GENERIC_CPU_AUTOPROBE
>  	select GENERIC_CPU_VULNERABILITIES
>  	select GENERIC_EARLY_IOREMAP
> +	select GENERIC_FIND_FIRST_BIT

Does this actually make any measurable difference? The disassembly with
or without this is _very_ similar for me (clang 11).

Will
Yury Norov Dec. 8, 2020, 1:59 a.m. UTC | #2
(CC: Alexey Klimov)

On Mon, Dec 7, 2020 at 3:25 AM Will Deacon <will@kernel.org> wrote:
>
> On Sat, Dec 05, 2020 at 08:54:06AM -0800, Yury Norov wrote:
> > ARM64 doesn't implement find_first_{zero}_bit in arch code and doesn't
> > enable it in config. It leads to using find_next_bit() which is less
> > efficient:
>
> [...]
>
> > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> > index 1515f6f153a0..2b90ef1f548e 100644
> > --- a/arch/arm64/Kconfig
> > +++ b/arch/arm64/Kconfig
> > @@ -106,6 +106,7 @@ config ARM64
> >       select GENERIC_CPU_AUTOPROBE
> >       select GENERIC_CPU_VULNERABILITIES
> >       select GENERIC_EARLY_IOREMAP
> > +     select GENERIC_FIND_FIRST_BIT
>
> Does this actually make any measurable difference? The disassembly with
> or without this is _very_ similar for me (clang 11).
>
> Will

On A-53 find_first_bit() is almost twice faster than find_next_bit(),
according to
lib/find_bit_benchmark. (Thanks to Alexey for testing.)

Yury

---

Tested-by: Alexey Klimov <aklimov@redhat.com>

Start testing find_bit() with random-filled bitmap
[7126084.864616] find_next_bit:                 9653351 ns, 164280 iterations
[7126084.881146] find_next_zero_bit:            9591974 ns, 163401 iterations
[7126084.893859] find_last_bit:                 5778627 ns, 164280 iterations
[7126084.948181] find_first_bit:               47389224 ns,  16357 iterations
[7126084.958975] find_next_and_bit:             3875849 ns,  73487 iterations
[7126084.965884]
                 Start testing find_bit() with sparse bitmap
[7126084.973474] find_next_bit:                  109879 ns,    655 iterations
[7126084.999365] find_next_zero_bit:           18968440 ns, 327026 iterations
[7126085.006351] find_last_bit:                   80503 ns,    655 iterations
[7126085.032315] find_first_bit:               19048193 ns,    655 iterations
[7126085.039303] find_next_and_bit:               82628 ns,      1 iterations

with enabled GENERIC_FIND_FIRST_BIT:

               Start testing find_bit() with random-filled bitmap
[   84.095335] find_next_bit:                 9600970 ns, 163770 iterations
[   84.111695] find_next_zero_bit:            9613137 ns, 163911 iterations
[   84.124143] find_last_bit:                 5713907 ns, 163770 iterations
[   84.158068] find_first_bit:               27193319 ns,  16406 iterations
[   84.168663] find_next_and_bit:             3863814 ns,  73671 iterations
[   84.175392]
               Start testing find_bit() with sparse bitmap
[   84.182660] find_next_bit:                  112334 ns,    656 iterations
[   84.208375] find_next_zero_bit:           18976981 ns, 327025 iterations
[   84.215184] find_last_bit:                   79584 ns,    656 iterations
[   84.233005] find_first_bit:               11082437 ns,    656 iterations
[   84.239821] find_next_and_bit:               82209 ns,      1 iterations

root@pine:~# cpupower -c all frequency-info | grep asserted
  current CPU frequency: 648 MHz (asserted by call to hardware)
  current CPU frequency: 648 MHz (asserted by call to hardware)
  current CPU frequency: 648 MHz (asserted by call to hardware)
  current CPU frequency: 648 MHz (asserted by call to hardware)
root@pine:~# lscpu
Architecture:                    aarch64
CPU op-mode(s):                  32-bit, 64-bit
Byte Order:                      Little Endian
CPU(s):                          4
On-line CPU(s) list:             0-3
Thread(s) per core:              1
Core(s) per socket:              4
Socket(s):                       1
Vendor ID:                       ARM
Model:                           4
Model name:                      Cortex-A53
Stepping:                        r0p4
CPU max MHz:                     1152.0000
CPU min MHz:                     648.0000
BogoMIPS:                        48.00
Vulnerability Itlb multihit:     Not affected
Vulnerability L1tf:              Not affected
Vulnerability Mds:               Not affected
Vulnerability Meltdown:          Not affected
Vulnerability Spec store bypass: Not affected
Vulnerability Spectre v1:        Mitigation; __user pointer sanitization
Vulnerability Spectre v2:        Not affected
Vulnerability Srbds:             Not affected
Vulnerability Tsx async abort:   Not affected
Flags:                           fp asimd evtstrm aes pmull sha1 sha2
crc32 cpuid
Will Deacon Dec. 8, 2020, 10:35 a.m. UTC | #3
On Mon, Dec 07, 2020 at 05:59:16PM -0800, Yury Norov wrote:
> (CC: Alexey Klimov)
> 
> On Mon, Dec 7, 2020 at 3:25 AM Will Deacon <will@kernel.org> wrote:
> >
> > On Sat, Dec 05, 2020 at 08:54:06AM -0800, Yury Norov wrote:
> > > ARM64 doesn't implement find_first_{zero}_bit in arch code and doesn't
> > > enable it in config. It leads to using find_next_bit() which is less
> > > efficient:
> >
> > [...]
> >
> > > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> > > index 1515f6f153a0..2b90ef1f548e 100644
> > > --- a/arch/arm64/Kconfig
> > > +++ b/arch/arm64/Kconfig
> > > @@ -106,6 +106,7 @@ config ARM64
> > >       select GENERIC_CPU_AUTOPROBE
> > >       select GENERIC_CPU_VULNERABILITIES
> > >       select GENERIC_EARLY_IOREMAP
> > > +     select GENERIC_FIND_FIRST_BIT
> >
> > Does this actually make any measurable difference? The disassembly with
> > or without this is _very_ similar for me (clang 11).
> >
> > Will
> 
> On A-53 find_first_bit() is almost twice faster than find_next_bit(),
> according to
> lib/find_bit_benchmark. (Thanks to Alexey for testing.)

I guess it's more compiler dependent than anything else, and it's a pity
that find_next_bit() isn't implemented in terms of the generic
find_first_bit() tbh, but if the numbers are as you suggest then I don't
have a problem selecting this on arm64.

Will
diff mbox series

Patch

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 1515f6f153a0..2b90ef1f548e 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -106,6 +106,7 @@  config ARM64
 	select GENERIC_CPU_AUTOPROBE
 	select GENERIC_CPU_VULNERABILITIES
 	select GENERIC_EARLY_IOREMAP
+	select GENERIC_FIND_FIRST_BIT
 	select GENERIC_IDLE_POLL_SETUP
 	select GENERIC_IRQ_IPI
 	select GENERIC_IRQ_MULTI_HANDLER