diff mbox

[V3] lib: GCD: add binary GCD algorithm

Message ID 1461843824-19853-1-git-send-email-zengzhaoxiu@163.com (mailing list archive)
State New, archived
Headers show

Commit Message

zengzhaoxiu@163.com April 28, 2016, 11:43 a.m. UTC
From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>

Because some architectures (alpha, armv6, etc.) don't provide hardware division,
the mod operation is slow! Binary GCD algorithm uses simple arithmetic operations,
it replaces division with arithmetic shifts, comparisons, and subtractions.

I have compiled successfully with x86_64_defconfig and i386_defconfig.

Changes to V2:
- Add a new Kconfig variable CPU_NO_EFFICIENT_FFS
- Separate into two versions by CPU_NO_EFFICIENT_FFS
- Return directly from the loop, rather than using break().
- Use "r &= -r" mostly because it's clearer.
- Improve a little bit in even/odd version

Changes to V1:
- Don't touch Kconfig, remove the Euclidean algorithm implementation
- Don't use the "even-odd" variant
- Use __ffs if the CPU has efficient __ffs

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
Signed-off-by: George Spelvin <linux@horizon.com>
---
 arch/Kconfig                         |  3 ++
 arch/alpha/Kconfig                   |  1 +
 arch/arm/mm/Kconfig                  |  3 ++
 arch/h8300/Kconfig                   |  1 +
 arch/m32r/Kconfig                    |  1 +
 arch/m68k/Kconfig.cpu                | 11 ++++++
 arch/metag/Kconfig                   |  1 +
 arch/microblaze/Kconfig              |  1 +
 arch/mips/include/asm/cpu-features.h |  3 ++
 arch/nios2/Kconfig                   |  1 +
 arch/openrisc/Kconfig                |  1 +
 arch/parisc/Kconfig                  |  1 +
 arch/score/Kconfig                   |  1 +
 arch/sh/Kconfig                      |  1 +
 arch/sparc/Kconfig                   |  1 +
 lib/gcd.c                            | 66 +++++++++++++++++++++++++++++++-----
 16 files changed, 88 insertions(+), 9 deletions(-)

Comments

kernel test robot April 28, 2016, 12:18 p.m. UTC | #1
Hi,

[auto build test ERROR on v4.6-rc5]
[cannot apply to next-20160428]
[if your patch is applied to the wrong git tree, please drop us a note to help improving the system]

url:    https://github.com/0day-ci/linux/commits/zengzhaoxiu-163-com/lib-GCD-add-binary-GCD-algorithm/20160428-195527
config: mips-allyesconfig (attached as .config)
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=mips 

All error/warnings (new ones prefixed by >>):

   In file included from arch/mips/include/asm/bitops.h:21:0,
                    from include/linux/bitops.h:36,
                    from include/linux/kernel.h:10,
                    from include/asm-generic/bug.h:13,
                    from arch/mips/include/asm/bug.h:41,
                    from include/linux/bug.h:4,
                    from include/linux/page-flags.h:9,
                    from kernel/bounds.c:9:
>> arch/mips/include/asm/cpu-features.h:205:28: warning: "cpu_data" is not defined [-Wundef]
    # define cpu_has_mips32r6 (cpu_data[0].isa_level & MIPS_CPU_ISA_M32R6)
                               ^
>> arch/mips/include/asm/cpu-features.h:241:5: note: in expansion of macro 'cpu_has_mips32r6'
        cpu_has_mips32r6 | cpu_has_mips64r1 | \
        ^
>> arch/mips/include/asm/cpu-features.h:289:25: note: in expansion of macro 'cpu_has_mips_r'
    #define cpu_has_clo_clz cpu_has_mips_r
                            ^
>> arch/mips/include/asm/cpu-features.h:291:6: note: in expansion of macro 'cpu_has_clo_clz'
    #if !cpu_has_clo_clz
         ^
>> arch/mips/include/asm/cpu-features.h:205:36: error: token "[" is not valid in preprocessor expressions
    # define cpu_has_mips32r6 (cpu_data[0].isa_level & MIPS_CPU_ISA_M32R6)
                                       ^
>> arch/mips/include/asm/cpu-features.h:241:5: note: in expansion of macro 'cpu_has_mips32r6'
        cpu_has_mips32r6 | cpu_has_mips64r1 | \
        ^
>> arch/mips/include/asm/cpu-features.h:289:25: note: in expansion of macro 'cpu_has_mips_r'
    #define cpu_has_clo_clz cpu_has_mips_r
                            ^
>> arch/mips/include/asm/cpu-features.h:291:6: note: in expansion of macro 'cpu_has_clo_clz'
    #if !cpu_has_clo_clz
         ^
   make[2]: *** [kernel/bounds.s] Error 1
   make[2]: Target '__build' not remade because of errors.
   make[1]: *** [prepare0] Error 2
   make[1]: Target 'prepare' not remade because of errors.
   make: *** [sub-make] Error 2

vim +205 arch/mips/include/asm/cpu-features.h

0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  199  # define cpu_has_mips32r1	(cpu_data[0].isa_level & MIPS_CPU_ISA_M32R1)
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  200  #endif
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  201  #ifndef cpu_has_mips32r2
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  202  # define cpu_has_mips32r2	(cpu_data[0].isa_level & MIPS_CPU_ISA_M32R2)
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  203  #endif
34c56fc1c arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  204  #ifndef cpu_has_mips32r6
34c56fc1c arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13 @205  # define cpu_has_mips32r6	(cpu_data[0].isa_level & MIPS_CPU_ISA_M32R6)
34c56fc1c arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  206  #endif
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  207  #ifndef cpu_has_mips64r1
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  208  # define cpu_has_mips64r1	(cpu_data[0].isa_level & MIPS_CPU_ISA_M64R1)
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  209  #endif
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  210  #ifndef cpu_has_mips64r2
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  211  # define cpu_has_mips64r2	(cpu_data[0].isa_level & MIPS_CPU_ISA_M64R2)
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  212  #endif
34c56fc1c arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  213  #ifndef cpu_has_mips64r6
34c56fc1c arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  214  # define cpu_has_mips64r6	(cpu_data[0].isa_level & MIPS_CPU_ISA_M64R6)
34c56fc1c arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  215  #endif
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  216  
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  217  /*
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  218   * Shortcuts ...
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  219   */
08a07904e arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  220  #define cpu_has_mips_2_3_4_5	(cpu_has_mips_2 | cpu_has_mips_3_4_5)
08a07904e arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  221  #define cpu_has_mips_3_4_5	(cpu_has_mips_3 | cpu_has_mips_4_5)
08a07904e arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  222  #define cpu_has_mips_4_5	(cpu_has_mips_4 | cpu_has_mips_5)
08a07904e arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  223  
08a07904e arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  224  #define cpu_has_mips_2_3_4_5_r	(cpu_has_mips_2 | cpu_has_mips_3_4_5_r)
08a07904e arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  225  #define cpu_has_mips_3_4_5_r	(cpu_has_mips_3 | cpu_has_mips_4_5_r)
08a07904e arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  226  #define cpu_has_mips_4_5_r	(cpu_has_mips_4 | cpu_has_mips_5_r)
08a07904e arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  227  #define cpu_has_mips_5_r	(cpu_has_mips_5 | cpu_has_mips_r)
08a07904e arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  228  
2d83fea78 arch/mips/include/asm/cpu-features.h Maciej W. Rozycki 2015-04-03  229  #define cpu_has_mips_3_4_5_64_r2_r6					\
2d83fea78 arch/mips/include/asm/cpu-features.h Maciej W. Rozycki 2015-04-03  230  				(cpu_has_mips_3 | cpu_has_mips_4_5_64_r2_r6)
2d83fea78 arch/mips/include/asm/cpu-features.h Maciej W. Rozycki 2015-04-03  231  #define cpu_has_mips_4_5_64_r2_r6					\
2d83fea78 arch/mips/include/asm/cpu-features.h Maciej W. Rozycki 2015-04-03  232  				(cpu_has_mips_4_5 | cpu_has_mips64r1 |	\
2d83fea78 arch/mips/include/asm/cpu-features.h Maciej W. Rozycki 2015-04-03  233  				 cpu_has_mips_r2 | cpu_has_mips_r6)
08a07904e arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  234  
34c56fc1c arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  235  #define cpu_has_mips32	(cpu_has_mips32r1 | cpu_has_mips32r2 | cpu_has_mips32r6)
34c56fc1c arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  236  #define cpu_has_mips64	(cpu_has_mips64r1 | cpu_has_mips64r2 | cpu_has_mips64r6)
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  237  #define cpu_has_mips_r1 (cpu_has_mips32r1 | cpu_has_mips64r1)
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  238  #define cpu_has_mips_r2 (cpu_has_mips32r2 | cpu_has_mips64r2)
34c56fc1c arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  239  #define cpu_has_mips_r6	(cpu_has_mips32r6 | cpu_has_mips64r6)
c46b302b9 arch/mips/include/asm/cpu-features.h Ralf Baechle      2008-10-28  240  #define cpu_has_mips_r	(cpu_has_mips32r1 | cpu_has_mips32r2 | \
34c56fc1c arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13 @241  			 cpu_has_mips32r6 | cpu_has_mips64r1 | \
34c56fc1c arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  242  			 cpu_has_mips64r2 | cpu_has_mips64r6)
34c56fc1c arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  243  
34c56fc1c arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  244  /* MIPSR2 and MIPSR6 have a lot of similarities */
34c56fc1c arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  245  #define cpu_has_mips_r2_r6	(cpu_has_mips_r2 | cpu_has_mips_r6)
0401572a9 include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  246  
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  247  /*
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  248   * cpu_has_mips_r2_exec_hazard - return if IHB is required on current processor
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  249   *
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  250   * Returns non-zero value if the current processor implementation requires
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  251   * an IHB instruction to deal with an instruction hazard as per MIPS R2
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  252   * architecture specification, zero otherwise.
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  253   */
41f0e4d04 arch/mips/include/asm/cpu-features.h David Daney       2009-05-12  254  #ifndef cpu_has_mips_r2_exec_hazard
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  255  #define cpu_has_mips_r2_exec_hazard					\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  256  ({									\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  257  	int __res;							\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  258  									\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  259  	switch (current_cpu_type()) {					\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  260  	case CPU_M14KC:							\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  261  	case CPU_74K:							\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  262  	case CPU_1074K:							\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  263  	case CPU_PROAPTIV:						\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  264  	case CPU_P5600:							\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  265  	case CPU_M5150:							\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  266  	case CPU_QEMU_GENERIC:						\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  267  	case CPU_CAVIUM_OCTEON:						\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  268  	case CPU_CAVIUM_OCTEON_PLUS:					\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  269  	case CPU_CAVIUM_OCTEON2:					\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  270  	case CPU_CAVIUM_OCTEON3:					\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  271  		__res = 0;						\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  272  		break;							\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  273  									\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  274  	default:							\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  275  		__res = 1;						\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  276  	}								\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  277  									\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  278  	__res;								\
9cdf30bd3 arch/mips/include/asm/cpu-features.h Ralf Baechle      2015-03-25  279  })
41f0e4d04 arch/mips/include/asm/cpu-features.h David Daney       2009-05-12  280  #endif
41f0e4d04 arch/mips/include/asm/cpu-features.h David Daney       2009-05-12  281  
47740eb88 arch/mips/include/asm/cpu-features.h Ralf Baechle      2009-04-19  282  /*
47740eb88 arch/mips/include/asm/cpu-features.h Ralf Baechle      2009-04-19  283   * MIPS32, MIPS64, VR5500, IDT32332, IDT32334 and maybe a few other
becee6b8c arch/mips/include/asm/cpu-features.h Maciej W. Rozycki 2013-09-22  284   * pre-MIPS32/MIPS64 processors have CLO, CLZ.	The IDT RC64574 is 64-bit and
417a5eb02 arch/mips/include/asm/cpu-features.h Ralf Baechle      2010-08-05  285   * has CLO and CLZ but not DCLO nor DCLZ.  For 64-bit kernels
47740eb88 arch/mips/include/asm/cpu-features.h Ralf Baechle      2009-04-19  286   * cpu_has_clo_clz also indicates the availability of DCLO and DCLZ.
47740eb88 arch/mips/include/asm/cpu-features.h Ralf Baechle      2009-04-19  287   */
47740eb88 arch/mips/include/asm/cpu-features.h Ralf Baechle      2009-04-19  288  #ifndef cpu_has_clo_clz
47740eb88 arch/mips/include/asm/cpu-features.h Ralf Baechle      2009-04-19 @289  #define cpu_has_clo_clz	cpu_has_mips_r
47740eb88 arch/mips/include/asm/cpu-features.h Ralf Baechle      2009-04-19  290  #endif
35e1a24e8 arch/mips/include/asm/cpu-features.h Zhaoxiu Zeng      2016-04-28 @291  #if !cpu_has_clo_clz
35e1a24e8 arch/mips/include/asm/cpu-features.h Zhaoxiu Zeng      2016-04-28  292  #define CONFIG_CPU_NO_EFFICIENT_FFS 1
35e1a24e8 arch/mips/include/asm/cpu-features.h Zhaoxiu Zeng      2016-04-28  293  #endif
47740eb88 arch/mips/include/asm/cpu-features.h Ralf Baechle      2009-04-19  294  

:::::: The code at line 205 was first introduced by commit
:::::: 34c56fc1c167facc375d927687df0a3891d164ac MIPS: asm: cpu: Add MIPSR6 ISA definitions

:::::: TO: Leonid Yegoshin <Leonid.Yegoshin@imgtec.com>
:::::: CC: Markos Chandras <markos.chandras@imgtec.com>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Josh Juran April 28, 2016, 5:10 p.m. UTC | #2
On Apr 28, 2016, at 7:43 AM, zengzhaoxiu@163.com wrote:

> + * This implements the binary GCD algorithm. (Often attributed to Stein,
> + * but as Knuth has noted, appears a first-century Chinese math text.)

Should this be "appears in a"?

Josh

--
To unsubscribe from this list: send the line "unsubscribe linux-sh" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley April 28, 2016, 5:22 p.m. UTC | #3
On Thu, 2016-04-28 at 19:43 +0800, zengzhaoxiu@163.com wrote:
> From: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
> 
> Because some architectures (alpha, armv6, etc.) don't provide 
> hardware division, the mod operation is slow! Binary GCD algorithm 
> uses simple arithmetic operations, it replaces division with 
> arithmetic shifts, comparisons, and subtractions.
> 
> I have compiled successfully with x86_64_defconfig and 
> i386_defconfig.

What's the reason for wanting to optimize this and thus have to
maintain (and test) two separate code paths, which is a significant
expense? As far as I can see, gcd() is mosly used in finding optimal
clocks for operations, which is usually done at start of day and not
time critical.

James


--
To unsubscribe from this list: send the line "unsubscribe linux-sh" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Geert Uytterhoeven April 28, 2016, 5:51 p.m. UTC | #4
On Thu, Apr 28, 2016 at 6:48 PM, George Spelvin <linux@horizon.com> wrote:
> Another few comments:
>
> 1. Would ARCH_HAS_FAST_FFS involve fewer changes than CPU_NO_EFFICIENT_FFS?

No, as you want to _disable_ ARCH_HAS_FAST_FFS / _enable_
CPU_NO_EFFICIENT_FFS as soon as you're enabling support for a
CPU that doesn't support it.

Logical OR is easier in both the Kconfig and C preprocessor languages
than logical NAND.

E.g. in Kconfig, a CPU core not supporting it can just select
CPU_NO_EFFICIENT_FFS.

Gr{oetje,eeting}s,

                        Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds
--
To unsubscribe from this list: send the line "unsubscribe linux-sh" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
dalias@libc.org April 28, 2016, 5:58 p.m. UTC | #5
On Thu, Apr 28, 2016 at 07:51:06PM +0200, Geert Uytterhoeven wrote:
> On Thu, Apr 28, 2016 at 6:48 PM, George Spelvin <linux@horizon.com> wrote:
> > Another few comments:
> >
> > 1. Would ARCH_HAS_FAST_FFS involve fewer changes than CPU_NO_EFFICIENT_FFS?
> 
> No, as you want to _disable_ ARCH_HAS_FAST_FFS / _enable_
> CPU_NO_EFFICIENT_FFS as soon as you're enabling support for a
> CPU that doesn't support it.
> 
> Logical OR is easier in both the Kconfig and C preprocessor languages
> than logical NAND.
> 
> E.g. in Kconfig, a CPU core not supporting it can just select
> CPU_NO_EFFICIENT_FFS.

How does a CPU lack an efficient ffs/ctz anyway? There are all sorts
of ways to implement it without a native insn, some of which are
almost or just as fast as the native insn on cpus that have the
latter. On anything with a fast multiply, the de Bruijn sequence
approach is near-optimal, and otherwise one of the binary-search type
approaches (possibly branchless) can be used. If the compiler doesn't
generate an appropriate one for __builtin_ctz, that's arguably a
compiler bug.

Rich
--
To unsubscribe from this list: send the line "unsubscribe linux-sh" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Geert Uytterhoeven April 28, 2016, 6:11 p.m. UTC | #6
On Thu, Apr 28, 2016 at 7:58 PM, Rich Felker <dalias@libc.org> wrote:
> On Thu, Apr 28, 2016 at 07:51:06PM +0200, Geert Uytterhoeven wrote:
>> On Thu, Apr 28, 2016 at 6:48 PM, George Spelvin <linux@horizon.com> wrote:
>> > Another few comments:
>> >
>> > 1. Would ARCH_HAS_FAST_FFS involve fewer changes than CPU_NO_EFFICIENT_FFS?
>>
>> No, as you want to _disable_ ARCH_HAS_FAST_FFS / _enable_
>> CPU_NO_EFFICIENT_FFS as soon as you're enabling support for a
>> CPU that doesn't support it.
>>
>> Logical OR is easier in both the Kconfig and C preprocessor languages
>> than logical NAND.
>>
>> E.g. in Kconfig, a CPU core not supporting it can just select
>> CPU_NO_EFFICIENT_FFS.
>
> How does a CPU lack an efficient ffs/ctz anyway? There are all sorts
> of ways to implement it without a native insn, some of which are
> almost or just as fast as the native insn on cpus that have the
> latter. On anything with a fast multiply, the de Bruijn sequence
> approach is near-optimal, and otherwise one of the binary-search type
> approaches (possibly branchless) can be used. If the compiler doesn't
> generate an appropriate one for __builtin_ctz, that's arguably a
> compiler bug.

m68k-linux-gcc 4.6.3 generates:

        jsr __ctzsi2

Gr{oetje,eeting}s,

                        Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds
--
To unsubscribe from this list: send the line "unsubscribe linux-sh" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
George Spelvin April 28, 2016, 7:15 p.m. UTC | #7
> How does a CPU lack an efficient ffs/ctz anyway? There are all sorts
> of ways to implement it without a native insn, some of which are
> almost or just as fast as the native insn on cpus that have the
> latter. On anything with a fast multiply, the de Bruijn sequence
> approach is near-optimal, and otherwise one of the binary-search type
> approaches (possibly branchless) can be used. If the compiler doesn't
> generate an appropriate one for __builtin_ctz, that's arguably a
> compiler bug.

What's wanted here is something faster than any of those.
Yes, there's a simple constant-time branch-free implementation:

unsigned inline __attribute__((const))
hweight32(uint32_t x)
{
	x -= (x >> 1) & 0x55555555;
	x  = ((x >> 2) & 0x33333333) + (x & 0x33333333);
	x += x >> 4;
	x &= 0x0f0f0f0f;
	x += x >> 8;
	x += x >> 16;
	return x & 63;
}

unsigned inline __attribute__((const))
__ffs32(uint32_t x)
{
	return hweight(~x & (x-1));
}

but if you work it through, that's about 19 instructions; a few more on
platforms without 32-bit immediates.  The shift itself makes an even 20,
and there are a lot of sequential dependencies (I count a 17-op chain
including the shift) limiting execution time.

The de Bruijn hack reduces the length but adds a memory access for
the table lookup.  (http://supertech.csail.mit.edu/papers/debruijn.pdf)

In the GCD code, the number to normalize is basically random, so the
normalization loop shifts an average of 1 bit.  One bit half the time,
a second bit 1/4 of the time, etc.

(The posted code in the FAST_FFS case omits one guaranteed shift at the
end of the loop because the normalization code is constant-time.)

So "fast __ffs" basically means faster than *one* iteration of
"while (!(x & 1)) x >>= 1;".

In this case "fast" means cheaper than *one* unpredictable branch, which
is a very small handful of instructions.
--
To unsubscribe from this list: send the line "unsubscribe linux-sh" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
kernel test robot May 20, 2016, 10:27 a.m. UTC | #8
Hi,

[auto build test ERROR on v4.6-rc5]
[cannot apply to next-20160519]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/zengzhaoxiu-163-com/lib-GCD-add-binary-GCD-algorithm/20160428-195527
config: mips-jz4740 (attached as .config)
compiler: mips-linux-gnu-gcc (Debian 5.3.1-8) 5.3.1 20160205
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=mips 

Note: the linux-review/zengzhaoxiu-163-com/lib-GCD-add-binary-GCD-algorithm/20160428-195527 HEAD 35e1a24e8fc3a5308b053ed3c744f02ec2a76820 builds fine.
      It only hurts bisectibility.

All errors (new ones prefixed by >>):

   In file included from arch/mips/include/asm/bitops.h:21:0,
                    from include/linux/bitops.h:36,
                    from include/linux/kernel.h:10,
                    from include/asm-generic/bug.h:13,
                    from arch/mips/include/asm/bug.h:41,
                    from include/linux/bug.h:4,
                    from include/linux/page-flags.h:9,
                    from kernel/bounds.c:9:
   arch/mips/include/asm/cpu-features.h:205:28: warning: "cpu_data" is not defined [-Wundef]
    # define cpu_has_mips32r6 (cpu_data[0].isa_level & MIPS_CPU_ISA_M32R6)
                               ^
   arch/mips/include/asm/cpu-features.h:241:5: note: in expansion of macro 'cpu_has_mips32r6'
        cpu_has_mips32r6 | cpu_has_mips64r1 | \
        ^
   arch/mips/include/asm/cpu-features.h:289:25: note: in expansion of macro 'cpu_has_mips_r'
    #define cpu_has_clo_clz cpu_has_mips_r
                            ^
   arch/mips/include/asm/cpu-features.h:291:6: note: in expansion of macro 'cpu_has_clo_clz'
    #if !cpu_has_clo_clz
         ^
>> arch/mips/include/asm/cpu-features.h:205:36: error: token "[" is not valid in preprocessor expressions
    # define cpu_has_mips32r6 (cpu_data[0].isa_level & MIPS_CPU_ISA_M32R6)
                                       ^
   arch/mips/include/asm/cpu-features.h:241:5: note: in expansion of macro 'cpu_has_mips32r6'
        cpu_has_mips32r6 | cpu_has_mips64r1 | \
        ^
   arch/mips/include/asm/cpu-features.h:289:25: note: in expansion of macro 'cpu_has_mips_r'
    #define cpu_has_clo_clz cpu_has_mips_r
                            ^
   arch/mips/include/asm/cpu-features.h:291:6: note: in expansion of macro 'cpu_has_clo_clz'
    #if !cpu_has_clo_clz
         ^
   make[2]: *** [kernel/bounds.s] Error 1
   make[2]: Target '__build' not remade because of errors.
   make[1]: *** [prepare0] Error 2
   make[1]: Target 'prepare' not remade because of errors.
   make: *** [sub-make] Error 2

vim +205 arch/mips/include/asm/cpu-features.h

0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  199  # define cpu_has_mips32r1	(cpu_data[0].isa_level & MIPS_CPU_ISA_M32R1)
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  200  #endif
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  201  #ifndef cpu_has_mips32r2
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  202  # define cpu_has_mips32r2	(cpu_data[0].isa_level & MIPS_CPU_ISA_M32R2)
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  203  #endif
34c56fc1 arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  204  #ifndef cpu_has_mips32r6
34c56fc1 arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13 @205  # define cpu_has_mips32r6	(cpu_data[0].isa_level & MIPS_CPU_ISA_M32R6)
34c56fc1 arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  206  #endif
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  207  #ifndef cpu_has_mips64r1
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  208  # define cpu_has_mips64r1	(cpu_data[0].isa_level & MIPS_CPU_ISA_M64R1)
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  209  #endif
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  210  #ifndef cpu_has_mips64r2
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  211  # define cpu_has_mips64r2	(cpu_data[0].isa_level & MIPS_CPU_ISA_M64R2)
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  212  #endif
34c56fc1 arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  213  #ifndef cpu_has_mips64r6
34c56fc1 arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  214  # define cpu_has_mips64r6	(cpu_data[0].isa_level & MIPS_CPU_ISA_M64R6)
34c56fc1 arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  215  #endif
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  216  
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  217  /*
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  218   * Shortcuts ...
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  219   */
08a07904 arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  220  #define cpu_has_mips_2_3_4_5	(cpu_has_mips_2 | cpu_has_mips_3_4_5)
08a07904 arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  221  #define cpu_has_mips_3_4_5	(cpu_has_mips_3 | cpu_has_mips_4_5)
08a07904 arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  222  #define cpu_has_mips_4_5	(cpu_has_mips_4 | cpu_has_mips_5)
08a07904 arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  223  
08a07904 arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  224  #define cpu_has_mips_2_3_4_5_r	(cpu_has_mips_2 | cpu_has_mips_3_4_5_r)
08a07904 arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  225  #define cpu_has_mips_3_4_5_r	(cpu_has_mips_3 | cpu_has_mips_4_5_r)
08a07904 arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  226  #define cpu_has_mips_4_5_r	(cpu_has_mips_4 | cpu_has_mips_5_r)
08a07904 arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  227  #define cpu_has_mips_5_r	(cpu_has_mips_5 | cpu_has_mips_r)
08a07904 arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  228  
2d83fea7 arch/mips/include/asm/cpu-features.h Maciej W. Rozycki 2015-04-03  229  #define cpu_has_mips_3_4_5_64_r2_r6					\
2d83fea7 arch/mips/include/asm/cpu-features.h Maciej W. Rozycki 2015-04-03  230  				(cpu_has_mips_3 | cpu_has_mips_4_5_64_r2_r6)
2d83fea7 arch/mips/include/asm/cpu-features.h Maciej W. Rozycki 2015-04-03  231  #define cpu_has_mips_4_5_64_r2_r6					\
2d83fea7 arch/mips/include/asm/cpu-features.h Maciej W. Rozycki 2015-04-03  232  				(cpu_has_mips_4_5 | cpu_has_mips64r1 |	\
2d83fea7 arch/mips/include/asm/cpu-features.h Maciej W. Rozycki 2015-04-03  233  				 cpu_has_mips_r2 | cpu_has_mips_r6)
08a07904 arch/mips/include/asm/cpu-features.h Ralf Baechle      2014-04-19  234  
34c56fc1 arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  235  #define cpu_has_mips32	(cpu_has_mips32r1 | cpu_has_mips32r2 | cpu_has_mips32r6)
34c56fc1 arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  236  #define cpu_has_mips64	(cpu_has_mips64r1 | cpu_has_mips64r2 | cpu_has_mips64r6)
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  237  #define cpu_has_mips_r1 (cpu_has_mips32r1 | cpu_has_mips64r1)
0401572a include/asm-mips/cpu-features.h      Ralf Baechle      2005-12-09  238  #define cpu_has_mips_r2 (cpu_has_mips32r2 | cpu_has_mips64r2)
34c56fc1 arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  239  #define cpu_has_mips_r6	(cpu_has_mips32r6 | cpu_has_mips64r6)
c46b302b arch/mips/include/asm/cpu-features.h Ralf Baechle      2008-10-28  240  #define cpu_has_mips_r	(cpu_has_mips32r1 | cpu_has_mips32r2 | \
34c56fc1 arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13 @241  			 cpu_has_mips32r6 | cpu_has_mips64r1 | \
34c56fc1 arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  242  			 cpu_has_mips64r2 | cpu_has_mips64r6)
34c56fc1 arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  243  
34c56fc1 arch/mips/include/asm/cpu-features.h Leonid Yegoshin   2014-11-13  244  /* MIPSR2 and MIPSR6 have a lot of similarities */

:::::: The code at line 205 was first introduced by commit
:::::: 34c56fc1c167facc375d927687df0a3891d164ac MIPS: asm: cpu: Add MIPSR6 ISA definitions

:::::: TO: Leonid Yegoshin <Leonid.Yegoshin@imgtec.com>
:::::: CC: Markos Chandras <markos.chandras@imgtec.com>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
diff mbox

Patch

diff --git a/arch/Kconfig b/arch/Kconfig
index 81869a5..275f17d 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -638,4 +638,7 @@  config COMPAT_OLD_SIGACTION
 config ARCH_NO_COHERENT_DMA_MMAP
 	bool
 
+config CPU_NO_EFFICIENT_FFS
+	def_bool n
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 9d8a858..44e6f05 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -27,6 +27,7 @@  config ALPHA
 	select MODULES_USE_ELF_RELA
 	select ODD_RT_SIGACTION
 	select OLD_SIGSUSPEND
+	select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67
 	help
 	  The Alpha is a 64-bit general-purpose processor designed and
 	  marketed by the Digital Equipment Corporation of blessed memory,
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index 5534766..cb569b6 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -421,18 +421,21 @@  config CPU_32v3
 	select CPU_USE_DOMAINS if MMU
 	select NEED_KUSER_HELPERS
 	select TLS_REG_EMUL if SMP || !MMU
+	select CPU_NO_EFFICIENT_FFS
 
 config CPU_32v4
 	bool
 	select CPU_USE_DOMAINS if MMU
 	select NEED_KUSER_HELPERS
 	select TLS_REG_EMUL if SMP || !MMU
+	select CPU_NO_EFFICIENT_FFS
 
 config CPU_32v4T
 	bool
 	select CPU_USE_DOMAINS if MMU
 	select NEED_KUSER_HELPERS
 	select TLS_REG_EMUL if SMP || !MMU
+	select CPU_NO_EFFICIENT_FFS
 
 config CPU_32v5
 	bool
diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index 986ea84..aa232de 100644
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -20,6 +20,7 @@  config H8300
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_LZO
 	select HAVE_ARCH_KGDB
+	select CPU_NO_EFFICIENT_FFS
 
 config RWSEM_GENERIC_SPINLOCK
 	def_bool y
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index c82b292..3cc8498 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -17,6 +17,7 @@  config M32R
 	select ARCH_USES_GETTIMEOFFSET
 	select MODULES_USE_ELF_RELA
 	select HAVE_DEBUG_STACKOVERFLOW
+	select CPU_NO_EFFICIENT_FFS
 
 config SBUS
 	bool
diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu
index 0dfcf12..0b6efe8 100644
--- a/arch/m68k/Kconfig.cpu
+++ b/arch/m68k/Kconfig.cpu
@@ -40,6 +40,7 @@  config M68000
 	select CPU_HAS_NO_MULDIV64
 	select CPU_HAS_NO_UNALIGNED
 	select GENERIC_CSUM
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  The Freescale (was Motorola) 68000 CPU is the first generation of
 	  the well known M68K family of processors. The CPU core as well as
@@ -51,6 +52,7 @@  config MCPU32
 	bool
 	select CPU_HAS_NO_BITFIELDS
 	select CPU_HAS_NO_UNALIGNED
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  The Freescale (was then Motorola) CPU32 is a CPU core that is
 	  based on the 68020 processor. For the most part it is used in
@@ -130,6 +132,7 @@  config M5206
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5206 processor support.
 
@@ -138,6 +141,7 @@  config M5206e
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5206e processor support.
 
@@ -163,6 +167,7 @@  config M5249
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5249 processor support.
 
@@ -171,6 +176,7 @@  config M525x
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Freescale (Motorola) Coldfire 5251/5253 processor support.
 
@@ -189,6 +195,7 @@  config M5272
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5272 processor support.
 
@@ -217,6 +224,7 @@  config M5307
 	select COLDFIRE_SW_A7
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5307 processor support.
 
@@ -242,6 +250,7 @@  config M5407
 	select COLDFIRE_SW_A7
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Motorola ColdFire 5407 processor support.
 
@@ -251,6 +260,7 @@  config M547x
 	select MMU_COLDFIRE if MMU
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Freescale ColdFire 5470/5471/5472/5473/5474/5475 processor support.
 
@@ -260,6 +270,7 @@  config M548x
 	select M54xx
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  Freescale ColdFire 5480/5481/5482/5483/5484/5485 processor support.
 
diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig
index a0fa88d..2ac2de6 100644
--- a/arch/metag/Kconfig
+++ b/arch/metag/Kconfig
@@ -29,6 +29,7 @@  config METAG
 	select OF
 	select OF_EARLY_FLATTREE
 	select SPARSE_IRQ
+	select CPU_NO_EFFICIENT_FFS
 
 config STACKTRACE_SUPPORT
 	def_bool y
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 3d793b5..f17c3a4 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -32,6 +32,7 @@  config MICROBLAZE
 	select OF_EARLY_FLATTREE
 	select TRACING_SUPPORT
 	select VIRT_TO_BUS
+	select CPU_NO_EFFICIENT_FFS
 
 config SWAP
 	def_bool n
diff --git a/arch/mips/include/asm/cpu-features.h b/arch/mips/include/asm/cpu-features.h
index eeec8c8..fd4ae7d 100644
--- a/arch/mips/include/asm/cpu-features.h
+++ b/arch/mips/include/asm/cpu-features.h
@@ -288,6 +288,9 @@ 
 #ifndef cpu_has_clo_clz
 #define cpu_has_clo_clz	cpu_has_mips_r
 #endif
+#if !cpu_has_clo_clz
+#define CONFIG_CPU_NO_EFFICIENT_FFS 1
+#endif
 
 /*
  * MIPS32 R2, MIPS64 R2, Loongson 3A and Octeon have WSBH.
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index 4375554..f10bd2c 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -16,6 +16,7 @@  config NIOS2
 	select SOC_BUS
 	select SPARSE_IRQ
 	select USB_ARCH_HAS_HCD if USB_SUPPORT
+	select CPU_NO_EFFICIENT_FFS
 
 config GENERIC_CSUM
 	def_bool y
diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
index e118c02..142cb05 100644
--- a/arch/openrisc/Kconfig
+++ b/arch/openrisc/Kconfig
@@ -25,6 +25,7 @@  config OPENRISC
 	select MODULES_USE_ELF_RELA
 	select HAVE_DEBUG_STACKOVERFLOW
 	select OR1K_PIC
+	select CPU_NO_EFFICIENT_FFS if !OPENRISC_HAVE_INST_FF1
 
 config MMU
 	def_bool y
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 88cfaa8..3d498a6 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -32,6 +32,7 @@  config PARISC
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_SECCOMP_FILTER
 	select ARCH_NO_COHERENT_DMA_MMAP
+	select CPU_NO_EFFICIENT_FFS
 
 	help
 	  The PA-RISC microprocessor is designed by Hewlett-Packard and used
diff --git a/arch/score/Kconfig b/arch/score/Kconfig
index 366e1b5..507d631 100644
--- a/arch/score/Kconfig
+++ b/arch/score/Kconfig
@@ -14,6 +14,7 @@  config SCORE
 	select VIRT_TO_BUS
 	select MODULES_USE_ELF_REL
 	select CLONE_BACKWARDS
+	select CPU_NO_EFFICIENT_FFS
 
 choice
 	prompt "System type"
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 7ed20fc..56cf5e5 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -44,6 +44,7 @@  config SUPERH
 	select OLD_SIGSUSPEND
 	select OLD_SIGACTION
 	select HAVE_ARCH_AUDITSYSCALL
+	select CPU_NO_EFFICIENT_FFS
 	help
 	  The SuperH is a RISC processor targeted for use in embedded systems
 	  and consumer electronics; it was also used in the Sega Dreamcast
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 57ffaf2..ca675ed 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -42,6 +42,7 @@  config SPARC
 	select ODD_RT_SIGACTION
 	select OLD_SIGSUSPEND
 	select ARCH_HAS_SG_CHAIN
+	select CPU_NO_EFFICIENT_FFS
 
 config SPARC32
 	def_bool !64BIT
diff --git a/lib/gcd.c b/lib/gcd.c
index 3657f12..eba7d4e 100644
--- a/lib/gcd.c
+++ b/lib/gcd.c
@@ -2,20 +2,68 @@ 
 #include <linux/gcd.h>
 #include <linux/export.h>
 
-/* Greatest common divisor */
+/*
+ * This implements the binary GCD algorithm. (Often attributed to Stein,
+ * but as Knuth has noted, appears a first-century Chinese math text.)
+ *
+ * This is faster than the division-based algorithm even on x86, which
+ * has decent hardware division.
+ */
+
+#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS)
+
+/* If __ffs is available, the even/odd algorithm benchmarks slower. */
 unsigned long gcd(unsigned long a, unsigned long b)
 {
-	unsigned long r;
+	unsigned long r = a | b;
+
+	if (!a || !b)
+		return r;
 
-	if (a < b)
-		swap(a, b);
+	b >>= __ffs(b);
 
-	if (!b)
-		return a;
-	while ((r = a % b) != 0) {
-		a = b;
-		b = r;
+	for (;;) {
+		a >>= __ffs(a);
+		if (a == b)
+			return a << __ffs(r);
+		if (a < b)
+			swap(a, b);
+		a -= b;
 	}
+}
+
+#else
+
+/* If normalization is done by loops, the even/odd algorithm is a win. */
+unsigned long gcd(unsigned long a, unsigned long b)
+{
+	unsigned long r = a | b;
+
+	if (!a || !b)
+		return r;
+
+	/* Isolate lsbit of r */
+	r &= -r;
+
+	while (!(a & r))
+		a >>= 1;
+	while (!(b & r))
+		b >>= 1;
+
+	while (a != b) {
+		if (a < b)
+			swap(a, b);
+		a -= b;
+
+		a >>= 1;
+		if (a & r)
+			a += b;
+		do a >>= 1; while (!(a & r));
+	}
+
 	return b;
 }
+
+#endif
+
 EXPORT_SYMBOL_GPL(gcd);