Message ID | 1462527763-15301-1-git-send-email-zengzhaoxiu@163.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Nothing critical, but a bit of kibitzing. (That is slang in the Yiddish language for a person who offers annoying and unwanted advice.) > The binary GCD algorithm is based on the following facts: > 1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2) > 2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b) > 3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b) 1) "even" and "odd" are adjectives. In English, adjectives to not have plural suffixes. Thus, "they are even" or "they are odd". 2) Although "all" is not exactly wrong, it sounds odd. Since there are exactly two of them it's clearer to say "both". If I also rephrase the last line to fit into 80 columns, you get: The binary GCD algorithm is based on the following facts: - 1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2) + 1. If a and b are both even, then gcd(a,b) = 2 * gcd(a/2, b/2) 2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b) - 3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b) + 3. If both are odd, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b) 3) Negative config options are always confusing. Would it be better to call it CONFIG_INEFFICIEBNT_FFS, or even simpler CONFIG_SLOW_FFS? Also, you're allowed to add "help" to a non-interactive config option, and some documentation might be useful. E.g. +config CPU_SLOW_FFS + def_bool n + help + If n, the CPU supports a fast __ffs (__builtin_ctz) operation, + either directly or via a short code sequence using a count + leading zeros or population count instruction. If y, the + operation is emulated by slower software, such as an unrolled + binary search. + + This is purely an optimization option; the kernel + will function correctly regardless of how it is set. + Your benchmark code doesn't have to have a separate code path if __x86_64__; rdtsc works on 32-bit code just as well. paths. And the way you subtract the end and start times is unnecessarily complicated. The C language guarantees that unsigned arithmetic simply wraps modulo 2^bits as expected. Here's a simplified version: #include <stdio.h> #include <stdlib.h> #include <stdint.h> #include <string.h> #include <time.h> #include <unistd.h> #define swap(a, b) \ do { \ a ^= b; \ b ^= a; \ a ^= b; \ } while (0) /* The Euclidean GCD algorithm */ unsigned long gcd0(unsigned long a, unsigned long b) { if (a < b) swap(a, b); while (b != 0) { unsigned long r = a % b; a = b; b = r; } return a; } /* The binary GCD algorithm, using __builtin_ctzl */ unsigned long gcd1(unsigned long a, unsigned long b) { unsigned long r = a | b; if (!a || !b) return r; b >>= __builtin_ctzl(b); do { a >>= __builtin_ctzl(a); if (a < b) swap(a, b); a -= b; } while (a); return b << __builtin_ctzl(r); } /* Binary GCD algorithm, even/odd variant, without __builtin_ctzl */ unsigned long gcd2(unsigned long a, unsigned long b) { unsigned long r = a | b; if (!a || !b) return r; r &= -r; while (!(b & r)) b >>= 1; for (;;) { while (!(a & r)) a >>= 1; if (a < b) swap(a, b); else if (a == b) return a; a -= b; a >>= 1; if (a & r) a += b; a >>= 1; } } /* A variant of gcd1, with early out for gcd = 1 */ unsigned long gcd3(unsigned long a, unsigned long b) { unsigned long r = a | b; if (!a || !b) return r; b >>= __builtin_ctzl(b); if (b == 1) return r & -r; for (;;) { a >>= __builtin_ctzl(a); if (a == b || a == 1) return a << __builtin_ctzl(r); if (a < b) swap(a, b); a -= b; } } unsigned long gcd4(unsigned long a, unsigned long b) { unsigned long r = a | b; if (!a || !b) return r; r &= -r; while (!(b & r)) b >>= 1; if (b == r) return r; for (;;) { while (!(a & r)) a >>= 1; if (a == b || a == r) return a; if (a < b) swap(a, b); a -= b; a >>= 1; if (a & r) a += b; a >>= 1; } } static unsigned long (*gcd_func[])(unsigned long a, unsigned long b) = { gcd0, gcd1, gcd2, gcd3, gcd4, }; #define TEST_ENTRIES (int)(sizeof(gcd_func) / sizeof(gcd_func[0])) #define rdtscll(val) do { \ unsigned __a,__d; \ __asm__ __volatile__("rdtsc" : "=a" (__a), "=d" (__d)); \ (val) = __a | (unsigned long long)__d << 32; \ } while(0) static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long), unsigned long a, unsigned long b, unsigned long *res) { unsigned long long start, end; unsigned long gcd_res; rdtscll(start); gcd_res = gcd(a, b); rdtscll(end); *res = gcd_res; return end - start; } static inline unsigned long get_rand() { if (sizeof(long) == 8) return (unsigned long)rand() << 32 | rand(); else return rand(); } int main(int argc, char **argv) { unsigned int seed = time(0); int loops = 100; int repeats = 1000; unsigned long (*res)[TEST_ENTRIES]; unsigned long long elapsed[TEST_ENTRIES]; int i, j, k; for (;;) { int opt = getopt(argc, argv, "n:r:s:"); /* End condition always first */ if (opt == -1) break; switch (opt) { case 'n': loops = atoi(optarg); break; case 'r': repeats = atoi(optarg); break; case 's': seed = strtoul(optarg, NULL, 10); break; default: /* You won't actually get here. */ break; } } res = malloc(sizeof(unsigned long) * TEST_ENTRIES * loops); memset(elapsed, 0, sizeof(elapsed)); srand(seed); for (j = 0; j < loops; j++) { unsigned long a = get_rand(); /* Do we have args? */ unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand(); unsigned long long min_elapsed[TEST_ENTRIES]; for (k = 0; k < repeats; k++) { for (i = 0; i < TEST_ENTRIES; i++) { unsigned long long tmp = benchmark_gcd_func(gcd_func[i], a, b, &res[j][i]); if (k == 0 || min_elapsed[i] > tmp) min_elapsed[i] = tmp; } } for (i = 0; i < TEST_ENTRIES; i++) elapsed[i] += min_elapsed[i]; } for (i = 0; i < TEST_ENTRIES; i++) printf("gcd%d: elapsed %llu\n", i, elapsed[i]); k = 0; srand(seed); for (j = 0; j < loops; j++) { unsigned long a = get_rand(); unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand(); for (i = 1; i < TEST_ENTRIES; i++) { if (res[j][i] != res[j][0]) break; } if (i < TEST_ENTRIES) { if (k == 0) { k = 1; fprintf(stderr, "Error:\n"); } fprintf(stderr, "gcd(%lu, %lu): ", a, b); for (i = 0; i < TEST_ENTRIES; i++) fprintf(stderr, "%ld%s", res[j][i], i < TEST_ENTRIES - 1 ? ", " : "\n"); } } if (k == 0) fprintf(stderr, "PASS\n"); free(res); return 0; } Here are some more timings, with the same flags as your tests: First, 32 bit code: gcd0 gcd1 gcd2 gcd3 gcd4 Ivy Bridge 3156 1192 1740 1160 1640 PASS AMD Phenom 7150 2564 2348 2975 2843 PASS Core 2 4176 2592 4164 2604 3900 PASS Pentium 4 11492 4784 7632 4852 6452 PASS And 64-bit (longer times becuase the inputs are larger): Ivy Bridge 10636 2496 3500 2432 3360 PASS AMD Phenom 19482 4058 6030 5001 6845 PASS Looking at those, I'm not sure how much better the gcd3/4 versions are than gcd1/2. The difference seems pretty minor and sometimes negative. -- To unsubscribe from this list: send the line "unsubscribe linux-sh" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
"George Spelvin" <linux@horizon.com> writes: > Your benchmark code doesn't have to have a separate code path if > __x86_64__; rdtsc works on 32-bit code just as well. Take a look at the CC: list. Andreas.
> diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig > index 57ffaf2..ca675ed 100644 > --- a/arch/sparc/Kconfig > +++ b/arch/sparc/Kconfig > @@ -42,6 +42,7 @@ config SPARC > select ODD_RT_SIGACTION > select OLD_SIGSUSPEND > select ARCH_HAS_SG_CHAIN > + select CPU_NO_EFFICIENT_FFS > > config SPARC32 > def_bool !64BIT sparc64 have an efficient ffs implementation. We use run-time patching to use the proper version depending on the actual sparc cpu. As this is determinded at config time, then let the sparc cpu that has the efficient ffs benefit from this. In other words - select CPU_NO_EFFICIENT_FFS only for SPARC32. Sam -- To unsubscribe from this list: send the line "unsubscribe linux-sh" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
? 2016/5/7 16:41, George Spelvin ??: > Nothing critical, but a bit of kibitzing. > (That is slang in the Yiddish language for a person > who offers annoying and unwanted advice.) > >> The binary GCD algorithm is based on the following facts: >> 1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2) >> 2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b) >> 3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b) > 1) "even" and "odd" are adjectives. In English, adjectives to not have > plural suffixes. Thus, "they are even" or "they are odd". > 2) Although "all" is not exactly wrong, it sounds odd. Since there are > exactly two of them it's clearer to say "both". > > If I also rephrase the last line to fit into 80 columns, you get: > > The binary GCD algorithm is based on the following facts: > - 1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2) > + 1. If a and b are both even, then gcd(a,b) = 2 * gcd(a/2, b/2) > 2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b) > - 3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b) > + 3. If both are odd, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b) > > 3) Negative config options are always confusing. > > Would it be better to call it CONFIG_INEFFICIEBNT_FFS, or even simpler > CONFIG_SLOW_FFS? > > Also, you're allowed to add "help" to a non-interactive config option, > and some documentation might be useful. > E.g. > > +config CPU_SLOW_FFS > + def_bool n > + help > + If n, the CPU supports a fast __ffs (__builtin_ctz) operation, > + either directly or via a short code sequence using a count > + leading zeros or population count instruction. If y, the > + operation is emulated by slower software, such as an unrolled > + binary search. > + > + This is purely an optimization option; the kernel > + will function correctly regardless of how it is set. > + Thanks a lot. > Your benchmark code doesn't have to have a separate code path if > __x86_64__; rdtsc works on 32-bit code just as well. paths. And the > way you subtract the end and start times is unnecessarily complicated. > The C language guarantees that unsigned arithmetic simply wraps modulo > 2^bits as expected. rdsc works on x86, the other path is prepared for other architectures. > Here are some more timings, with the same flags as your tests: > > First, 32 bit code: > gcd0 gcd1 gcd2 gcd3 gcd4 > Ivy Bridge 3156 1192 1740 1160 1640 PASS > AMD Phenom 7150 2564 2348 2975 2843 PASS > Core 2 4176 2592 4164 2604 3900 PASS > Pentium 4 11492 4784 7632 4852 6452 PASS > > And 64-bit (longer times becuase the inputs are larger): > Ivy Bridge 10636 2496 3500 2432 3360 PASS > AMD Phenom 19482 4058 6030 5001 6845 PASS > > Looking at those, I'm not sure how much better the gcd3/4 versions are > than gcd1/2. The difference seems pretty minor and sometimes negative. > The worst case of binary GCD is that one number is power of 2, for example a is 0xffffffff (0xcccccccc for the "even/odd" variant) and b is 1. The gcd3/4 versions can handle this properly. -- To unsubscribe from this list: send the line "unsubscribe linux-sh" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/arch/Kconfig b/arch/Kconfig index 81869a5..275f17d 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -638,4 +638,7 @@ config COMPAT_OLD_SIGACTION config ARCH_NO_COHERENT_DMA_MMAP bool +config CPU_NO_EFFICIENT_FFS + def_bool n + source "kernel/gcov/Kconfig" diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 9d8a858..44e6f05 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -27,6 +27,7 @@ config ALPHA select MODULES_USE_ELF_RELA select ODD_RT_SIGACTION select OLD_SIGSUSPEND + select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67 help The Alpha is a 64-bit general-purpose processor designed and marketed by the Digital Equipment Corporation of blessed memory, diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index ec4791e..f41eb4c 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -101,6 +101,7 @@ choice config ISA_ARCOMPACT bool "ARCompact ISA" + select CPU_NO_EFFICIENT_FFS help The original ARC ISA of ARC600/700 cores diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index 5534766..cb569b6 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -421,18 +421,21 @@ config CPU_32v3 select CPU_USE_DOMAINS if MMU select NEED_KUSER_HELPERS select TLS_REG_EMUL if SMP || !MMU + select CPU_NO_EFFICIENT_FFS config CPU_32v4 bool select CPU_USE_DOMAINS if MMU select NEED_KUSER_HELPERS select TLS_REG_EMUL if SMP || !MMU + select CPU_NO_EFFICIENT_FFS config CPU_32v4T bool select CPU_USE_DOMAINS if MMU select NEED_KUSER_HELPERS select TLS_REG_EMUL if SMP || !MMU + select CPU_NO_EFFICIENT_FFS config CPU_32v5 bool diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index 986ea84..aa232de 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -20,6 +20,7 @@ config H8300 select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZO select HAVE_ARCH_KGDB + select CPU_NO_EFFICIENT_FFS config RWSEM_GENERIC_SPINLOCK def_bool y diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index c82b292..3cc8498 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -17,6 +17,7 @@ config M32R select ARCH_USES_GETTIMEOFFSET select MODULES_USE_ELF_RELA select HAVE_DEBUG_STACKOVERFLOW + select CPU_NO_EFFICIENT_FFS config SBUS bool diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu index 0dfcf12..0b6efe8 100644 --- a/arch/m68k/Kconfig.cpu +++ b/arch/m68k/Kconfig.cpu @@ -40,6 +40,7 @@ config M68000 select CPU_HAS_NO_MULDIV64 select CPU_HAS_NO_UNALIGNED select GENERIC_CSUM + select CPU_NO_EFFICIENT_FFS help The Freescale (was Motorola) 68000 CPU is the first generation of the well known M68K family of processors. The CPU core as well as @@ -51,6 +52,7 @@ config MCPU32 bool select CPU_HAS_NO_BITFIELDS select CPU_HAS_NO_UNALIGNED + select CPU_NO_EFFICIENT_FFS help The Freescale (was then Motorola) CPU32 is a CPU core that is based on the 68020 processor. For the most part it is used in @@ -130,6 +132,7 @@ config M5206 depends on !MMU select COLDFIRE_SW_A7 select HAVE_MBAR + select CPU_NO_EFFICIENT_FFS help Motorola ColdFire 5206 processor support. @@ -138,6 +141,7 @@ config M5206e depends on !MMU select COLDFIRE_SW_A7 select HAVE_MBAR + select CPU_NO_EFFICIENT_FFS help Motorola ColdFire 5206e processor support. @@ -163,6 +167,7 @@ config M5249 depends on !MMU select COLDFIRE_SW_A7 select HAVE_MBAR + select CPU_NO_EFFICIENT_FFS help Motorola ColdFire 5249 processor support. @@ -171,6 +176,7 @@ config M525x depends on !MMU select COLDFIRE_SW_A7 select HAVE_MBAR + select CPU_NO_EFFICIENT_FFS help Freescale (Motorola) Coldfire 5251/5253 processor support. @@ -189,6 +195,7 @@ config M5272 depends on !MMU select COLDFIRE_SW_A7 select HAVE_MBAR + select CPU_NO_EFFICIENT_FFS help Motorola ColdFire 5272 processor support. @@ -217,6 +224,7 @@ config M5307 select COLDFIRE_SW_A7 select HAVE_CACHE_CB select HAVE_MBAR + select CPU_NO_EFFICIENT_FFS help Motorola ColdFire 5307 processor support. @@ -242,6 +250,7 @@ config M5407 select COLDFIRE_SW_A7 select HAVE_CACHE_CB select HAVE_MBAR + select CPU_NO_EFFICIENT_FFS help Motorola ColdFire 5407 processor support. @@ -251,6 +260,7 @@ config M547x select MMU_COLDFIRE if MMU select HAVE_CACHE_CB select HAVE_MBAR + select CPU_NO_EFFICIENT_FFS help Freescale ColdFire 5470/5471/5472/5473/5474/5475 processor support. @@ -260,6 +270,7 @@ config M548x select M54xx select HAVE_CACHE_CB select HAVE_MBAR + select CPU_NO_EFFICIENT_FFS help Freescale ColdFire 5480/5481/5482/5483/5484/5485 processor support. diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig index a0fa88d..2ac2de6 100644 --- a/arch/metag/Kconfig +++ b/arch/metag/Kconfig @@ -29,6 +29,7 @@ config METAG select OF select OF_EARLY_FLATTREE select SPARSE_IRQ + select CPU_NO_EFFICIENT_FFS config STACKTRACE_SUPPORT def_bool y diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 3d793b5..f17c3a4 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -32,6 +32,7 @@ config MICROBLAZE select OF_EARLY_FLATTREE select TRACING_SUPPORT select VIRT_TO_BUS + select CPU_NO_EFFICIENT_FFS config SWAP def_bool n diff --git a/arch/mips/include/asm/cpu-features.h b/arch/mips/include/asm/cpu-features.h index eeec8c8..e20e100 100644 --- a/arch/mips/include/asm/cpu-features.h +++ b/arch/mips/include/asm/cpu-features.h @@ -180,6 +180,16 @@ #endif #endif +/* __builtin_constant_p(cpu_has_mips_r) && cpu_has_mips_r */ +#if !((defined(cpu_has_mips32r1) && cpu_has_mips32r1) || \ + (defined(cpu_has_mips32r2) && cpu_has_mips32r2) || \ + (defined(cpu_has_mips32r6) && cpu_has_mips32r6) || \ + (defined(cpu_has_mips64r1) && cpu_has_mips64r1) || \ + (defined(cpu_has_mips64r2) && cpu_has_mips64r2) || \ + (defined(cpu_has_mips64r6) && cpu_has_mips64r6)) +#define CONFIG_CPU_NO_EFFICIENT_FFS 1 +#endif + #ifndef cpu_has_mips_1 # define cpu_has_mips_1 (!cpu_has_mips_r6) #endif diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index 4375554..f10bd2c 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -16,6 +16,7 @@ config NIOS2 select SOC_BUS select SPARSE_IRQ select USB_ARCH_HAS_HCD if USB_SUPPORT + select CPU_NO_EFFICIENT_FFS config GENERIC_CSUM def_bool y diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index e118c02..142cb05 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -25,6 +25,7 @@ config OPENRISC select MODULES_USE_ELF_RELA select HAVE_DEBUG_STACKOVERFLOW select OR1K_PIC + select CPU_NO_EFFICIENT_FFS if !OPENRISC_HAVE_INST_FF1 config MMU def_bool y diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 88cfaa8..3d498a6 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -32,6 +32,7 @@ config PARISC select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_SECCOMP_FILTER select ARCH_NO_COHERENT_DMA_MMAP + select CPU_NO_EFFICIENT_FFS help The PA-RISC microprocessor is designed by Hewlett-Packard and used diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index bf24ab1..9eb3932 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -164,7 +164,7 @@ config S390 select TTY select VIRT_CPU_ACCOUNTING select VIRT_TO_BUS - + select CPU_NO_EFFICIENT_FFS if !HAVE_MARCH_Z9_109_FEATURES config SCHED_OMIT_FRAME_POINTER def_bool y diff --git a/arch/score/Kconfig b/arch/score/Kconfig index 366e1b5..507d631 100644 --- a/arch/score/Kconfig +++ b/arch/score/Kconfig @@ -14,6 +14,7 @@ config SCORE select VIRT_TO_BUS select MODULES_USE_ELF_REL select CLONE_BACKWARDS + select CPU_NO_EFFICIENT_FFS choice prompt "System type" diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 7ed20fc..56cf5e5 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -44,6 +44,7 @@ config SUPERH select OLD_SIGSUSPEND select OLD_SIGACTION select HAVE_ARCH_AUDITSYSCALL + select CPU_NO_EFFICIENT_FFS help The SuperH is a RISC processor targeted for use in embedded systems and consumer electronics; it was also used in the Sega Dreamcast diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 57ffaf2..ca675ed 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -42,6 +42,7 @@ config SPARC select ODD_RT_SIGACTION select OLD_SIGSUSPEND select ARCH_HAS_SG_CHAIN + select CPU_NO_EFFICIENT_FFS config SPARC32 def_bool !64BIT diff --git a/lib/gcd.c b/lib/gcd.c index 3657f12..b9fbe73 100644 --- a/lib/gcd.c +++ b/lib/gcd.c @@ -2,20 +2,77 @@ #include <linux/gcd.h> #include <linux/export.h> -/* Greatest common divisor */ +/* + * This implements the binary GCD algorithm. (Often attributed to Stein, + * but as Knuth has noted, appears in a first-century Chinese math text.) + * + * This is faster than the division-based algorithm even on x86, which + * has decent hardware division. + */ + +#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS) + +/* If __ffs is available, the even/odd algorithm benchmarks slower. */ unsigned long gcd(unsigned long a, unsigned long b) { - unsigned long r; + unsigned long r = a | b; + + if (!a || !b) + return r; - if (a < b) - swap(a, b); + b >>= __ffs(b); + if (b == 1) + return r & -r; - if (!b) - return a; - while ((r = a % b) != 0) { - a = b; - b = r; + for (;;) { + a >>= __ffs(a); + if (a == 1) + return r & -r; + if (a == b) + return a << __ffs(r); + + if (a < b) + swap(a, b); + a -= b; } - return b; } + +#else + +/* If normalization is done by loops, the even/odd algorithm is a win. */ +unsigned long gcd(unsigned long a, unsigned long b) +{ + unsigned long r = a | b; + + if (!a || !b) + return r; + + /* Isolate lsbit of r */ + r &= -r; + + while (!(b & r)) + b >>= 1; + if (b == r) + return r; + + for (;;) { + while (!(a & r)) + a >>= 1; + if (a == r) + return r; + if (a == b) + return a; + + if (a < b) + swap(a, b); + a -= b; + a >>= 1; + if (a & r) + a += b; + a >>= 1; + } +} + +#endif + EXPORT_SYMBOL_GPL(gcd);