Message ID: 5CF0F9360200007800233E01@prv1-mh.provo.novell.com (mailing list archive)
State: New, archived
Series: bitops: hweight<N>() improvements
On 31/05/2019 02:51, Jan Beulich wrote:
> Algorithmically this gets us in line with current Linux, where the same
> change did happen about 13 years ago. See in particular Linux commits
> f9b4192923 ("bitops: hweight() speedup") and 0136611c62 ("optimize
> hweight64 for x86_64").
>
> Kconfig changes for actually setting HAS_FAST_MULTIPLY will follow.
>
> Take the opportunity and change generic_hweight64()'s return type to
> unsigned int.
>
> Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

The code in Linux could do with a bit of cleanup. Do you have patches
prepared?

I do have one further suggestion:

> --- a/xen/include/xen/bitops.h
> +++ b/xen/include/xen/bitops.h
> @@ -153,41 +153,54 @@ static __inline__ int get_count_order(un
>
>  static inline unsigned int generic_hweight32(unsigned int w)
>  {
> -    unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
> -    res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
> -    res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
> -    res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
> -    return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
> +    w -= (w >> 1) & 0x55555555;
> +    w = (w & 0x33333333) + ((w >> 2) & 0x33333333);
> +    w = (w + (w >> 4)) & 0x0f0f0f0f;
> +
> +#ifdef CONFIG_HAS_FAST_MULTIPLY
> +    return (w * 0x01010101) >> 24;
> +#else
> +    w += w >> 8;
> +
> +    return (w + (w >> 16)) & 0xff;
> +#endif

This would be slightly shorter, less liable to bitrot, and IMO cleaner
to do

    if ( IS_ENABLED(CONFIG_HAS_FAST_MULTIPLY) )
        w = (w * 0x01010101) >> 24;
    else
        w += w >> 8;

    return w;

which removes the #ifdef-ary fully and, in particular, avoids having
different return logic.

~Andrew
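[For readers new to the trick under debate: by the final step each byte
of w holds the bit count of that byte (0..8), so multiplying by
0x01010101 accumulates all four byte counts in the top byte (at most
32, so no carry between byte lanes), and the shift by 24 extracts it.
A minimal standalone sketch, not part of the patch; the main() harness
is purely illustrative:]

#include <stdio.h>

static unsigned int hweight32_mul(unsigned int w)
{
    w -= (w >> 1) & 0x55555555;                     /* 2-bit field counts */
    w = (w & 0x33333333) + ((w >> 2) & 0x33333333); /* 4-bit field counts */
    w = (w + (w >> 4)) & 0x0f0f0f0f;                /* per-byte counts */
    return (w * 0x01010101) >> 24;                  /* sum of the 4 bytes */
}

int main(void)
{
    /* 0xf00f0001 has 4 + 4 + 0 + 1 = 9 bits set. */
    printf("%u\n", hweight32_mul(0xf00f0001u));     /* prints 9 */
    return 0;
}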
>>> On 31.05.19 at 21:40, <andrew.cooper3@citrix.com> wrote:
> On 31/05/2019 02:51, Jan Beulich wrote:
>> Algorithmically this gets us in line with current Linux, where the same
>> change did happen about 13 years ago. See in particular Linux commits
>> f9b4192923 ("bitops: hweight() speedup") and 0136611c62 ("optimize
>> hweight64 for x86_64").
>>
>> Kconfig changes for actually setting HAS_FAST_MULTIPLY will follow.
>>
>> Take the opportunity and change generic_hweight64()'s return type to
>> unsigned int.
>>
>> Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>
> Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
>
> The code in Linux could do with a bit of cleanup. Do you have patches
> prepared?

Not yet, no. I'll try to do this eventually, but it doesn't have a
priority for me.

>> --- a/xen/include/xen/bitops.h
>> +++ b/xen/include/xen/bitops.h
>> @@ -153,41 +153,54 @@ static __inline__ int get_count_order(un
>>
>>  static inline unsigned int generic_hweight32(unsigned int w)
>>  {
>> -    unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
>> -    res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
>> -    res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
>> -    res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
>> -    return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
>> +    w -= (w >> 1) & 0x55555555;
>> +    w = (w & 0x33333333) + ((w >> 2) & 0x33333333);
>> +    w = (w + (w >> 4)) & 0x0f0f0f0f;
>> +
>> +#ifdef CONFIG_HAS_FAST_MULTIPLY
>> +    return (w * 0x01010101) >> 24;
>> +#else
>> +    w += w >> 8;
>> +
>> +    return (w + (w >> 16)) & 0xff;
>> +#endif
>
> This would be slightly shorter, less liable to bitrot, and IMO cleaner
> to do
>
>     if ( IS_ENABLED(CONFIG_HAS_FAST_MULTIPLY) )
>         w = (w * 0x01010101) >> 24;
>     else
>         w += w >> 8;
>
>     return w;
>
> which removes the #ifdef-ary fully and, in particular, avoids having
> different return logic.

Hmm, yes, I can switch to such a model, although I think this will make
Coverity point out unreachable code.

Jan
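[As background to the IS_ENABLED() point: the macro collapses to the
constant 1 or 0 at preprocessing time, so the compiler folds one branch
away while both branches stay visible in the source to static analysers
such as Coverity. A sketch of the machinery, modelled on Linux's
<linux/kconfig.h>; Xen carries equivalent macros, though the exact
helper names shown here are Linux's:]

#define __ARG_PLACEHOLDER_1 0,
#define __take_second_arg(ignored, val, ...) val

/* If CONFIG_FOO is defined to 1, the token paste yields
 * __ARG_PLACEHOLDER_1, which expands to "0,", and the selector picks
 * the 1.  If CONFIG_FOO is undefined, the paste yields junk acting as
 * a single argument, and the selector falls through to the 0. */
#define __is_defined(x)              ___is_defined(x)
#define ___is_defined(val)           ____is_defined(__ARG_PLACEHOLDER_##val)
#define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0)

#define IS_ENABLED(option) __is_defined(option)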
>>> On 31.05.19 at 21:40, <andrew.cooper3@citrix.com> wrote:
> On 31/05/2019 02:51, Jan Beulich wrote:
>> --- a/xen/include/xen/bitops.h
>> +++ b/xen/include/xen/bitops.h
>> @@ -153,41 +153,54 @@ static __inline__ int get_count_order(un
>>
>>  static inline unsigned int generic_hweight32(unsigned int w)
>>  {
>> -    unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
>> -    res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
>> -    res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
>> -    res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
>> -    return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
>> +    w -= (w >> 1) & 0x55555555;
>> +    w = (w & 0x33333333) + ((w >> 2) & 0x33333333);
>> +    w = (w + (w >> 4)) & 0x0f0f0f0f;
>> +
>> +#ifdef CONFIG_HAS_FAST_MULTIPLY
>> +    return (w * 0x01010101) >> 24;
>> +#else
>> +    w += w >> 8;
>> +
>> +    return (w + (w >> 16)) & 0xff;
>> +#endif
>
> This would be slightly shorter, less liable to bitrot, and IMO cleaner
> to do
>
>     if ( IS_ENABLED(CONFIG_HAS_FAST_MULTIPLY) )
>         w = (w * 0x01010101) >> 24;
>     else
>         w += w >> 8;
>
>     return w;

Would you be okay with

static inline unsigned int generic_hweight32(unsigned int w)
{
    w -= (w >> 1) & 0x55555555;
    w = (w & 0x33333333) + ((w >> 2) & 0x33333333);
    w = (w + (w >> 4)) & 0x0f0f0f0f;

    if ( IS_ENABLED(CONFIG_HAS_FAST_MULTIPLY) )
        return (w * 0x01010101) >> 24;

    w += w >> 8;

    return (w + (w >> 16)) & 0xff;
}

despite there then still being two return statements?

Jan
On 03/06/2019 11:02, Jan Beulich wrote:
>>>> On 31.05.19 at 21:40, <andrew.cooper3@citrix.com> wrote:
>> On 31/05/2019 02:51, Jan Beulich wrote:
>>> --- a/xen/include/xen/bitops.h
>>> +++ b/xen/include/xen/bitops.h
>>> @@ -153,41 +153,54 @@ static __inline__ int get_count_order(un
>>>
>>>  static inline unsigned int generic_hweight32(unsigned int w)
>>>  {
>>> -    unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
>>> -    res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
>>> -    res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
>>> -    res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
>>> -    return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
>>> +    w -= (w >> 1) & 0x55555555;
>>> +    w = (w & 0x33333333) + ((w >> 2) & 0x33333333);
>>> +    w = (w + (w >> 4)) & 0x0f0f0f0f;
>>> +
>>> +#ifdef CONFIG_HAS_FAST_MULTIPLY
>>> +    return (w * 0x01010101) >> 24;
>>> +#else
>>> +    w += w >> 8;
>>> +
>>> +    return (w + (w >> 16)) & 0xff;
>>> +#endif
>> This would be slightly shorter, less liable to bitrot, and IMO cleaner
>> to do
>>
>>     if ( IS_ENABLED(CONFIG_HAS_FAST_MULTIPLY) )
>>         w = (w * 0x01010101) >> 24;
>>     else
>>         w += w >> 8;
>>
>>     return w;
> Would you be okay with
>
> static inline unsigned int generic_hweight32(unsigned int w)
> {
>     w -= (w >> 1) & 0x55555555;
>     w = (w & 0x33333333) + ((w >> 2) & 0x33333333);
>     w = (w + (w >> 4)) & 0x0f0f0f0f;
>
>     if ( IS_ENABLED(CONFIG_HAS_FAST_MULTIPLY) )
>         return (w * 0x01010101) >> 24;
>
>     w += w >> 8;
>
>     return (w + (w >> 16)) & 0xff;
> }
>
> despite there then still being two return statements?

Yeah - this form does read like a regular function.

~Andrew
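[As a quick sanity check of the agreed-upon form, it can be compiled
stand-alone and compared against the compiler's builtin. A sketch:
HAS_FAST_MULTIPLY here is a plain-macro stand-in for Xen's
IS_ENABLED(CONFIG_HAS_FAST_MULTIPLY), and the harness assumes GCC or
Clang for __builtin_popcount():]

#include <stdio.h>

/* Stand-in for the Kconfig option; flip to 0 to exercise the
 * shift-and-add fallback path instead. */
#define HAS_FAST_MULTIPLY 1

static inline unsigned int generic_hweight32(unsigned int w)
{
    w -= (w >> 1) & 0x55555555;
    w = (w & 0x33333333) + ((w >> 2) & 0x33333333);
    w = (w + (w >> 4)) & 0x0f0f0f0f;

    if ( HAS_FAST_MULTIPLY )
        return (w * 0x01010101) >> 24;

    w += w >> 8;

    return (w + (w >> 16)) & 0xff;
}

int main(void)
{
    /* Compare against the compiler's popcount for a few edge cases. */
    const unsigned int cases[] = { 0, 1, 0x80000000u, 0xffffffffu,
                                   0xa5a5a5a5u, 0x12345678u };

    for ( unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++ )
        printf("%08x -> %u (builtin %d)\n", cases[i],
               generic_hweight32(cases[i]), __builtin_popcount(cases[i]));

    return 0;
}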
--- a/xen/common/Kconfig
+++ b/xen/common/Kconfig
@@ -31,6 +31,9 @@ config HAS_DEVICE_TREE
 config HAS_EX_TABLE
         bool
 
+config HAS_FAST_MULTIPLY
+        bool
+
 config MEM_ACCESS_ALWAYS_ON
         bool
 
--- a/xen/include/xen/bitops.h
+++ b/xen/include/xen/bitops.h
@@ -153,41 +153,54 @@ static __inline__ int get_count_order(un
 
 static inline unsigned int generic_hweight32(unsigned int w)
 {
-    unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
-    res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
-    res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
-    res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
-    return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
+    w -= (w >> 1) & 0x55555555;
+    w = (w & 0x33333333) + ((w >> 2) & 0x33333333);
+    w = (w + (w >> 4)) & 0x0f0f0f0f;
+
+#ifdef CONFIG_HAS_FAST_MULTIPLY
+    return (w * 0x01010101) >> 24;
+#else
+    w += w >> 8;
+
+    return (w + (w >> 16)) & 0xff;
+#endif
 }
 
 static inline unsigned int generic_hweight16(unsigned int w)
 {
-    unsigned int res = (w & 0x5555) + ((w >> 1) & 0x5555);
-    res = (res & 0x3333) + ((res >> 2) & 0x3333);
-    res = (res & 0x0F0F) + ((res >> 4) & 0x0F0F);
-    return (res & 0x00FF) + ((res >> 8) & 0x00FF);
+    w -= ((w >> 1) & 0x5555);
+    w = (w & 0x3333) + ((w >> 2) & 0x3333);
+    w = (w + (w >> 4)) & 0x0f0f;
+
+    return (w + (w >> 8)) & 0xff;
 }
 
 static inline unsigned int generic_hweight8(unsigned int w)
 {
-    unsigned int res = (w & 0x55) + ((w >> 1) & 0x55);
-    res = (res & 0x33) + ((res >> 2) & 0x33);
-    return (res & 0x0F) + ((res >> 4) & 0x0F);
+    w -= ((w >> 1) & 0x55);
+    w = (w & 0x33) + ((w >> 2) & 0x33);
+
+    return (w + (w >> 4)) & 0x0f;
 }
 
-static inline unsigned long generic_hweight64(__u64 w)
+static inline unsigned int generic_hweight64(uint64_t w)
 {
 #if BITS_PER_LONG < 64
     return generic_hweight32((unsigned int)(w >> 32)) +
         generic_hweight32((unsigned int)w);
 #else
-    u64 res;
-    res = (w & 0x5555555555555555ul) + ((w >> 1) & 0x5555555555555555ul);
-    res = (res & 0x3333333333333333ul) + ((res >> 2) & 0x3333333333333333ul);
-    res = (res & 0x0F0F0F0F0F0F0F0Ful) + ((res >> 4) & 0x0F0F0F0F0F0F0F0Ful);
-    res = (res & 0x00FF00FF00FF00FFul) + ((res >> 8) & 0x00FF00FF00FF00FFul);
-    res = (res & 0x0000FFFF0000FFFFul) + ((res >> 16) & 0x0000FFFF0000FFFFul);
-    return (res & 0x00000000FFFFFFFFul) + ((res >> 32) & 0x00000000FFFFFFFFul);
+    w -= (w >> 1) & 0x5555555555555555ul;
+    w = (w & 0x3333333333333333ul) + ((w >> 2) & 0x3333333333333333ul);
+    w = (w + (w >> 4)) & 0x0f0f0f0f0f0f0f0ful;
+
+# ifdef CONFIG_HAS_FAST_MULTIPLY
+    return (w * 0x0101010101010101ul) >> 56;
+# else
+    w += w >> 8;
+    w += w >> 16;
+
+    return (w + (w >> 32)) & 0xFF;
+# endif
 #endif
 }
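[The new HAS_FAST_MULTIPLY symbol has no prompt, so it stays off until
an architecture selects it. The follow-up changes the commit message
promises would hook it up along these lines; this is a hypothetical
sketch, not part of the series shown on this page:]

# Hypothetical arch-side hookup -- illustrative only.
config X86
        def_bool y
        select HAS_FAST_MULTIPLY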
Algorithmically this gets us in line with current Linux, where the same
change did happen about 13 years ago. See in particular Linux commits
f9b4192923 ("bitops: hweight() speedup") and 0136611c62 ("optimize
hweight64 for x86_64").

Kconfig changes for actually setting HAS_FAST_MULTIPLY will follow.

Take the opportunity and change generic_hweight64()'s return type to
unsigned int.

Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
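[The same no-carry argument scales to the 64-bit case this commit
touches: the total count is at most 64, which still fits in the top
byte, so the fast path can use a 64-bit multiply and a shift by 56. A
standalone sketch of that step, with the return type narrowed to
unsigned int as the commit message describes; hweight64_sketch is an
illustrative name, not the patch's:]

#include <stdint.h>

static inline unsigned int hweight64_sketch(uint64_t w)
{
    w -= (w >> 1) & 0x5555555555555555ull;
    w = (w & 0x3333333333333333ull) + ((w >> 2) & 0x3333333333333333ull);
    w = (w + (w >> 4)) & 0x0f0f0f0f0f0f0f0full;     /* per-byte counts */

    /* Each product byte holds a partial sum of at most 64, so no byte
     * lane overflows; the top byte ends up with the full count. */
    return (w * 0x0101010101010101ull) >> 56;
}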