diff mbox series

[v2] x86: use POPCNT for hweight<N>() when available

Message ID 55a4a24d-7fac-527c-6bcf-8d689136bac2@suse.com (mailing list archive)
State New, archived
Headers show
Series [v2] x86: use POPCNT for hweight<N>() when available | expand

Commit Message

Jan Beulich July 15, 2019, 2:39 p.m. UTC
This is faster than using the software implementation, and the insn is
available on all half-way recent hardware. Therefore convert
generic_hweight<N>() to out-of-line functions (without affecting Arm)
and use alternatives patching to replace the function calls.

Note that the approach doesn't work for clang, due to it not recognizing
-ffixed-*.

Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Also suppress UB sanitizer instrumentation. Reduce macroization in
     hweight.c. Exclude clang builds.
---
Note: Using "g" instead of "X" as the dummy constraint in hweight64()
       and hweight32(), contrary to expectations, produces slightly better
       code with gcc 8.
x86: use POPCNT for hweight<N>() when available

This is faster than using the software implementation, and the insn is
available on all half-way recent hardware. Therefore convert
generic_hweight<N>() to out-of-line functions (without affecting Arm)
and use alternatives patching to replace the function calls.

Note that the approach doesn't work for clang, due to it not recognizing
-ffixed-*.

Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Also suppress UB sanitizer instrumentation. Reduce macroization in
    hweight.c. Exclude clang builds.
---
Note: Using "g" instead of "X" as the dummy constraint in hweight64()
      and hweight32(), contrary to expectations, produces slightly better
      code with gcc 8.

--- a/xen/arch/x86/Makefile
+++ b/xen/arch/x86/Makefile
@@ -31,6 +31,10 @@ obj-y += emul-i8254.o
 obj-y += extable.o
 obj-y += flushtlb.o
 obj-$(CONFIG_CRASH_DEBUG) += gdbstub.o
+# clang doesn't appear to know of -ffixed-*
+hweight-$(gcc) := hweight.o
+hweight-$(clang) :=
+obj-y += $(hweight-y)
 obj-y += hypercall.o
 obj-y += i387.o
 obj-y += i8259.o
@@ -251,6 +255,10 @@ boot/mkelf32: boot/mkelf32.c
 efi/mkreloc: efi/mkreloc.c
 	$(HOSTCC) $(HOSTCFLAGS) -g -o $@ $<
 
+nocov-y += hweight.o
+noubsan-y += hweight.o
+hweight.o: CFLAGS += $(foreach reg,cx dx si 8 9 10 11,-ffixed-r$(reg))
+
 .PHONY: clean
 clean::
 	rm -f asm-offsets.s *.lds boot/*.o boot/*~ boot/core boot/mkelf32
--- /dev/null
+++ b/xen/arch/x86/hweight.c
@@ -0,0 +1,21 @@
+#define generic_hweight64 _hweight64
+#define generic_hweight32 _hweight32
+#define generic_hweight16 _hweight16
+#define generic_hweight8  _hweight8
+
+#include <xen/compiler.h>
+
+#undef inline
+#define inline always_inline
+
+#include <xen/bitops.h>
+
+#undef generic_hweight8
+#undef generic_hweight16
+#undef generic_hweight32
+#undef generic_hweight64
+
+unsigned int generic_hweight8 (unsigned int x) { return _hweight8 (x); }
+unsigned int generic_hweight16(unsigned int x) { return _hweight16(x); }
+unsigned int generic_hweight32(unsigned int x) { return _hweight32(x); }
+unsigned int generic_hweight64(uint64_t x)     { return _hweight64(x); }
--- a/xen/include/asm-x86/bitops.h
+++ b/xen/include/asm-x86/bitops.h
@@ -475,9 +475,36 @@ static inline int fls(unsigned int x)
  *
  * The Hamming Weight of a number is the total number of bits set in it.
  */
+#ifndef __clang__
+/* POPCNT encodings with %{r,e}di input and %{r,e}ax output: */
+#define POPCNT_64 ".byte 0xF3, 0x48, 0x0F, 0xB8, 0xC7"
+#define POPCNT_32 ".byte 0xF3, 0x0F, 0xB8, 0xC7"
+
+#define hweight_(n, x, insn, setup, cout, cin) ({                         \
+    unsigned int res_;                                                    \
+    /*                                                                    \
+     * For the function call the POPCNT input register needs to be marked \
+     * modified as well. Set up a local variable of appropriate type      \
+     * for this purpose.                                                  \
+     */                                                                   \
+    typeof((uint##n##_t)(x) + 0U) val_ = (x);                             \
+    alternative_io(setup "; call generic_hweight" #n,                     \
+                   insn, X86_FEATURE_POPCNT,                              \
+                   ASM_OUTPUT2([res] "=a" (res_), [val] cout (val_)),     \
+                   [src] cin (val_));                                     \
+    res_;                                                                 \
+})
+#define hweight64(x) hweight_(64, x, POPCNT_64, "", "+D", "g")
+#define hweight32(x) hweight_(32, x, POPCNT_32, "", "+D", "g")
+#define hweight16(x) hweight_(16, x, "movzwl %w[src], %[val]; " POPCNT_32, \
+                              "mov %[src], %[val]", "=&D", "rm")
+#define hweight8(x)  hweight_( 8, x, "movzbl %b[src], %[val]; " POPCNT_32, \
+                              "mov %[src], %[val]", "=&D", "rm")
+#else
 #define hweight64(x) generic_hweight64(x)
 #define hweight32(x) generic_hweight32(x)
 #define hweight16(x) generic_hweight16(x)
 #define hweight8(x) generic_hweight8(x)
+#endif
 
 #endif /* _X86_BITOPS_H */

Comments

Roger Pau Monné May 14, 2020, 2:05 p.m. UTC | #1
On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote:
> This is faster than using the software implementation, and the insn is
> available on all half-way recent hardware. Therefore convert
> generic_hweight<N>() to out-of-line functions (without affecting Arm)
> and use alternatives patching to replace the function calls.
> 
> Note that the approach doesn't work for clang, due to it not recognizing
> -ffixed-*.
> 
> Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v2: Also suppress UB sanitizer instrumentation. Reduce macroization in
>      hweight.c. Exclude clang builds.
> ---
> Note: Using "g" instead of "X" as the dummy constraint in hweight64()
>        and hweight32(), other than expected, produces slightly better
>        code with gcc 8.
> 
> --- a/xen/arch/x86/Makefile
> +++ b/xen/arch/x86/Makefile
> @@ -31,6 +31,10 @@ obj-y += emul-i8254.o
>   obj-y += extable.o
>   obj-y += flushtlb.o
>   obj-$(CONFIG_CRASH_DEBUG) += gdbstub.o
> +# clang doesn't appear to know of -ffixed-*
> +hweight-$(gcc) := hweight.o
> +hweight-$(clang) :=
> +obj-y += $(hweight-y)
>   obj-y += hypercall.o
>   obj-y += i387.o
>   obj-y += i8259.o
> @@ -251,6 +255,10 @@ boot/mkelf32: boot/mkelf32.c
>   efi/mkreloc: efi/mkreloc.c
>   	$(HOSTCC) $(HOSTCFLAGS) -g -o $@ $<
>   
> +nocov-y += hweight.o
> +noubsan-y += hweight.o
> +hweight.o: CFLAGS += $(foreach reg,cx dx si 8 9 10 11,-ffixed-r$(reg))

Why not use clobbers in the asm to list the scratch registers? Is it
that expensive?

> +
>   .PHONY: clean
>   clean::
>   	rm -f asm-offsets.s *.lds boot/*.o boot/*~ boot/core boot/mkelf32
> --- /dev/null
> +++ b/xen/arch/x86/hweight.c
> @@ -0,0 +1,21 @@
> +#define generic_hweight64 _hweight64
> +#define generic_hweight32 _hweight32
> +#define generic_hweight16 _hweight16
> +#define generic_hweight8  _hweight8
> +
> +#include <xen/compiler.h>
> +
> +#undef inline
> +#define inline always_inline
> +
> +#include <xen/bitops.h>
> +
> +#undef generic_hweight8
> +#undef generic_hweight16
> +#undef generic_hweight32
> +#undef generic_hweight64
> +
> +unsigned int generic_hweight8 (unsigned int x) { return _hweight8 (x); }
> +unsigned int generic_hweight16(unsigned int x) { return _hweight16(x); }
> +unsigned int generic_hweight32(unsigned int x) { return _hweight32(x); }
> +unsigned int generic_hweight64(uint64_t x)     { return _hweight64(x); }
> --- a/xen/include/asm-x86/bitops.h
> +++ b/xen/include/asm-x86/bitops.h
> @@ -475,9 +475,36 @@ static inline int fls(unsigned int x)
>    *
>    * The Hamming Weight of a number is the total number of bits set in it.
>    */
> +#ifndef __clang__
> +/* POPCNT encodings with %{r,e}di input and %{r,e}ax output: */
> +#define POPCNT_64 ".byte 0xF3, 0x48, 0x0F, 0xB8, 0xC7"
> +#define POPCNT_32 ".byte 0xF3, 0x0F, 0xB8, 0xC7"
> +
> +#define hweight_(n, x, insn, setup, cout, cin) ({                         \
> +    unsigned int res_;                                                    \
> +    /*                                                                    \
> +     * For the function call the POPCNT input register needs to be marked \
> +     * modified as well. Set up a local variable of appropriate type      \
> +     * for this purpose.                                                  \
> +     */                                                                   \
> +    typeof((uint##n##_t)(x) + 0U) val_ = (x);                             \
> +    alternative_io(setup "; call generic_hweight" #n,                     \
> +                   insn, X86_FEATURE_POPCNT,                              \
> +                   ASM_OUTPUT2([res] "=a" (res_), [val] cout (val_)),     \
> +                   [src] cin (val_));                                     \
> +    res_;                                                                 \
> +})
> +#define hweight64(x) hweight_(64, x, POPCNT_64, "", "+D", "g")
> +#define hweight32(x) hweight_(32, x, POPCNT_32, "", "+D", "g")
> +#define hweight16(x) hweight_(16, x, "movzwl %w[src], %[val]; " POPCNT_32, \
> +                              "mov %[src], %[val]", "=&D", "rm")
> +#define hweight8(x)  hweight_( 8, x, "movzbl %b[src], %[val]; " POPCNT_32, \
> +                              "mov %[src], %[val]", "=&D", "rm")

Why not just convert types < 32bits into uint32_t and avoid the asm
prefix? You are already converting them in hweight_ to an uintX_t.

That would make the asm simpler as you would then only need to make
sure the input is in %rdi and the output is fetched from %rax?

Thanks, Roger.
Jan Beulich May 20, 2020, 8:31 a.m. UTC | #2
On 14.05.2020 16:05, Roger Pau Monné wrote:
> On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote:
>> @@ -251,6 +255,10 @@ boot/mkelf32: boot/mkelf32.c
>>   efi/mkreloc: efi/mkreloc.c
>>   	$(HOSTCC) $(HOSTCFLAGS) -g -o $@ $<
>>   
>> +nocov-y += hweight.o
>> +noubsan-y += hweight.o
>> +hweight.o: CFLAGS += $(foreach reg,cx dx si 8 9 10 11,-ffixed-r$(reg))
> 
> Why not use clobbers in the asm to list the scratch registers? Is it
> that much expensive?

The goal is to disturb the call sites as little as possible. There's
no point avoiding the scratch registers when no call is made (i.e.
when the POPCNT insn can be used). Taking away from the compiler 7
out of 15 registers (that it can hold live data in) seems quite a
lot to me.

>> --- a/xen/include/asm-x86/bitops.h
>> +++ b/xen/include/asm-x86/bitops.h
>> @@ -475,9 +475,36 @@ static inline int fls(unsigned int x)
>>    *
>>    * The Hamming Weight of a number is the total number of bits set in it.
>>    */
>> +#ifndef __clang__
>> +/* POPCNT encodings with %{r,e}di input and %{r,e}ax output: */
>> +#define POPCNT_64 ".byte 0xF3, 0x48, 0x0F, 0xB8, 0xC7"
>> +#define POPCNT_32 ".byte 0xF3, 0x0F, 0xB8, 0xC7"
>> +
>> +#define hweight_(n, x, insn, setup, cout, cin) ({                         \
>> +    unsigned int res_;                                                    \
>> +    /*                                                                    \
>> +     * For the function call the POPCNT input register needs to be marked \
>> +     * modified as well. Set up a local variable of appropriate type      \
>> +     * for this purpose.                                                  \
>> +     */                                                                   \
>> +    typeof((uint##n##_t)(x) + 0U) val_ = (x);                             \
>> +    alternative_io(setup "; call generic_hweight" #n,                     \
>> +                   insn, X86_FEATURE_POPCNT,                              \
>> +                   ASM_OUTPUT2([res] "=a" (res_), [val] cout (val_)),     \
>> +                   [src] cin (val_));                                     \
>> +    res_;                                                                 \
>> +})
>> +#define hweight64(x) hweight_(64, x, POPCNT_64, "", "+D", "g")
>> +#define hweight32(x) hweight_(32, x, POPCNT_32, "", "+D", "g")
>> +#define hweight16(x) hweight_(16, x, "movzwl %w[src], %[val]; " POPCNT_32, \
>> +                              "mov %[src], %[val]", "=&D", "rm")
>> +#define hweight8(x)  hweight_( 8, x, "movzbl %b[src], %[val]; " POPCNT_32, \
>> +                              "mov %[src], %[val]", "=&D", "rm")
> 
> Why not just convert types < 32bits into uint32_t and avoid the asm
> prefix? You are already converting them in hweight_ to an uintX_t.

I don't think I do - there's a conversion to the promoted type
when adding in an unsigned int value (which is there to retain
uint64_t for the 64-bit case, while using unsigned int for all
smaller sizes, as per the parameter types of generic_hweight*()).

> That would made the asm simpler as you would then only need to make
> sure the input is in %rdi and the output is fetched from %rax?

That's an option, yes, but I again wanted to limit the impact to
generated code (including code size) as much as possible.
generic_hweight{8,16}() take unsigned int arguments, not
uint{8,16}_t ones. Hence at the call site (when a function call
is needed) no zero extension is necessary. Therefore the MOVZ
above is to (mainly) overlay the MOV during patching, while the
POPCNT is to (mainly) overlay the CALL.

If the simpler asm()-s were considered more important than the
quality of the generated code, I think we could simply have

#define hweight16(x) hweight32((uint16_t)(x))
#define hweight8(x)  hweight32(( uint8_t)(x))

Jan
Roger Pau Monné May 20, 2020, 9:31 a.m. UTC | #3
On Wed, May 20, 2020 at 10:31:38AM +0200, Jan Beulich wrote:
> On 14.05.2020 16:05, Roger Pau Monné wrote:
> > On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote:
> >> @@ -251,6 +255,10 @@ boot/mkelf32: boot/mkelf32.c
> >>   efi/mkreloc: efi/mkreloc.c
> >>   	$(HOSTCC) $(HOSTCFLAGS) -g -o $@ $<
> >>   
> >> +nocov-y += hweight.o
> >> +noubsan-y += hweight.o
> >> +hweight.o: CFLAGS += $(foreach reg,cx dx si 8 9 10 11,-ffixed-r$(reg))
> > 
> > Why not use clobbers in the asm to list the scratch registers? Is it
> > that much expensive?
> 
> The goal is to disturb the call sites as little as possible. There's
> no point avoiding the scratch registers when no call is made (i.e.
> when the POPCNT insn can be used). Taking away from the compiler 7
> out of 15 registers (that it can hold live data in) seems quite a
> lot to me.

IMO using -ffixed-reg for all those registers is even worse, as that
prevents the generated code in hweight from using any of those, thus
greatly limiting the amount of registers and likely making the
generated code rely heavily on pushing and popping from the stack?

This also has the side effect to limiting the usage of popcnt to gcc,
which IMO is also not ideal.

I also wondered, since the in-place asm before patching is a call
instruction, wouldn't inline asm at build time already assume that the
scratch registers are clobbered?

> >> --- a/xen/include/asm-x86/bitops.h
> >> +++ b/xen/include/asm-x86/bitops.h
> >> @@ -475,9 +475,36 @@ static inline int fls(unsigned int x)
> >>    *
> >>    * The Hamming Weight of a number is the total number of bits set in it.
> >>    */
> >> +#ifndef __clang__
> >> +/* POPCNT encodings with %{r,e}di input and %{r,e}ax output: */
> >> +#define POPCNT_64 ".byte 0xF3, 0x48, 0x0F, 0xB8, 0xC7"
> >> +#define POPCNT_32 ".byte 0xF3, 0x0F, 0xB8, 0xC7"
> >> +
> >> +#define hweight_(n, x, insn, setup, cout, cin) ({                         \
> >> +    unsigned int res_;                                                    \
> >> +    /*                                                                    \
> >> +     * For the function call the POPCNT input register needs to be marked \
> >> +     * modified as well. Set up a local variable of appropriate type      \
> >> +     * for this purpose.                                                  \
> >> +     */                                                                   \
> >> +    typeof((uint##n##_t)(x) + 0U) val_ = (x);                             \
> >> +    alternative_io(setup "; call generic_hweight" #n,                     \
> >> +                   insn, X86_FEATURE_POPCNT,                              \
> >> +                   ASM_OUTPUT2([res] "=a" (res_), [val] cout (val_)),     \
> >> +                   [src] cin (val_));                                     \
> >> +    res_;                                                                 \
> >> +})
> >> +#define hweight64(x) hweight_(64, x, POPCNT_64, "", "+D", "g")
> >> +#define hweight32(x) hweight_(32, x, POPCNT_32, "", "+D", "g")
> >> +#define hweight16(x) hweight_(16, x, "movzwl %w[src], %[val]; " POPCNT_32, \
> >> +                              "mov %[src], %[val]", "=&D", "rm")
> >> +#define hweight8(x)  hweight_( 8, x, "movzbl %b[src], %[val]; " POPCNT_32, \
> >> +                              "mov %[src], %[val]", "=&D", "rm")
> > 
> > Why not just convert types < 32bits into uint32_t and avoid the asm
> > prefix? You are already converting them in hweight_ to an uintX_t.
> 
> I don't think I do - there's a conversion to the promoted type
> when adding in an unsigned int value (which is there to retain
> uint64_t for the 64-bit case, while using unsigned int for all
> smaller sizes, as per the parameter types of generic_hweight*()).
> 
> > That would made the asm simpler as you would then only need to make
> > sure the input is in %rdi and the output is fetched from %rax?
> 
> That's an option, yes, but I again wanted to limit the impact to
> generated code (including code size) as much as possible.
> generic_hweight{8,16}() take unsigned int arguments, not
> uint{8,16}_t ones. Hence at the call site (when a function call
> is needed) no zero extension is necessary. Therefore the MOVZ
> above is to (mainly) overlay the MOV during patching, while the
> POPCNT is to (mainly) overlay the CALL.
> 
> If the simpler asm()-s were considered more important than the
> quality of the generated code, I think we could simply have
> 
> #define hweight16(x) hweight32((uint16_t)(x))
> #define hweight8(x)  hweight32(( uint8_t)(x))

I would definitely prefer simpler asm for sure, so getting rid of the
prefix would be a clear +1 from me. Unless the prefixed version has a
measurable performance impact during normal operation.

Thanks, Roger.
Jan Beulich May 20, 2020, 10:17 a.m. UTC | #4
On 20.05.2020 11:31, Roger Pau Monné wrote:
> On Wed, May 20, 2020 at 10:31:38AM +0200, Jan Beulich wrote:
>> On 14.05.2020 16:05, Roger Pau Monné wrote:
>>> On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote:
>>>> @@ -251,6 +255,10 @@ boot/mkelf32: boot/mkelf32.c
>>>>   efi/mkreloc: efi/mkreloc.c
>>>>   	$(HOSTCC) $(HOSTCFLAGS) -g -o $@ $<
>>>>   
>>>> +nocov-y += hweight.o
>>>> +noubsan-y += hweight.o
>>>> +hweight.o: CFLAGS += $(foreach reg,cx dx si 8 9 10 11,-ffixed-r$(reg))
>>>
>>> Why not use clobbers in the asm to list the scratch registers? Is it
>>> that much expensive?
>>
>> The goal is to disturb the call sites as little as possible. There's
>> no point avoiding the scratch registers when no call is made (i.e.
>> when the POPCNT insn can be used). Taking away from the compiler 7
>> out of 15 registers (that it can hold live data in) seems quite a
>> lot to me.
> 
> IMO using -ffixed-reg for all those registers is even worse, as that
> prevents the generated code in hweight from using any of those, thus
> greatly limiting the amount of registers and likely making the
> generated code rely heavily on pushing an popping from the stack?

Okay, that's the other side of the idea behind all this: Virtually no
hardware we run on will lack POPCNT support, hence the quality of
these fallback routines matters only the very old hardware, where we
likely don't perform optimally already anyway.

> This also has the side effect to limiting the usage of popcnt to gcc,
> which IMO is also not ideal.

Agreed. I don't know enough about clang to be able to think of
possible alternatives. In any event there's no change to current
behavior for hypervisors built with clang.

> I also wondered, since the in-place asm before patching is a call
> instruction, wouldn't inline asm at build time already assume that the
> scratch registers are clobbered?

That would imply the compiler peeks into the string literal of the
asm(). At least gcc doesn't, and even if it did it couldn't infer an
ABI from seeing a CALL insn.

Jan
Roger Pau Monné May 20, 2020, 10:28 a.m. UTC | #5
On Wed, May 20, 2020 at 12:17:15PM +0200, Jan Beulich wrote:
> On 20.05.2020 11:31, Roger Pau Monné wrote:
> > On Wed, May 20, 2020 at 10:31:38AM +0200, Jan Beulich wrote:
> >> On 14.05.2020 16:05, Roger Pau Monné wrote:
> >>> On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote:
> >>>> @@ -251,6 +255,10 @@ boot/mkelf32: boot/mkelf32.c
> >>>>   efi/mkreloc: efi/mkreloc.c
> >>>>   	$(HOSTCC) $(HOSTCFLAGS) -g -o $@ $<
> >>>>   
> >>>> +nocov-y += hweight.o
> >>>> +noubsan-y += hweight.o
> >>>> +hweight.o: CFLAGS += $(foreach reg,cx dx si 8 9 10 11,-ffixed-r$(reg))
> >>>
> >>> Why not use clobbers in the asm to list the scratch registers? Is it
> >>> that much expensive?
> >>
> >> The goal is to disturb the call sites as little as possible. There's
> >> no point avoiding the scratch registers when no call is made (i.e.
> >> when the POPCNT insn can be used). Taking away from the compiler 7
> >> out of 15 registers (that it can hold live data in) seems quite a
> >> lot to me.
> > 
> > IMO using -ffixed-reg for all those registers is even worse, as that
> > prevents the generated code in hweight from using any of those, thus
> > greatly limiting the amount of registers and likely making the
> > generated code rely heavily on pushing an popping from the stack?
> 
> Okay, that's the other side of the idea behind all this: Virtually no
> hardware we run on will lack POPCNT support, hence the quality of
> these fallback routines matters only the very old hardware, where we
> likely don't perform optimally already anyway.
> 
> > This also has the side effect to limiting the usage of popcnt to gcc,
> > which IMO is also not ideal.
> 
> Agreed. I don't know enough about clang to be able to think of
> possible alternatives. In any event there's no change to current
> behavior for hypervisors built with clang.
> 
> > I also wondered, since the in-place asm before patching is a call
> > instruction, wouldn't inline asm at build time already assume that the
> > scratch registers are clobbered?
> 
> That would imply the compiler peeks into the string literal of the
> asm(). At least gcc doesn't, and even if it did it couldn't infer an
> ABI from seeing a CALL insn.

Please bear with me, but then I don't understand what Linux is doing
in arch/x86/include/asm/arch_hweight.h. I see no clobbers there,
neither it seems like the __sw_hweight{32/64} functions are built
without the usage of the scratch registers.

Roger.
Jan Beulich May 20, 2020, 10:57 a.m. UTC | #6
On 20.05.2020 12:28, Roger Pau Monné wrote:
> On Wed, May 20, 2020 at 12:17:15PM +0200, Jan Beulich wrote:
>> On 20.05.2020 11:31, Roger Pau Monné wrote:
>>> On Wed, May 20, 2020 at 10:31:38AM +0200, Jan Beulich wrote:
>>>> On 14.05.2020 16:05, Roger Pau Monné wrote:
>>>>> On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote:
>>>>>> @@ -251,6 +255,10 @@ boot/mkelf32: boot/mkelf32.c
>>>>>>   efi/mkreloc: efi/mkreloc.c
>>>>>>   	$(HOSTCC) $(HOSTCFLAGS) -g -o $@ $<
>>>>>>   
>>>>>> +nocov-y += hweight.o
>>>>>> +noubsan-y += hweight.o
>>>>>> +hweight.o: CFLAGS += $(foreach reg,cx dx si 8 9 10 11,-ffixed-r$(reg))
>>>>>
>>>>> Why not use clobbers in the asm to list the scratch registers? Is it
>>>>> that much expensive?
>>>>
>>>> The goal is to disturb the call sites as little as possible. There's
>>>> no point avoiding the scratch registers when no call is made (i.e.
>>>> when the POPCNT insn can be used). Taking away from the compiler 7
>>>> out of 15 registers (that it can hold live data in) seems quite a
>>>> lot to me.
>>>
>>> IMO using -ffixed-reg for all those registers is even worse, as that
>>> prevents the generated code in hweight from using any of those, thus
>>> greatly limiting the amount of registers and likely making the
>>> generated code rely heavily on pushing an popping from the stack?
>>
>> Okay, that's the other side of the idea behind all this: Virtually no
>> hardware we run on will lack POPCNT support, hence the quality of
>> these fallback routines matters only the very old hardware, where we
>> likely don't perform optimally already anyway.
>>
>>> This also has the side effect to limiting the usage of popcnt to gcc,
>>> which IMO is also not ideal.
>>
>> Agreed. I don't know enough about clang to be able to think of
>> possible alternatives. In any event there's no change to current
>> behavior for hypervisors built with clang.
>>
>>> I also wondered, since the in-place asm before patching is a call
>>> instruction, wouldn't inline asm at build time already assume that the
>>> scratch registers are clobbered?
>>
>> That would imply the compiler peeks into the string literal of the
>> asm(). At least gcc doesn't, and even if it did it couldn't infer an
>> ABI from seeing a CALL insn.
> 
> Please bear with me, but then I don't understand what Linux is doing
> in arch/x86/include/asm/arch_hweight.h. I see no clobbers there,
> neither it seems like the __sw_hweight{32/64} functions are built
> without the usage of the scratch registers.

__sw_hweight{32,64} are implemented in assembly, avoiding most
scratch registers while pushing/popping the ones which do get
altered.

Jan
Roger Pau Monné May 20, 2020, 11:43 a.m. UTC | #7
On Wed, May 20, 2020 at 12:57:27PM +0200, Jan Beulich wrote:
> On 20.05.2020 12:28, Roger Pau Monné wrote:
> > On Wed, May 20, 2020 at 12:17:15PM +0200, Jan Beulich wrote:
> >> On 20.05.2020 11:31, Roger Pau Monné wrote:
> >>> On Wed, May 20, 2020 at 10:31:38AM +0200, Jan Beulich wrote:
> >>>> On 14.05.2020 16:05, Roger Pau Monné wrote:
> >>>>> On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote:
> >>>>>> @@ -251,6 +255,10 @@ boot/mkelf32: boot/mkelf32.c
> >>>>>>   efi/mkreloc: efi/mkreloc.c
> >>>>>>   	$(HOSTCC) $(HOSTCFLAGS) -g -o $@ $<
> >>>>>>   
> >>>>>> +nocov-y += hweight.o
> >>>>>> +noubsan-y += hweight.o
> >>>>>> +hweight.o: CFLAGS += $(foreach reg,cx dx si 8 9 10 11,-ffixed-r$(reg))
> >>>>>
> >>>>> Why not use clobbers in the asm to list the scratch registers? Is it
> >>>>> that much expensive?
> >>>>
> >>>> The goal is to disturb the call sites as little as possible. There's
> >>>> no point avoiding the scratch registers when no call is made (i.e.
> >>>> when the POPCNT insn can be used). Taking away from the compiler 7
> >>>> out of 15 registers (that it can hold live data in) seems quite a
> >>>> lot to me.
> >>>
> >>> IMO using -ffixed-reg for all those registers is even worse, as that
> >>> prevents the generated code in hweight from using any of those, thus
> >>> greatly limiting the amount of registers and likely making the
> >>> generated code rely heavily on pushing an popping from the stack?
> >>
> >> Okay, that's the other side of the idea behind all this: Virtually no
> >> hardware we run on will lack POPCNT support, hence the quality of
> >> these fallback routines matters only the very old hardware, where we
> >> likely don't perform optimally already anyway.
> >>
> >>> This also has the side effect to limiting the usage of popcnt to gcc,
> >>> which IMO is also not ideal.
> >>
> >> Agreed. I don't know enough about clang to be able to think of
> >> possible alternatives. In any event there's no change to current
> >> behavior for hypervisors built with clang.
> >>
> >>> I also wondered, since the in-place asm before patching is a call
> >>> instruction, wouldn't inline asm at build time already assume that the
> >>> scratch registers are clobbered?
> >>
> >> That would imply the compiler peeks into the string literal of the
> >> asm(). At least gcc doesn't, and even if it did it couldn't infer an
> >> ABI from seeing a CALL insn.
> > 
> > Please bear with me, but then I don't understand what Linux is doing
> > in arch/x86/include/asm/arch_hweight.h. I see no clobbers there,
> > neither it seems like the __sw_hweight{32/64} functions are built
> > without the usage of the scratch registers.
> 
> __sw_hweight{32,64} are implemented in assembly, avoiding most
> scratch registers while pushing/popping the ones which do get
> altered.

Oh right, I was looking at lib/hweight.c instead of the arch one.

Would you agree to use the no_caller_saved_registers attribute (which
is available AFAICT for both gcc and clang) for generic_hweightXX and
then remove the asm prefix code in favour of the defines for
hweight{8/16}?

I think that would make it easier to read.

Thanks, Roger.
Jan Beulich May 20, 2020, 1:12 p.m. UTC | #8
On 20.05.2020 13:43, Roger Pau Monné wrote:
> On Wed, May 20, 2020 at 12:57:27PM +0200, Jan Beulich wrote:
>> On 20.05.2020 12:28, Roger Pau Monné wrote:
>>> On Wed, May 20, 2020 at 12:17:15PM +0200, Jan Beulich wrote:
>>>> On 20.05.2020 11:31, Roger Pau Monné wrote:
>>>>> On Wed, May 20, 2020 at 10:31:38AM +0200, Jan Beulich wrote:
>>>>>> On 14.05.2020 16:05, Roger Pau Monné wrote:
>>>>>>> On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote:
>>>>>>>> @@ -251,6 +255,10 @@ boot/mkelf32: boot/mkelf32.c
>>>>>>>>   efi/mkreloc: efi/mkreloc.c
>>>>>>>>   	$(HOSTCC) $(HOSTCFLAGS) -g -o $@ $<
>>>>>>>>   
>>>>>>>> +nocov-y += hweight.o
>>>>>>>> +noubsan-y += hweight.o
>>>>>>>> +hweight.o: CFLAGS += $(foreach reg,cx dx si 8 9 10 11,-ffixed-r$(reg))
>>>>>>>
>>>>>>> Why not use clobbers in the asm to list the scratch registers? Is it
>>>>>>> that much expensive?
>>>>>>
>>>>>> The goal is to disturb the call sites as little as possible. There's
>>>>>> no point avoiding the scratch registers when no call is made (i.e.
>>>>>> when the POPCNT insn can be used). Taking away from the compiler 7
>>>>>> out of 15 registers (that it can hold live data in) seems quite a
>>>>>> lot to me.
>>>>>
>>>>> IMO using -ffixed-reg for all those registers is even worse, as that
>>>>> prevents the generated code in hweight from using any of those, thus
>>>>> greatly limiting the amount of registers and likely making the
>>>>> generated code rely heavily on pushing an popping from the stack?
>>>>
>>>> Okay, that's the other side of the idea behind all this: Virtually no
>>>> hardware we run on will lack POPCNT support, hence the quality of
>>>> these fallback routines matters only the very old hardware, where we
>>>> likely don't perform optimally already anyway.
>>>>
>>>>> This also has the side effect to limiting the usage of popcnt to gcc,
>>>>> which IMO is also not ideal.
>>>>
>>>> Agreed. I don't know enough about clang to be able to think of
>>>> possible alternatives. In any event there's no change to current
>>>> behavior for hypervisors built with clang.
>>>>
>>>>> I also wondered, since the in-place asm before patching is a call
>>>>> instruction, wouldn't inline asm at build time already assume that the
>>>>> scratch registers are clobbered?
>>>>
>>>> That would imply the compiler peeks into the string literal of the
>>>> asm(). At least gcc doesn't, and even if it did it couldn't infer an
>>>> ABI from seeing a CALL insn.
>>>
>>> Please bear with me, but then I don't understand what Linux is doing
>>> in arch/x86/include/asm/arch_hweight.h. I see no clobbers there,
>>> neither it seems like the __sw_hweight{32/64} functions are built
>>> without the usage of the scratch registers.
>>
>> __sw_hweight{32,64} are implemented in assembly, avoiding most
>> scratch registers while pushing/popping the ones which do get
>> altered.
> 
> Oh right, I was looking at lib/hweight.c instead of the arch one.
> 
> Would you agree to use the no_caller_saved_registers attribute (which
> is available AFAICT for both gcc and clang) for generic_hweightXX and
> then remove the asm prefix code in favour of the defines for
> hweight{8/16}?

At least for gcc no_caller_saved_registers isn't old enough to be
used unconditionally (nor is its companion -mgeneral-regs-only).
If you tell me it's fine to use unconditionally with clang, then
I can see about making this the preferred variant, with the
present one as a fallback.

Jan
Roger Pau Monné May 20, 2020, 5:18 p.m. UTC | #9
On Wed, May 20, 2020 at 03:12:25PM +0200, Jan Beulich wrote:
> On 20.05.2020 13:43, Roger Pau Monné wrote:
> > On Wed, May 20, 2020 at 12:57:27PM +0200, Jan Beulich wrote:
> >> On 20.05.2020 12:28, Roger Pau Monné wrote:
> >>> On Wed, May 20, 2020 at 12:17:15PM +0200, Jan Beulich wrote:
> >>>> On 20.05.2020 11:31, Roger Pau Monné wrote:
> >>>>> On Wed, May 20, 2020 at 10:31:38AM +0200, Jan Beulich wrote:
> >>>>>> On 14.05.2020 16:05, Roger Pau Monné wrote:
> >>>>>>> On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote:
> >>>>>>>> @@ -251,6 +255,10 @@ boot/mkelf32: boot/mkelf32.c
> >>>>>>>>   efi/mkreloc: efi/mkreloc.c
> >>>>>>>>   	$(HOSTCC) $(HOSTCFLAGS) -g -o $@ $<
> >>>>>>>>   
> >>>>>>>> +nocov-y += hweight.o
> >>>>>>>> +noubsan-y += hweight.o
> >>>>>>>> +hweight.o: CFLAGS += $(foreach reg,cx dx si 8 9 10 11,-ffixed-r$(reg))
> >>>>>>>
> >>>>>>> Why not use clobbers in the asm to list the scratch registers? Is it
> >>>>>>> that much expensive?
> >>>>>>
> >>>>>> The goal is to disturb the call sites as little as possible. There's
> >>>>>> no point avoiding the scratch registers when no call is made (i.e.
> >>>>>> when the POPCNT insn can be used). Taking away from the compiler 7
> >>>>>> out of 15 registers (that it can hold live data in) seems quite a
> >>>>>> lot to me.
> >>>>>
> >>>>> IMO using -ffixed-reg for all those registers is even worse, as that
> >>>>> prevents the generated code in hweight from using any of those, thus
> >>>>> greatly limiting the amount of registers and likely making the
> >>>>> generated code rely heavily on pushing and popping from the stack?
> >>>>
> >>>> Okay, that's the other side of the idea behind all this: Virtually no
> >>>> hardware we run on will lack POPCNT support, hence the quality of
> >>>> these fallback routines matters only on very old hardware, where we
> >>>> likely don't perform optimally already anyway.
> >>>>
> >>>>> This also has the side effect to limiting the usage of popcnt to gcc,
> >>>>> which IMO is also not ideal.
> >>>>
> >>>> Agreed. I don't know enough about clang to be able to think of
> >>>> possible alternatives. In any event there's no change to current
> >>>> behavior for hypervisors built with clang.
> >>>>
> >>>>> I also wondered, since the in-place asm before patching is a call
> >>>>> instruction, wouldn't inline asm at build time already assume that the
> >>>>> scratch registers are clobbered?
> >>>>
> >>>> That would imply the compiler peeks into the string literal of the
> >>>> asm(). At least gcc doesn't, and even if it did it couldn't infer an
> >>>> ABI from seeing a CALL insn.
> >>>
> >>> Please bear with me, but then I don't understand what Linux is doing
> >>> in arch/x86/include/asm/arch_hweight.h. I see no clobbers there,
> >>> neither it seems like the __sw_hweight{32/64} functions are built
> >>> without the usage of the scratch registers.
> >>
> >> __sw_hweight{32,64} are implemented in assembly, avoiding most
> >> scratch registers while pushing/popping the ones which do get
> >> altered.
> > 
> > Oh right, I was looking at lib/hweight.c instead of the arch one.
> > 
> > Would you agree to use the no_caller_saved_registers attribute (which
> > is available AFAICT for both gcc and clang) for generic_hweightXX and
> > then remove the asm prefix code in favour of the defines for
> > hweight{8/16}?
> 
> At least for gcc no_caller_saved_registers isn't old enough to be
> used unconditionally (nor is its companion -mgeneral-regs-only).
> If you tell me it's fine to use unconditionally with clang, then
> I can see about making this the preferred variant, with the
> present one as a fallback.

Hm, so my suggestion was bad, no_caller_saved_registers is only
implemented starting clang 5, which is newer than the minimum we
currently require (3.5).

So apart from adding a clobber to the asm instance covering the
scratch registers the only option I can see as viable is using a bunch
of dummy global variables assigned to the registers we need to prevent
the software generic_hweightXX functions from using, but that's ugly
and will likely trigger warnings at least, since I'm not sure the
compiler will find it safe to clobber a function call register with a
global variable. Or adding a prologue / epilogue to the call
instruction in order to push / pop the relevant registers. None seems
like a very good option IMO.

I also assume that using no_caller_saved_registers when available or
else keeping the current behavior is not an acceptable solution? FWIW,
from a FreeBSD PoV I would be OK with that, as I don't think there are
any supported targets with clang < 5.

Thanks, Roger.
Jan Beulich May 22, 2020, 9:58 a.m. UTC | #10
On 20.05.2020 19:18, Roger Pau Monné wrote:
> I also assume that using no_caller_saved_registers when available or
> else keeping the current behavior is not an acceptable solution? FWIW,
> from a FreeBSD PoV I would be OK with that, as I don't think there are
> any supported targets with clang < 5.

By "current behavior" do you mean what the patch currently
does (matching my earlier response that I may try going this
route) or the unpatched upstream tree?

Jan
Roger Pau Monné May 22, 2020, 10:19 a.m. UTC | #11
On Fri, May 22, 2020 at 11:58:40AM +0200, Jan Beulich wrote:
> On 20.05.2020 19:18, Roger Pau Monné wrote:
> > I also assume that using no_caller_saved_registers when available or
> > else keeping the current behavior is not an acceptable solution? FWIW,
> > from a FreeBSD PoV I would be OK with that, as I don't think there are
> > any supported targets with clang < 5.
> 
> By "current behavior" do you mean what the patch currently
> does (matching my earlier response that I may try going this
> route) or the unpatched upstream tree?

Sorry this wasn't clear. By current I meant what's in the upstream
tree. IMO having both no_caller_saved_registers, the -ffixed-reg stuff
and the current upstream no popcnt implementation is kind of too much
divergence?

My preference would be to add popcnt support only when
no_caller_saved_registers is supported by the compiler, as I think
that's the cleanest solution. This is purely a performance
optimization, so it doesn't seem that bad to me to request a newish
compiler in order to enable it.

Roger.
Roger Pau Monné March 17, 2023, 11:22 a.m. UTC | #12
On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote:
> This is faster than using the software implementation, and the insn is
> available on all half-way recent hardware. Therefore convert
> generic_hweight<N>() to out-of-line functions (without affecting Arm)
> and use alternatives patching to replace the function calls.
> 
> Note that the approach doesn't work for clang, due to it not recognizing
> -ffixed-*.

I've been giving this a look, and I wonder if it would be fine to
simply push and pop the scratch registers in the 'call' path of the
alternative, as that won't require any specific compiler option.

Thanks, Roger.
Andrew Cooper March 17, 2023, 12:26 p.m. UTC | #13
On 17/03/2023 11:22 am, Roger Pau Monné wrote:
> On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote:
>> This is faster than using the software implementation, and the insn is
>> available on all half-way recent hardware. Therefore convert
>> generic_hweight<N>() to out-of-line functions (without affecting Arm)
>> and use alternatives patching to replace the function calls.
>>
>> Note that the approach doesn't work for clang, due to it not recognizing
>> -ffixed-*.
> I've been giving this a look, and I wonder if it would be fine to
> simply push and pop the scratch registers in the 'call' path of the
> alternative, as that won't require any specific compiler option.

It's been a long while, and in that time I've learnt a lot more about
performance, but my root objection to the approach taken here still
stands - it is penalising the common case to optimise some pointless
corner cases.

Yes - on the call path, an extra push/pop pair (or few) to get temp
registers is basically free.


Right now, the generic_hweight() helpers are static inline in
xen/bitops.h and this is nonsense.  The absolute best they should be is
extern inline in our new lib/ and I'll bet that the compilers stop
inlining them there and then.

Given new abi's like x86_64-v2 (which guarantees POPCNT as an available
feature), it would be nice to arrange to use __builtin_popcnt() to let
the compiler optimise to its hearts content, but outside of this case,
it is actively damaging to try and optimise for memory operands or to
inline the 8/16 case.

So, for x86 specifically, we want:

if ( CONFIG_POPCNT )
    __builtin_popcnt()
else
    ALT( "popcnt D -> a",
         "call arch_popcnt$N" )

and we can write arch_popcnt* in x86's lib/ and in assembly, because
these are trivial enough and we do need to be careful with registers/etc.

I'm not sure if a "+D" vs "D" will matter at the top level.  Probably
not, so it might be an easy way to get one temp register.  Other temp
registers can come from push/pop.


While we're at it, we should split hweight out of bitops and write the
common header in such a way that it defaults to the generic
implementations in lib/, and that will subsume the ARM header and also
make this work on RISC-V for free.

~Andrew
Jan Beulich March 20, 2023, 9:48 a.m. UTC | #14
On 17.03.2023 13:26, Andrew Cooper wrote:
> On 17/03/2023 11:22 am, Roger Pau Monné wrote:
>> On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote:
>>> This is faster than using the software implementation, and the insn is
>>> available on all half-way recent hardware. Therefore convert
>>> generic_hweight<N>() to out-of-line functions (without affecting Arm)
>>> and use alternatives patching to replace the function calls.
>>>
>>> Note that the approach doesn't work for clang, due to it not recognizing
>>> -ffixed-*.
>> I've been giving this a look, and I wonder if it would be fine to
>> simply push and pop the scratch registers in the 'call' path of the
>> alternative, as that won't require any specific compiler option.

Hmm, ...

> It's been a long while, and in that time I've learnt a lot more about
> performance, but my root objection to the approach taken here still
> stands - it is penalising the common case to optimise some pointless
> corner cases.
> 
> Yes - on the call path, an extra push/pop pair (or few) to get temp
> registers is basically free.

... what is "a few"? We'd need to push/pop all call-clobbered registers
except %rax, i.e. a total of eight. I consider this too much. Unless,
as you suggest further down, we wrote the fallback in assembly. Which I
have to admit I'm surprised you propose when we strive to reduce the
amount of assembly we have to maintain.

> Right now, the generic_hweight() helpers are static inline in
> xen/bitops.h and this is nonsense.  The absolute best they should be is
> extern inline in our new lib/ and I'll bet that the compilers stop
> inlining them there and then.

That would be an orthogonal move, wouldn't it? I'm also not really
willing to go as far as calling the present way of it working "nonsense".
I could be talked into doing such a transformation in a separate patch,
but only if it is halfway certain that this won't again be effort
invested just to then face further opposition (other maintainers may not
agree with the movement, as we've seen for other remotely similar
changes to "extern inline").

> Given new abi's like x86_64-v2 (which guarantees POPCNT as an available
> feature), it would be nice to arrange to use __builtin_popcnt() to let
> the compiler optimise to its hearts content, but outside of this case,
> it is actively damaging to try and optimise for memory operands or to
> inline the 8/16 case.
> 
> So, for x86 specifically, we want:
> 
> if ( CONFIG_POPCNT )
>     __builtin_popcnt()
> else
>     ALT( "popcnt D -> a",
>          "call arch_popcnt$N" )
> 
> and we can write arch_popcnt* in x86's lib/ and in assembly, because
> these are trivial enough and we do need to be careful with registers/etc.

How does x86_64-v2 matter here? And how does using __builtin_popcnt()
help, which would use the "popcnt" insn only if we passed -march=popcnt
(or any wider option implying this one) to the compiler?

As to the 8-/16-bit case - I've already accepted to drop that special
casing. The main reason I had it separate was because the generic code
also has them special cased. In any event I would like to make all
agreed upon changes in one go, hence why I didn't submit a new version
yet.

> I'm not sure if a "+D" vs "D" will matter at the top level.  Probably
> not, so it might be an easy way to get one tempt register.  Other temp
> registers can come from push/pop.
> 
> 
> While we're at it, we should split hweight out of bitops and write the
> common header in such a way that it defaults to the generic
> implementations in lib/, and that will subsume the ARM header and also
> make this work on RISC-V for free.

Yet another independent change you're asking for. I've taken note of
both of these separate requests, but without any guarantee (yet) that
I'm going to actually carry them out.

Jan
Roger Pau Monné March 21, 2023, 2:57 p.m. UTC | #15
On Mon, Mar 20, 2023 at 10:48:45AM +0100, Jan Beulich wrote:
> On 17.03.2023 13:26, Andrew Cooper wrote:
> > On 17/03/2023 11:22 am, Roger Pau Monné wrote:
> >> On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote:
> >>> This is faster than using the software implementation, and the insn is
> >>> available on all half-way recent hardware. Therefore convert
> >>> generic_hweight<N>() to out-of-line functions (without affecting Arm)
> >>> and use alternatives patching to replace the function calls.
> >>>
> >>> Note that the approach doesn't work for clang, due to it not recognizing
> >>> -ffixed-*.
> >> I've been giving this a look, and I wonder if it would be fine to
> >> simply push and pop the scratch registers in the 'call' path of the
> >> alternative, as that won't require any specific compiler option.
> 
> Hmm, ...
> 
> > It's been a long while, and in that time I've learnt a lot more about
> > performance, but my root objection to the approach taken here still
> > stands - it is penalising the common case to optimise some pointless
> > corner cases.
> > 
> > Yes - on the call path, an extra push/pop pair (or few) to get temp
> > registers is basically free.
> 
> ... what is "a few"? We'd need to push/pop all call-clobbered registers
> except %rax, i.e. a total of eight. I consider this too much. Unless,
> as you suggest further down, we wrote the fallback in assembly. Which I
> have to admit I'm surprised you propose when we strive to reduce the
> amount of assembly we have to maintain.

AMD added popcnt in 2007 and Intel in 2008.  While we shouldn't
mandate popcnt support, I think we also shouldn't overly worry about
the non-popcnt path.

Also, how can you assert that the code generated without the scratch
registers being usable won't be worse than the penalty of pushing and
popping such registers on the stack and letting the routines use all
registers freely?

I very much prefer to have a non-optimal non-popcnt path, but have
popcnt support for both gcc and clang, and without requiring any
compiler options.

Thanks, Roger.
Jan Beulich March 21, 2023, 3:35 p.m. UTC | #16
On 21.03.2023 15:57, Roger Pau Monné wrote:
> On Mon, Mar 20, 2023 at 10:48:45AM +0100, Jan Beulich wrote:
>> On 17.03.2023 13:26, Andrew Cooper wrote:
>>> On 17/03/2023 11:22 am, Roger Pau Monné wrote:
>>>> On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote:
>>>>> This is faster than using the software implementation, and the insn is
>>>>> available on all half-way recent hardware. Therefore convert
>>>>> generic_hweight<N>() to out-of-line functions (without affecting Arm)
>>>>> and use alternatives patching to replace the function calls.
>>>>>
>>>>> Note that the approach doesn't work for clang, due to it not recognizing
>>>>> -ffixed-*.
>>>> I've been giving this a look, and I wonder if it would be fine to
>>>> simply push and pop the scratch registers in the 'call' path of the
>>>> alternative, as that won't require any specific compiler option.
>>
>> Hmm, ...
>>
>>> It's been a long while, and in that time I've learnt a lot more about
>>> performance, but my root objection to the approach taken here still
>>> stands - it is penalising the common case to optimise some pointless
>>> corner cases.
>>>
>>> Yes - on the call path, an extra push/pop pair (or few) to get temp
>>> registers is basically free.
>>
>> ... what is "a few"? We'd need to push/pop all call-clobbered registers
>> except %rax, i.e. a total of eight. I consider this too much. Unless,
>> as you suggest further down, we wrote the fallback in assembly. Which I
>> have to admit I'm surprised you propose when we strive to reduce the
>> amount of assembly we have to maintain.
> 
> AMD added popcnt in 2007 and Intel in 2008.  While we shouldn't
> mandate popcnt support, I think we also shouldn't overly worry about
> the non-popcnt path.

We agree here.

> Also, how can you assert that the code generated without the scratch
> registers being usable won't be worse than the penalty of pushing and
> popping such registers on the stack and letting the routines use all
> registers freely?

Irrespective of the -ffixed-* the compiler can make itself sufficiently
many registers available to use, by preserving just the ones it actually
uses. So that's pretty certainly not more PUSH/POP than when we would
blindly preserve all which may need preserving (no matter whether
they're actually used).

Yet then both this question and ...

> I very much prefer to have a non-optimal non-popcnt path, but have
> popcnt support for both gcc and clang, and without requiring any
> compiler options.

... this makes me wonder whether we're thinking of fundamentally
different generated code that we would end up with. Since the
(apparently agreed upon) goal is to optimize for the "popcnt
available" case, mainline code should be not significantly longer that
what's needed for the single instruction itself, or alternatively a
CALL insn. Adding pushes/pops of all call clobbered registers around
the CALL would grow mainline code too much (for my taste), i.e.
irrespective of these PUSH/POP all getting NOP-ed out by alternatives
patching. (As an aside I consider fiddling with the stack pointer in
inline asm() risky, and hence I would prefer to avoid such whenever
possible. Yes, it can be written so it's independent of what the
compiler thinks the stack pointer points at, but such constructs are
fragile as soon as one wants to change them a little later on.)

Are you perhaps thinking of indeed having the PUSH/POP in mainline
code? Or some entirely different model?

What I could see us doing to accommodate Clang is to have wrappers
around the actual functions which do as you suggest and preserve all
relevant registers, no matter whether they're used. That'll still be
assembly code to maintain, but imo less of a concern than in Andrew's
model of writing hweight<N>() themselves in assembly (provided I
understood him correctly), for being purely mechanical operations.

Jan
Jürgen Groß March 21, 2023, 3:44 p.m. UTC | #17
On 21.03.23 16:35, Jan Beulich wrote:
> On 21.03.2023 15:57, Roger Pau Monné wrote:
>> On Mon, Mar 20, 2023 at 10:48:45AM +0100, Jan Beulich wrote:
>>> On 17.03.2023 13:26, Andrew Cooper wrote:
>>>> On 17/03/2023 11:22 am, Roger Pau Monné wrote:
>>>>> On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote:
>>>>>> This is faster than using the software implementation, and the insn is
>>>>>> available on all half-way recent hardware. Therefore convert
>>>>>> generic_hweight<N>() to out-of-line functions (without affecting Arm)
>>>>>> and use alternatives patching to replace the function calls.
>>>>>>
>>>>>> Note that the approach doesn't work for clang, due to it not recognizing
>>>>>> -ffixed-*.
>>>>> I've been giving this a look, and I wonder if it would be fine to
>>>>> simply push and pop the scratch registers in the 'call' path of the
>>>>> alternative, as that won't require any specific compiler option.
>>>
>>> Hmm, ...
>>>
>>>> It's been a long while, and in that time I've learnt a lot more about
>>>> performance, but my root objection to the approach taken here still
>>>> stands - it is penalising the common case to optimise some pointless
>>>> corner cases.
>>>>
>>>> Yes - on the call path, an extra push/pop pair (or few) to get temp
>>>> registers is basically free.
>>>
>>> ... what is "a few"? We'd need to push/pop all call-clobbered registers
>>> except %rax, i.e. a total of eight. I consider this too much. Unless,
>>> as you suggest further down, we wrote the fallback in assembly. Which I
>>> have to admit I'm surprised you propose when we strive to reduce the
>>> amount of assembly we have to maintain.
>>
>> AMD added popcnt in 2007 and Intel in 2008.  While we shouldn't
>> mandate popcnt support, I think we also shouldn't overly worry about
>> the non-popcnt path.
> 
> We agree here.
> 
>> Also, how can you assert that the code generated without the scratch
>> registers being usable won't be worse than the penalty of pushing and
>> popping such registers on the stack and letting the routines use all
>> registers freely?
> 
> Irrespective of the -ffixed-* the compiler can make itself sufficiently
> many registers available to use, by preserving just the ones it actually
> uses. So that's pretty certainly not more PUSH/POP than when we would
> blindly preserve all which may need preserving (no matter whether
> they're actually used).
> 
> Yet then both this question and ...
> 
>> I very much prefer to have a non-optimal non-popcnt path, but have
>> popcnt support for both gcc and clang, and without requiring any
>> compiler options.
> 
> ... this makes me wonder whether we're thinking of fundamentally
> different generated code that we would end up with. Since the
> (apparently agreed upon) goal is to optimize for the "popcnt
> available" case, mainline code should not be significantly longer than
> what's needed for the single instruction itself, or alternatively a
> CALL insn. Adding pushes/pops of all call clobbered registers around
> the CALL would grow mainline code too much (for my taste), i.e.
> irrespective of these PUSH/POP all getting NOP-ed out by alternatives
> patching. (As an aside I consider fiddling with the stack pointer in
> inline asm() risky, and hence I would prefer to avoid such whenever
> possible. Yes, it can be written so it's independent of what the
> compiler thinks the stack pointer points at, but such constructs are
> fragile as soon as one wants to change them a little later on.)
> 
> Are you perhaps thinking of indeed having the PUSH/POP in mainline
> code? Or some entirely different model?
> 
> What I could see us doing to accommodate Clang is to have wrappers
> around the actual functions which do as you suggest and preserve all
> relevant registers, no matter whether they're used.

You could just copy the PV_CALLEE_SAVE_REGS_THUNK() macro from the Linux
kernel, which is doing exactly that.


Juergen
Roger Pau Monné March 21, 2023, 4:31 p.m. UTC | #18
On Tue, Mar 21, 2023 at 04:35:54PM +0100, Jan Beulich wrote:
> On 21.03.2023 15:57, Roger Pau Monné wrote:
> > On Mon, Mar 20, 2023 at 10:48:45AM +0100, Jan Beulich wrote:
> >> On 17.03.2023 13:26, Andrew Cooper wrote:
> >>> On 17/03/2023 11:22 am, Roger Pau Monné wrote:
> >>>> On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote:
> >>>>> This is faster than using the software implementation, and the insn is
> >>>>> available on all half-way recent hardware. Therefore convert
> >>>>> generic_hweight<N>() to out-of-line functions (without affecting Arm)
> >>>>> and use alternatives patching to replace the function calls.
> >>>>>
> >>>>> Note that the approach doesn't work for clang, due to it not recognizing
> >>>>> -ffixed-*.
> >>>> I've been giving this a look, and I wonder if it would be fine to
> >>>> simply push and pop the scratch registers in the 'call' path of the
> >>>> alternative, as that won't require any specific compiler option.
> >>
> >> Hmm, ...
> >>
> >>> It's been a long while, and in that time I've learnt a lot more about
> >>> performance, but my root objection to the approach taken here still
> >>> stands - it is penalising the common case to optimise some pointless
> >>> corner cases.
> >>>
> >>> Yes - on the call path, an extra push/pop pair (or few) to get temp
> >>> registers is basically free.
> >>
> >> ... what is "a few"? We'd need to push/pop all call-clobbered registers
> >> except %rax, i.e. a total of eight. I consider this too much. Unless,
> >> as you suggest further down, we wrote the fallback in assembly. Which I
> >> have to admit I'm surprised you propose when we strive to reduce the
> >> amount of assembly we have to maintain.
> > 
> > AMD added popcnt in 2007 and Intel in 2008.  While we shouldn't
> > mandate popcnt support, I think we also shouldn't overly worry about
> > the non-popcnt path.
> 
> We agree here.
> 
> > Also, how can you assert that the code generated without the scratch
> > registers being usable won't be worse than the penalty of pushing and
> > popping such registers on the stack and letting the routines use all
> > registers freely?
> 
> Irrespective of the -ffixed-* the compiler can make itself sufficiently
> many registers available to use, by preserving just the ones it actually
> uses. So that's pretty certainly not more PUSH/POP than when we would
> blindly preserve all which may need preserving (no matter whether
> they're actually used).
> 
> Yet then both this question and ...
> 
> > I very much prefer to have a non-optimal non-popcnt path, but have
> > popcnt support for both gcc and clang, and without requiring any
> > compiler options.
> 
> ... this makes me wonder whether we're thinking of fundamentally
> different generated code that we would end up with. Since the
> (apparently agreed upon) goal is to optimize for the "popcnt
> available" case, mainline code should not be significantly longer than
> what's needed for the single instruction itself, or alternatively a
> CALL insn. Adding pushes/pops of all call clobbered registers around
> the CALL would grow mainline code too much (for my taste), i.e.
> irrespective of these PUSH/POP all getting NOP-ed out by alternatives
> patching. (As an aside I consider fiddling with the stack pointer in
> inline asm() risky, and hence I would prefer to avoid such whenever
> possible.

Is this because we are likely to not end up with a proper trace if we
mess up with the stack pointer before a function call?  I would like
to better understand your concerns with the stack pointer and inline
asm().

Other options would be using no_caller_saved_registers, but that's not
available in all supported gcc versions, likely the same with clang,
but I wouldn't mind upping the minimal clang version.

What about allocating the size of the registers that need saving as an
on-stack variable visible to the compiler and then moving to/from
there in the inline asm()?

> Yes, it can be written so it's independent of what the
> compiler thinks the stack pointer points at, but such constructs are
> fragile as soon as one wants to change them a little later on.)
> 
> Are you perhaps thinking of indeed having the PUSH/POP in mainline
> code? Or some entirely different model?
> 
> What I could see us doing to accommodate Clang is to have wrappers
> around the actual functions which do as you suggest and preserve all
> relevant registers, no matter whether they're used. That'll still be
> assembly code to maintain, but imo less of a concern than in Andrew's
> model of writing hweight<N>() themselves in assembly (provided I
> understood him correctly), for being purely mechanical operations.

If possible I would prefer to use the same model for both gcc and
clang, or else things tend to get more complicated than strictly
needed.

I also wonder whether we could also get away without disabling of
coverage and ubsan for the hweight object file.  But I assume as long
as we do the function call in inline asm() (and thus kind of hidden
from the compiler) we need to disable any instrumentation because of the
changed function context.  I don't think there's any way we can
manually add the function call prefix/suffix in the inline asm()?

Thanks, Roger.
Jan Beulich March 21, 2023, 4:41 p.m. UTC | #19
On 21.03.2023 17:31, Roger Pau Monné wrote:
> On Tue, Mar 21, 2023 at 04:35:54PM +0100, Jan Beulich wrote:
>> On 21.03.2023 15:57, Roger Pau Monné wrote:
>>> On Mon, Mar 20, 2023 at 10:48:45AM +0100, Jan Beulich wrote:
>>>> On 17.03.2023 13:26, Andrew Cooper wrote:
>>>>> On 17/03/2023 11:22 am, Roger Pau Monné wrote:
>>>>>> On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote:
>>>>>>> This is faster than using the software implementation, and the insn is
>>>>>>> available on all half-way recent hardware. Therefore convert
>>>>>>> generic_hweight<N>() to out-of-line functions (without affecting Arm)
>>>>>>> and use alternatives patching to replace the function calls.
>>>>>>>
Note that the approach doesn't work for clang, due to it not recognizing
>>>>>>> -ffixed-*.
>>>>>> I've been giving this a look, and I wonder if it would be fine to
>>>>>> simply push and pop the scratch registers in the 'call' path of the
>>>>>> alternative, as that won't require any specific compiler option.
>>>>
>>>> Hmm, ...
>>>>
>>>>> It's been a long while, and in that time I've learnt a lot more about
>>>>> performance, but my root objection to the approach taken here still
>>>>> stands - it is penalising the common case to optimise some pointless
>>>>> corner cases.
>>>>>
>>>>> Yes - on the call path, an extra push/pop pair (or few) to get temp
>>>>> registers is basically free.
>>>>
>>>> ... what is "a few"? We'd need to push/pop all call-clobbered registers
>>>> except %rax, i.e. a total of eight. I consider this too much. Unless,
>>>> as you suggest further down, we wrote the fallback in assembly. Which I
>>>> have to admit I'm surprised you propose when we strive to reduce the
>>>> amount of assembly we have to maintain.
>>>
>>> AMD added popcnt in 2007 and Intel in 2008.  While we shouldn't
>>> mandate popcnt support, I think we also shouldn't overly worry about
>>> the non-popcnt path.
>>
>> We agree here.
>>
>>> Also, how can you assert that the code generated without the scratch
>>> registers being usable won't be worse than the penalty of pushing and
>>> popping such registers on the stack and letting the routines use all
>>> registers freely?
>>
>> Irrespective of the -ffixed-* the compiler can make itself sufficiently
>> many registers available to use, by preserving just the ones it actually
>> uses. So that's pretty certainly not more PUSH/POP than when we would
>> blindly preserve all which may need preserving (no matter whether
>> they're actually used).
>>
>> Yet then both this question and ...
>>
>>> I very much prefer to have a non-optimal non-popcnt path, but have
>>> popcnt support for both gcc and clang, and without requiring any
>>> compiler options.
>>
>> ... this makes me wonder whether we're thinking of fundamentally
>> different generated code that we would end up with. Since the
>> (apparently agreed upon) goal is to optimize for the "popcnt
available" case, mainline code should not be significantly longer than
>> what's needed for the single instruction itself, or alternatively a
>> CALL insn. Adding pushes/pops of all call clobbered registers around
>> the CALL would grow mainline code too much (for my taste), i.e.
>> irrespective of these PUSH/POP all getting NOP-ed out by alternatives
>> patching. (As an aside I consider fiddling with the stack pointer in
>> inline asm() risky, and hence I would prefer to avoid such whenever
>> possible.
> 
> Is this because we are likely to not end up with a proper trace if we
> mess up with the stack pointer before a function call?

That's a related issue, but not what I was thinking about.

>  I would like
> to better understand your concerns with the stack pointer and inline
> asm().

You can't use local variables anymore with "m" or "rm" constraints, as
they might be accessed via %rsp.

> Other options would be using no_caller_saved_registers, but that's not
> available in all supported gcc versions, likely the same with clang,
> but I wouldn't mind upping the minimal clang version.

Right, you did suggest using this attribute before. But we continue to
support older gcc.

> What about allocating the size of the registers that need saving as an
> on-stack variable visible to the compiler and then moving to/from
> there in the inline asm()?

That would address the fiddling-with-%rsp concern, but would further
grow mainline code size.

>> Yes, it can be written so it's independent of what the
>> compiler thinks the stack pointer points at, but such constructs are
>> fragile as soon as one wants to change them a little later on.)
>>
>> Are you perhaps thinking of indeed having the PUSH/POP in mainline
>> code? Or some entirely different model?
>>
>> What I could see us doing to accommodate Clang is to have wrappers
>> around the actual functions which do as you suggest and preserve all
>> relevant registers, no matter whether they're used. That'll still be
>> assembly code to maintain, but imo less of a concern than in Andrew's
>> model of writing hweight<N>() themselves in assembly (provided I
>> understood him correctly), for being purely mechanical operations.
> 
> If possible I would prefer to use the same model for both gcc and
> clang, or else things tend to get more complicated than strictly
> needed.

We can always decide to accept the extra overhead even with gcc.

> I also wonder whether we could also get away without disabling of
> coverage and ubsan for the hweight object file.  But I assume as long
> as we do the function call in inline asm() (and thus kind of hidden
> from the compiler) we need to disable any instrumentation because of
> the changed function context.  I don't think there's any way we can
> manually add the function call prefix/suffix in the inline asm()?

I don't know whether that would be possible (and portable across
versions). But what I'm more concerned about is that such functions
may also end up clobbering certain call-clobbered registers. (Note
that when writing these in assembly, as suggested by Andrew, no such
hooks would be used either.)

Jan
Roger Pau Monné March 21, 2023, 5:02 p.m. UTC | #20
On Tue, Mar 21, 2023 at 05:41:52PM +0100, Jan Beulich wrote:
> On 21.03.2023 17:31, Roger Pau Monné wrote:
> > On Tue, Mar 21, 2023 at 04:35:54PM +0100, Jan Beulich wrote:
> >> On 21.03.2023 15:57, Roger Pau Monné wrote:
> >>> On Mon, Mar 20, 2023 at 10:48:45AM +0100, Jan Beulich wrote:
> >>>> On 17.03.2023 13:26, Andrew Cooper wrote:
> >>>>> On 17/03/2023 11:22 am, Roger Pau Monné wrote:
> >>>>>> On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote:
> >>>>>>> This is faster than using the software implementation, and the insn is
> >>>>>>> available on all half-way recent hardware. Therefore convert
> >>>>>>> generic_hweight<N>() to out-of-line functions (without affecting Arm)
> >>>>>>> and use alternatives patching to replace the function calls.
> >>>>>>>
> >>>>>>> Note that the approach doesn't work for clang, due to it not recognizing
> >>>>>>> -ffixed-*.
> >>>>>> I've been giving this a look, and I wonder if it would be fine to
> >>>>>> simply push and pop the scratch registers in the 'call' path of the
> >>>>>> alternative, as that won't require any specific compiler option.
> >>>>
> >>>> Hmm, ...
> >>>>
> >>>>> It's been a long while, and in that time I've learnt a lot more about
> >>>>> performance, but my root objection to the approach taken here still
> >>>>> stands - it is penalising the common case to optimise some pointless
> >>>>> corner cases.
> >>>>>
> >>>>> Yes - on the call path, an extra push/pop pair (or few) to get temp
> >>>>> registers is basically free.
> >>>>
> >>>> ... what is "a few"? We'd need to push/pop all call-clobbered registers
> >>>> except %rax, i.e. a total of eight. I consider this too much. Unless,
> >>>> as you suggest further down, we wrote the fallback in assembly. Which I
> >>>> have to admit I'm surprised you propose when we strive to reduce the
> >>>> amount of assembly we have to maintain.
> >>>
> >>> AMD added popcnt in 2007 and Intel in 2008.  While we shouldn't
> >>> mandate popcnt support, I think we also shouldn't overly worry about
> >>> the non-popcnt path.
> >>
> >> We agree here.
> >>
> >>> Also, how can you assert that the code generated without the scratch
> >>> registers being usable won't be worse than the penalty of pushing and
> >>> popping such registers on the stack and letting the routines use all
> >>> registers freely?
> >>
> >> Irrespective of the -ffixed-* the compiler can make itself sufficiently
> >> many registers available to use, by preserving just the ones it actually
> >> uses. So that's pretty certainly not more PUSH/POP than when we would
> >> blindly preserve all which may need preserving (no matter whether
> >> they're actually used).
> >>
> >> Yet then both this question and ...
> >>
> >>> I very much prefer to have a non-optimal non-popcnt path, but have
> >>> popcnt support for both gcc and clang, and without requiring any
> >>> compiler options.
> >>
> >> ... this makes me wonder whether we're thinking of fundamentally
> >> different generated code that we would end up with. Since the
> >> (apparently agreed upon) goal is to optimize for the "popcnt
> >> available" case, mainline code should not be significantly longer than
> >> what's needed for the single instruction itself, or alternatively a
> >> CALL insn. Adding pushes/pops of all call clobbered registers around
> >> the CALL would grow mainline code too much (for my taste), i.e.
> >> irrespective of these PUSH/POP all getting NOP-ed out by alternatives
> >> patching. (As an aside I consider fiddling with the stack pointer in
> >> inline asm() risky, and hence I would prefer to avoid such whenever
> >> possible.
> > 
> > Is this because we are likely to not end up with a proper trace if we
> > mess up with the stack pointer before a function call?
> 
> That's a related issue, but not what I was thinking about.
> 
> >  I would like
> > to better understand your concerns with the stack pointer and inline
> > asm().
> 
> You can't use local variables anymore with "m" or "rm" constraints, as
> they might be accessed via %rsp.
> 
> > Other options would be using no_caller_saved_registers, but that's not
> > available in all supported gcc versions, likely the same with clang,
> > but I wouldn't mind upping the minimal clang version.
> 
> Right, you did suggest using this attribute before. But we continue to
> support older gcc.

FWIW, I would be fine with not enabling the optimization if the
attribute is not available, but then again this means adding more
logic.  At the end this is just an optimization, so I think it's fine
to request a version of gcc greater than the supported baseline.

> > What about allocating the size of the registers that need saving as an
> > on-stack variable visible to the compiler and then moving to/from
> > there in the inline asm()?
> 
> That would address the fiddling-with-%rsp concern, but would further
> grow mainline code size.

I'm fine with such growth, unless you can prove the growth in code
size shadows the performance win from the usage of popcnt.

> >> Yes, it can be written so it's independent of what the
> >> compiler thinks the stack pointer points at, but such constructs are
> >> fragile as soon as one wants to change them a little later on.)
> >>
> >> Are you perhaps thinking of indeed having the PUSH/POP in mainline
> >> code? Or some entirely different model?
> >>
> >> What I could see us doing to accommodate Clang is to have wrappers
> >> around the actual functions which do as you suggest and preserve all
> >> relevant registers, no matter whether they're used. That'll still be
> >> assembly code to maintain, but imo less of a concern than in Andrew's
> >> model of writing hweight<N>() themselves in assembly (provided I
> >> understood him correctly), for being purely mechanical operations.
> > 
> > If possible I would prefer to use the same model for both gcc and
> > clang, or else things tend to get more complicated than strictly
> > needed.
> 
> We can always decide to accept the extra overhead even with gcc.
> 
> > I also wonder whether we could also get away without disabling of
> > coverage and ubsan for the hweight object file.  But I assume as long
> > as we do the function call in inline asm() (and thus kind of hidden
> > from the compiler) we need to disable any instrumentation because of
> > the changed function context.  I don't think there's any way we can
> > manually add the function call prefix/suffix in the inline asm()?
> 
> I don't know whether that would be possible (and portable across
> versions). But what I'm more concerned about is that such functions
> may also end up clobbering certain call-clobbered registers. (Note
> that when writing these in assembly, as suggested by Andrew, no such
> hooks would be used either.)

Right, we would just pick the Linux assembly for those helpers.  I
have to admit it feels weird, because I generally try to avoid growing
our usage of assembly, and hence this would seem like a step backwards
towards that goal.

Thanks, Roger.
diff mbox series

Patch

--- a/xen/arch/x86/Makefile
+++ b/xen/arch/x86/Makefile
@@ -31,6 +31,10 @@  obj-y += emul-i8254.o
  obj-y += extable.o
  obj-y += flushtlb.o
  obj-$(CONFIG_CRASH_DEBUG) += gdbstub.o
+# clang doesn't appear to know of -ffixed-*
+hweight-$(gcc) := hweight.o
+hweight-$(clang) :=
+obj-y += $(hweight-y)
  obj-y += hypercall.o
  obj-y += i387.o
  obj-y += i8259.o
@@ -251,6 +255,10 @@  boot/mkelf32: boot/mkelf32.c
  efi/mkreloc: efi/mkreloc.c
  	$(HOSTCC) $(HOSTCFLAGS) -g -o $@ $<
  
+nocov-y += hweight.o
+noubsan-y += hweight.o
+hweight.o: CFLAGS += $(foreach reg,cx dx si 8 9 10 11,-ffixed-r$(reg))
+
  .PHONY: clean
  clean::
  	rm -f asm-offsets.s *.lds boot/*.o boot/*~ boot/core boot/mkelf32
--- /dev/null
+++ b/xen/arch/x86/hweight.c
@@ -0,0 +1,21 @@ 
+#define generic_hweight64 _hweight64
+#define generic_hweight32 _hweight32
+#define generic_hweight16 _hweight16
+#define generic_hweight8  _hweight8
+
+#include <xen/compiler.h>
+
+#undef inline
+#define inline always_inline
+
+#include <xen/bitops.h>
+
+#undef generic_hweight8
+#undef generic_hweight16
+#undef generic_hweight32
+#undef generic_hweight64
+
+unsigned int generic_hweight8 (unsigned int x) { return _hweight8 (x); }
+unsigned int generic_hweight16(unsigned int x) { return _hweight16(x); }
+unsigned int generic_hweight32(unsigned int x) { return _hweight32(x); }
+unsigned int generic_hweight64(uint64_t x)     { return _hweight64(x); }
--- a/xen/include/asm-x86/bitops.h
+++ b/xen/include/asm-x86/bitops.h
@@ -475,9 +475,36 @@  static inline int fls(unsigned int x)
   *
   * The Hamming Weight of a number is the total number of bits set in it.
   */
+#ifndef __clang__
+/* POPCNT encodings with %{r,e}di input and %{r,e}ax output: */
+#define POPCNT_64 ".byte 0xF3, 0x48, 0x0F, 0xB8, 0xC7"
+#define POPCNT_32 ".byte 0xF3, 0x0F, 0xB8, 0xC7"
+
+#define hweight_(n, x, insn, setup, cout, cin) ({                         \
+    unsigned int res_;                                                    \
+    /*                                                                    \
+     * For the function call the POPCNT input register needs to be marked \
+     * modified as well. Set up a local variable of appropriate type      \
+     * for this purpose.                                                  \
+     */                                                                   \
+    typeof((uint##n##_t)(x) + 0U) val_ = (x);                             \
+    alternative_io(setup "; call generic_hweight" #n,                     \
+                   insn, X86_FEATURE_POPCNT,                              \
+                   ASM_OUTPUT2([res] "=a" (res_), [val] cout (val_)),     \
+                   [src] cin (val_));                                     \
+    res_;                                                                 \
+})
+#define hweight64(x) hweight_(64, x, POPCNT_64, "", "+D", "g")
+#define hweight32(x) hweight_(32, x, POPCNT_32, "", "+D", "g")
+#define hweight16(x) hweight_(16, x, "movzwl %w[src], %[val]; " POPCNT_32, \
+                              "mov %[src], %[val]", "=&D", "rm")
+#define hweight8(x)  hweight_( 8, x, "movzbl %b[src], %[val]; " POPCNT_32, \
+                              "mov %[src], %[val]", "=&D", "rm")
+#else
  #define hweight64(x) generic_hweight64(x)
  #define hweight32(x) generic_hweight32(x)
  #define hweight16(x) generic_hweight16(x)
  #define hweight8(x) generic_hweight8(x)
+#endif
  
  #endif /* _X86_BITOPS_H */