Message ID | 55a4a24d-7fac-527c-6bcf-8d689136bac2@suse.com (mailing list archive)
---|---
State | New, archived
Series | [v2] x86: use POPCNT for hweight<N>() when available
On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote: > This is faster than using the software implementation, and the insn is > available on all half-way recent hardware. Therefore convert > generic_hweight<N>() to out-of-line functions (without affecting Arm) > and use alternatives patching to replace the function calls. > > Note that the approach doesn#t work for clang, due to it not recognizing > -ffixed-*. > > Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com> > Signed-off-by: Jan Beulich <jbeulich@suse.com> > --- > v2: Also suppress UB sanitizer instrumentation. Reduce macroization in > hweight.c. Exclude clang builds. > --- > Note: Using "g" instead of "X" as the dummy constraint in hweight64() > and hweight32(), other than expected, produces slightly better > code with gcc 8. > > --- a/xen/arch/x86/Makefile > +++ b/xen/arch/x86/Makefile > @@ -31,6 +31,10 @@ obj-y += emul-i8254.o > obj-y += extable.o > obj-y += flushtlb.o > obj-$(CONFIG_CRASH_DEBUG) += gdbstub.o > +# clang doesn't appear to know of -ffixed-* > +hweight-$(gcc) := hweight.o > +hweight-$(clang) := > +obj-y += $(hweight-y) > obj-y += hypercall.o > obj-y += i387.o > obj-y += i8259.o > @@ -251,6 +255,10 @@ boot/mkelf32: boot/mkelf32.c > efi/mkreloc: efi/mkreloc.c > $(HOSTCC) $(HOSTCFLAGS) -g -o $@ $< > > +nocov-y += hweight.o > +noubsan-y += hweight.o > +hweight.o: CFLAGS += $(foreach reg,cx dx si 8 9 10 11,-ffixed-r$(reg)) Why not use clobbers in the asm to list the scratch registers? Is it that much expensive? > + > .PHONY: clean > clean:: > rm -f asm-offsets.s *.lds boot/*.o boot/*~ boot/core boot/mkelf32 > --- /dev/null > +++ b/xen/arch/x86/hweight.c > @@ -0,0 +1,21 @@ > +#define generic_hweight64 _hweight64 > +#define generic_hweight32 _hweight32 > +#define generic_hweight16 _hweight16 > +#define generic_hweight8 _hweight8 > + > +#include <xen/compiler.h> > + > +#undef inline > +#define inline always_inline > + > +#include <xen/bitops.h> > + > +#undef generic_hweight8 > +#undef generic_hweight16 > +#undef generic_hweight32 > +#undef generic_hweight64 > + > +unsigned int generic_hweight8 (unsigned int x) { return _hweight8 (x); } > +unsigned int generic_hweight16(unsigned int x) { return _hweight16(x); } > +unsigned int generic_hweight32(unsigned int x) { return _hweight32(x); } > +unsigned int generic_hweight64(uint64_t x) { return _hweight64(x); } > --- a/xen/include/asm-x86/bitops.h > +++ b/xen/include/asm-x86/bitops.h > @@ -475,9 +475,36 @@ static inline int fls(unsigned int x) > * > * The Hamming Weight of a number is the total number of bits set in it. > */ > +#ifndef __clang__ > +/* POPCNT encodings with %{r,e}di input and %{r,e}ax output: */ > +#define POPCNT_64 ".byte 0xF3, 0x48, 0x0F, 0xB8, 0xC7" > +#define POPCNT_32 ".byte 0xF3, 0x0F, 0xB8, 0xC7" > + > +#define hweight_(n, x, insn, setup, cout, cin) ({ \ > + unsigned int res_; \ > + /* \ > + * For the function call the POPCNT input register needs to be marked \ > + * modified as well. Set up a local variable of appropriate type \ > + * for this purpose. 
\ > + */ \ > + typeof((uint##n##_t)(x) + 0U) val_ = (x); \ > + alternative_io(setup "; call generic_hweight" #n, \ > + insn, X86_FEATURE_POPCNT, \ > + ASM_OUTPUT2([res] "=a" (res_), [val] cout (val_)), \ > + [src] cin (val_)); \ > + res_; \ > +}) > +#define hweight64(x) hweight_(64, x, POPCNT_64, "", "+D", "g") > +#define hweight32(x) hweight_(32, x, POPCNT_32, "", "+D", "g") > +#define hweight16(x) hweight_(16, x, "movzwl %w[src], %[val]; " POPCNT_32, \ > + "mov %[src], %[val]", "=&D", "rm") > +#define hweight8(x) hweight_( 8, x, "movzbl %b[src], %[val]; " POPCNT_32, \ > + "mov %[src], %[val]", "=&D", "rm") Why not just convert types < 32bits into uint32_t and avoid the asm prefix? You are already converting them in hweight_ to an uintX_t. That would made the asm simpler as you would then only need to make sure the input is in %rdi and the output is fetched from %rax? Thanks, Roger.
On 14.05.2020 16:05, Roger Pau Monné wrote: > On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote: >> @@ -251,6 +255,10 @@ boot/mkelf32: boot/mkelf32.c >> efi/mkreloc: efi/mkreloc.c >> $(HOSTCC) $(HOSTCFLAGS) -g -o $@ $< >> >> +nocov-y += hweight.o >> +noubsan-y += hweight.o >> +hweight.o: CFLAGS += $(foreach reg,cx dx si 8 9 10 11,-ffixed-r$(reg)) > > Why not use clobbers in the asm to list the scratch registers? Is it > that much expensive? The goal is to disturb the call sites as little as possible. There's no point avoiding the scratch registers when no call is made (i.e. when the POPCNT insn can be used). Taking away from the compiler 7 out of 15 registers (that it can hold live data in) seems quite a lot to me. >> --- a/xen/include/asm-x86/bitops.h >> +++ b/xen/include/asm-x86/bitops.h >> @@ -475,9 +475,36 @@ static inline int fls(unsigned int x) >> * >> * The Hamming Weight of a number is the total number of bits set in it. >> */ >> +#ifndef __clang__ >> +/* POPCNT encodings with %{r,e}di input and %{r,e}ax output: */ >> +#define POPCNT_64 ".byte 0xF3, 0x48, 0x0F, 0xB8, 0xC7" >> +#define POPCNT_32 ".byte 0xF3, 0x0F, 0xB8, 0xC7" >> + >> +#define hweight_(n, x, insn, setup, cout, cin) ({ \ >> + unsigned int res_; \ >> + /* \ >> + * For the function call the POPCNT input register needs to be marked \ >> + * modified as well. Set up a local variable of appropriate type \ >> + * for this purpose. \ >> + */ \ >> + typeof((uint##n##_t)(x) + 0U) val_ = (x); \ >> + alternative_io(setup "; call generic_hweight" #n, \ >> + insn, X86_FEATURE_POPCNT, \ >> + ASM_OUTPUT2([res] "=a" (res_), [val] cout (val_)), \ >> + [src] cin (val_)); \ >> + res_; \ >> +}) >> +#define hweight64(x) hweight_(64, x, POPCNT_64, "", "+D", "g") >> +#define hweight32(x) hweight_(32, x, POPCNT_32, "", "+D", "g") >> +#define hweight16(x) hweight_(16, x, "movzwl %w[src], %[val]; " POPCNT_32, \ >> + "mov %[src], %[val]", "=&D", "rm") >> +#define hweight8(x) hweight_( 8, x, "movzbl %b[src], %[val]; " POPCNT_32, \ >> + "mov %[src], %[val]", "=&D", "rm") > > Why not just convert types < 32bits into uint32_t and avoid the asm > prefix? You are already converting them in hweight_ to an uintX_t. I don't think I do - there's a conversion to the promoted type when adding in an unsigned int value (which is there to retain uint64_t for the 64-bit case, while using unsigned int for all smaller sizes, as per the parameter types of generic_hweight*()). > That would made the asm simpler as you would then only need to make > sure the input is in %rdi and the output is fetched from %rax? That's an option, yes, but I again wanted to limit the impact to generated code (including code size) as much as possible. generic_hweight{8,16}() take unsigned int arguments, not uint{8,16}_t ones. Hence at the call site (when a function call is needed) no zero extension is necessary. Therefore the MOVZ above is to (mainly) overlay the MOV during patching, while the POPCNT is to (mainly) overlay the CALL. If the simpler asm()-s were considered more important than the quality of the generated code, I think we could simply have #define hweight16(x) hweight32((uint16_t)(x)) #define hweight8(x) hweight32(( uint8_t)(x)) Jan
On Wed, May 20, 2020 at 10:31:38AM +0200, Jan Beulich wrote: > On 14.05.2020 16:05, Roger Pau Monné wrote: > > On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote: > >> @@ -251,6 +255,10 @@ boot/mkelf32: boot/mkelf32.c > >> efi/mkreloc: efi/mkreloc.c > >> $(HOSTCC) $(HOSTCFLAGS) -g -o $@ $< > >> > >> +nocov-y += hweight.o > >> +noubsan-y += hweight.o > >> +hweight.o: CFLAGS += $(foreach reg,cx dx si 8 9 10 11,-ffixed-r$(reg)) > > > > Why not use clobbers in the asm to list the scratch registers? Is it > > that much expensive? > > The goal is to disturb the call sites as little as possible. There's > no point avoiding the scratch registers when no call is made (i.e. > when the POPCNT insn can be used). Taking away from the compiler 7 > out of 15 registers (that it can hold live data in) seems quite a > lot to me. IMO using -ffixed-reg for all those registers is even worse, as that prevents the generated code in hweight from using any of those, thus greatly limiting the amount of registers and likely making the generated code rely heavily on pushing an popping from the stack? This also has the side effect to limiting the usage of popcnt to gcc, which IMO is also not ideal. I also wondered, since the in-place asm before patching is a call instruction, wouldn't inline asm at build time already assume that the scratch registers are clobbered? > >> --- a/xen/include/asm-x86/bitops.h > >> +++ b/xen/include/asm-x86/bitops.h > >> @@ -475,9 +475,36 @@ static inline int fls(unsigned int x) > >> * > >> * The Hamming Weight of a number is the total number of bits set in it. > >> */ > >> +#ifndef __clang__ > >> +/* POPCNT encodings with %{r,e}di input and %{r,e}ax output: */ > >> +#define POPCNT_64 ".byte 0xF3, 0x48, 0x0F, 0xB8, 0xC7" > >> +#define POPCNT_32 ".byte 0xF3, 0x0F, 0xB8, 0xC7" > >> + > >> +#define hweight_(n, x, insn, setup, cout, cin) ({ \ > >> + unsigned int res_; \ > >> + /* \ > >> + * For the function call the POPCNT input register needs to be marked \ > >> + * modified as well. Set up a local variable of appropriate type \ > >> + * for this purpose. \ > >> + */ \ > >> + typeof((uint##n##_t)(x) + 0U) val_ = (x); \ > >> + alternative_io(setup "; call generic_hweight" #n, \ > >> + insn, X86_FEATURE_POPCNT, \ > >> + ASM_OUTPUT2([res] "=a" (res_), [val] cout (val_)), \ > >> + [src] cin (val_)); \ > >> + res_; \ > >> +}) > >> +#define hweight64(x) hweight_(64, x, POPCNT_64, "", "+D", "g") > >> +#define hweight32(x) hweight_(32, x, POPCNT_32, "", "+D", "g") > >> +#define hweight16(x) hweight_(16, x, "movzwl %w[src], %[val]; " POPCNT_32, \ > >> + "mov %[src], %[val]", "=&D", "rm") > >> +#define hweight8(x) hweight_( 8, x, "movzbl %b[src], %[val]; " POPCNT_32, \ > >> + "mov %[src], %[val]", "=&D", "rm") > > > > Why not just convert types < 32bits into uint32_t and avoid the asm > > prefix? You are already converting them in hweight_ to an uintX_t. > > I don't think I do - there's a conversion to the promoted type > when adding in an unsigned int value (which is there to retain > uint64_t for the 64-bit case, while using unsigned int for all > smaller sizes, as per the parameter types of generic_hweight*()). > > > That would made the asm simpler as you would then only need to make > > sure the input is in %rdi and the output is fetched from %rax? > > That's an option, yes, but I again wanted to limit the impact to > generated code (including code size) as much as possible. > generic_hweight{8,16}() take unsigned int arguments, not > uint{8,16}_t ones. 
Hence at the call site (when a function call > is needed) no zero extension is necessary. Therefore the MOVZ > above is to (mainly) overlay the MOV during patching, while the > POPCNT is to (mainly) overlay the CALL. > > If the simpler asm()-s were considered more important than the > quality of the generated code, I think we could simply have > > #define hweight16(x) hweight32((uint16_t)(x)) > #define hweight8(x) hweight32(( uint8_t)(x)) I would definitely prefer simpler asm for sure, so getting rid of the prefix would be a clear +1 from me. Unless the prefixed version has a measurable performance impact during normal operation. Thanks, Roger.
On 20.05.2020 11:31, Roger Pau Monné wrote: > On Wed, May 20, 2020 at 10:31:38AM +0200, Jan Beulich wrote: >> On 14.05.2020 16:05, Roger Pau Monné wrote: >>> On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote: >>>> @@ -251,6 +255,10 @@ boot/mkelf32: boot/mkelf32.c >>>> efi/mkreloc: efi/mkreloc.c >>>> $(HOSTCC) $(HOSTCFLAGS) -g -o $@ $< >>>> >>>> +nocov-y += hweight.o >>>> +noubsan-y += hweight.o >>>> +hweight.o: CFLAGS += $(foreach reg,cx dx si 8 9 10 11,-ffixed-r$(reg)) >>> >>> Why not use clobbers in the asm to list the scratch registers? Is it >>> that much expensive? >> >> The goal is to disturb the call sites as little as possible. There's >> no point avoiding the scratch registers when no call is made (i.e. >> when the POPCNT insn can be used). Taking away from the compiler 7 >> out of 15 registers (that it can hold live data in) seems quite a >> lot to me. > > IMO using -ffixed-reg for all those registers is even worse, as that > prevents the generated code in hweight from using any of those, thus > greatly limiting the amount of registers and likely making the > generated code rely heavily on pushing an popping from the stack? Okay, that's the other side of the idea behind all this: Virtually no hardware we run on will lack POPCNT support, hence the quality of these fallback routines matters only on very old hardware, where we likely don't perform optimally already anyway. > This also has the side effect to limiting the usage of popcnt to gcc, > which IMO is also not ideal. Agreed. I don't know enough about clang to be able to think of possible alternatives. In any event there's no change to current behavior for hypervisors built with clang. > I also wondered, since the in-place asm before patching is a call > instruction, wouldn't inline asm at build time already assume that the > scratch registers are clobbered? That would imply the compiler peeks into the string literal of the asm(). At least gcc doesn't, and even if it did it couldn't infer an ABI from seeing a CALL insn. Jan
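For concreteness, the clobber-list route Roger is asking about would have to spell out the scratch registers by hand at the call site, along the lines of the hypothetical sketch below (illustrative names; the patch itself routes this through alternative_io() rather than a bare asm()):

    /* Hypothetical sketch of the clobber-list approach.  The compiler never
     * parses the "call ..." string, so it only learns what the CALL destroys
     * from the constraints and clobbers written here.  "+D" marks the input
     * register as modified by the call, mirroring the note in the patch. */
    static inline unsigned int hweight32_clobbers(unsigned int x)
    {
        unsigned int res, val = x;

        asm volatile ( "call generic_hweight32"
                       : "=a" (res), "+D" (val)
                       :
                       : "rcx", "rdx", "rsi", "r8", "r9", "r10", "r11" );
        return res;
    }

With alternatives patching, those clobbers would stay in force even after the CALL has been replaced by POPCNT, which only touches %rdi and %rax; that is the cost at the call sites that Jan is trying to avoid.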
On Wed, May 20, 2020 at 12:17:15PM +0200, Jan Beulich wrote: > On 20.05.2020 11:31, Roger Pau Monné wrote: > > On Wed, May 20, 2020 at 10:31:38AM +0200, Jan Beulich wrote: > >> On 14.05.2020 16:05, Roger Pau Monné wrote: > >>> On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote: > >>>> @@ -251,6 +255,10 @@ boot/mkelf32: boot/mkelf32.c > >>>> efi/mkreloc: efi/mkreloc.c > >>>> $(HOSTCC) $(HOSTCFLAGS) -g -o $@ $< > >>>> > >>>> +nocov-y += hweight.o > >>>> +noubsan-y += hweight.o > >>>> +hweight.o: CFLAGS += $(foreach reg,cx dx si 8 9 10 11,-ffixed-r$(reg)) > >>> > >>> Why not use clobbers in the asm to list the scratch registers? Is it > >>> that much expensive? > >> > >> The goal is to disturb the call sites as little as possible. There's > >> no point avoiding the scratch registers when no call is made (i.e. > >> when the POPCNT insn can be used). Taking away from the compiler 7 > >> out of 15 registers (that it can hold live data in) seems quite a > >> lot to me. > > > > IMO using -ffixed-reg for all those registers is even worse, as that > > prevents the generated code in hweight from using any of those, thus > > greatly limiting the amount of registers and likely making the > > generated code rely heavily on pushing an popping from the stack? > > Okay, that's the other side of the idea behind all this: Virtually no > hardware we run on will lack POPCNT support, hence the quality of > these fallback routines matters only the very old hardware, where we > likely don't perform optimally already anyway. > > > This also has the side effect to limiting the usage of popcnt to gcc, > > which IMO is also not ideal. > > Agreed. I don't know enough about clang to be able to think of > possible alternatives. In any event there's no change to current > behavior for hypervisors built with clang. > > > I also wondered, since the in-place asm before patching is a call > > instruction, wouldn't inline asm at build time already assume that the > > scratch registers are clobbered? > > That would imply the compiler peeks into the string literal of the > asm(). At least gcc doesn't, and even if it did it couldn't infer an > ABI from seeing a CALL insn. Please bear with me, but then I don't understand what Linux is doing in arch/x86/include/asm/arch_hweight.h. I see no clobbers there, neither it seems like the __sw_hweight{32/64} functions are built without the usage of the scratch registers. Roger.
On 20.05.2020 12:28, Roger Pau Monné wrote: > On Wed, May 20, 2020 at 12:17:15PM +0200, Jan Beulich wrote: >> On 20.05.2020 11:31, Roger Pau Monné wrote: >>> On Wed, May 20, 2020 at 10:31:38AM +0200, Jan Beulich wrote: >>>> On 14.05.2020 16:05, Roger Pau Monné wrote: >>>>> On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote: >>>>>> @@ -251,6 +255,10 @@ boot/mkelf32: boot/mkelf32.c >>>>>> efi/mkreloc: efi/mkreloc.c >>>>>> $(HOSTCC) $(HOSTCFLAGS) -g -o $@ $< >>>>>> >>>>>> +nocov-y += hweight.o >>>>>> +noubsan-y += hweight.o >>>>>> +hweight.o: CFLAGS += $(foreach reg,cx dx si 8 9 10 11,-ffixed-r$(reg)) >>>>> >>>>> Why not use clobbers in the asm to list the scratch registers? Is it >>>>> that much expensive? >>>> >>>> The goal is to disturb the call sites as little as possible. There's >>>> no point avoiding the scratch registers when no call is made (i.e. >>>> when the POPCNT insn can be used). Taking away from the compiler 7 >>>> out of 15 registers (that it can hold live data in) seems quite a >>>> lot to me. >>> >>> IMO using -ffixed-reg for all those registers is even worse, as that >>> prevents the generated code in hweight from using any of those, thus >>> greatly limiting the amount of registers and likely making the >>> generated code rely heavily on pushing an popping from the stack? >> >> Okay, that's the other side of the idea behind all this: Virtually no >> hardware we run on will lack POPCNT support, hence the quality of >> these fallback routines matters only the very old hardware, where we >> likely don't perform optimally already anyway. >> >>> This also has the side effect to limiting the usage of popcnt to gcc, >>> which IMO is also not ideal. >> >> Agreed. I don't know enough about clang to be able to think of >> possible alternatives. In any event there's no change to current >> behavior for hypervisors built with clang. >> >>> I also wondered, since the in-place asm before patching is a call >>> instruction, wouldn't inline asm at build time already assume that the >>> scratch registers are clobbered? >> >> That would imply the compiler peeks into the string literal of the >> asm(). At least gcc doesn't, and even if it did it couldn't infer an >> ABI from seeing a CALL insn. > > Please bear with me, but then I don't understand what Linux is doing > in arch/x86/include/asm/arch_hweight.h. I see no clobbers there, > neither it seems like the __sw_hweight{32/64} functions are built > without the usage of the scratch registers. __sw_hweight{32,64} are implemented in assembly, avoiding most scratch registers while pushing/popping the ones which do get altered. Jan
On Wed, May 20, 2020 at 12:57:27PM +0200, Jan Beulich wrote: > On 20.05.2020 12:28, Roger Pau Monné wrote: > > On Wed, May 20, 2020 at 12:17:15PM +0200, Jan Beulich wrote: > >> On 20.05.2020 11:31, Roger Pau Monné wrote: > >>> On Wed, May 20, 2020 at 10:31:38AM +0200, Jan Beulich wrote: > >>>> On 14.05.2020 16:05, Roger Pau Monné wrote: > >>>>> On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote: > >>>>>> @@ -251,6 +255,10 @@ boot/mkelf32: boot/mkelf32.c > >>>>>> efi/mkreloc: efi/mkreloc.c > >>>>>> $(HOSTCC) $(HOSTCFLAGS) -g -o $@ $< > >>>>>> > >>>>>> +nocov-y += hweight.o > >>>>>> +noubsan-y += hweight.o > >>>>>> +hweight.o: CFLAGS += $(foreach reg,cx dx si 8 9 10 11,-ffixed-r$(reg)) > >>>>> > >>>>> Why not use clobbers in the asm to list the scratch registers? Is it > >>>>> that much expensive? > >>>> > >>>> The goal is to disturb the call sites as little as possible. There's > >>>> no point avoiding the scratch registers when no call is made (i.e. > >>>> when the POPCNT insn can be used). Taking away from the compiler 7 > >>>> out of 15 registers (that it can hold live data in) seems quite a > >>>> lot to me. > >>> > >>> IMO using -ffixed-reg for all those registers is even worse, as that > >>> prevents the generated code in hweight from using any of those, thus > >>> greatly limiting the amount of registers and likely making the > >>> generated code rely heavily on pushing an popping from the stack? > >> > >> Okay, that's the other side of the idea behind all this: Virtually no > >> hardware we run on will lack POPCNT support, hence the quality of > >> these fallback routines matters only the very old hardware, where we > >> likely don't perform optimally already anyway. > >> > >>> This also has the side effect to limiting the usage of popcnt to gcc, > >>> which IMO is also not ideal. > >> > >> Agreed. I don't know enough about clang to be able to think of > >> possible alternatives. In any event there's no change to current > >> behavior for hypervisors built with clang. > >> > >>> I also wondered, since the in-place asm before patching is a call > >>> instruction, wouldn't inline asm at build time already assume that the > >>> scratch registers are clobbered? > >> > >> That would imply the compiler peeks into the string literal of the > >> asm(). At least gcc doesn't, and even if it did it couldn't infer an > >> ABI from seeing a CALL insn. > > > > Please bear with me, but then I don't understand what Linux is doing > > in arch/x86/include/asm/arch_hweight.h. I see no clobbers there, > > neither it seems like the __sw_hweight{32/64} functions are built > > without the usage of the scratch registers. > > __sw_hweight{32,64} are implemented in assembly, avoiding most > scratch registers while pushing/popping the ones which do get > altered. Oh right, I was looking at lib/hweight.c instead of the arch one. Would you agree to use the no_caller_saved_registers attribute (which is available AFAICT for both gcc and clang) for generic_hweightXX and then remove the asm prefix code in favour of the defines for hweight{8/16}? I think that would make it easier to read. Thanks, Roger.
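A hypothetical sketch of what Roger is proposing, reusing the SWAR body sketched earlier: with the attribute, the callee itself preserves every register it touches, so the call site needs neither clobbers nor the -ffixed-* build options (compiler support permitting, roughly gcc 7 / clang 5; the follow-ups below confirm the clang side).

    /* Sketch only.  gcc additionally wants such functions built without
     * FP/SSE code, hence Jan's mention of -mgeneral-regs-only below. */
    unsigned int __attribute__((no_caller_saved_registers))
    generic_hweight32(unsigned int x)
    {
        x -= (x >> 1) & 0x55555555;
        x  = (x & 0x33333333) + ((x >> 2) & 0x33333333);
        x  = (x + (x >> 4)) & 0x0f0f0f0f;
        return (x * 0x01010101) >> 24;
    }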
On 20.05.2020 13:43, Roger Pau Monné wrote: > On Wed, May 20, 2020 at 12:57:27PM +0200, Jan Beulich wrote: >> On 20.05.2020 12:28, Roger Pau Monné wrote: >>> On Wed, May 20, 2020 at 12:17:15PM +0200, Jan Beulich wrote: >>>> On 20.05.2020 11:31, Roger Pau Monné wrote: >>>>> On Wed, May 20, 2020 at 10:31:38AM +0200, Jan Beulich wrote: >>>>>> On 14.05.2020 16:05, Roger Pau Monné wrote: >>>>>>> On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote: >>>>>>>> @@ -251,6 +255,10 @@ boot/mkelf32: boot/mkelf32.c >>>>>>>> efi/mkreloc: efi/mkreloc.c >>>>>>>> $(HOSTCC) $(HOSTCFLAGS) -g -o $@ $< >>>>>>>> >>>>>>>> +nocov-y += hweight.o >>>>>>>> +noubsan-y += hweight.o >>>>>>>> +hweight.o: CFLAGS += $(foreach reg,cx dx si 8 9 10 11,-ffixed-r$(reg)) >>>>>>> >>>>>>> Why not use clobbers in the asm to list the scratch registers? Is it >>>>>>> that much expensive? >>>>>> >>>>>> The goal is to disturb the call sites as little as possible. There's >>>>>> no point avoiding the scratch registers when no call is made (i.e. >>>>>> when the POPCNT insn can be used). Taking away from the compiler 7 >>>>>> out of 15 registers (that it can hold live data in) seems quite a >>>>>> lot to me. >>>>> >>>>> IMO using -ffixed-reg for all those registers is even worse, as that >>>>> prevents the generated code in hweight from using any of those, thus >>>>> greatly limiting the amount of registers and likely making the >>>>> generated code rely heavily on pushing an popping from the stack? >>>> >>>> Okay, that's the other side of the idea behind all this: Virtually no >>>> hardware we run on will lack POPCNT support, hence the quality of >>>> these fallback routines matters only the very old hardware, where we >>>> likely don't perform optimally already anyway. >>>> >>>>> This also has the side effect to limiting the usage of popcnt to gcc, >>>>> which IMO is also not ideal. >>>> >>>> Agreed. I don't know enough about clang to be able to think of >>>> possible alternatives. In any event there's no change to current >>>> behavior for hypervisors built with clang. >>>> >>>>> I also wondered, since the in-place asm before patching is a call >>>>> instruction, wouldn't inline asm at build time already assume that the >>>>> scratch registers are clobbered? >>>> >>>> That would imply the compiler peeks into the string literal of the >>>> asm(). At least gcc doesn't, and even if it did it couldn't infer an >>>> ABI from seeing a CALL insn. >>> >>> Please bear with me, but then I don't understand what Linux is doing >>> in arch/x86/include/asm/arch_hweight.h. I see no clobbers there, >>> neither it seems like the __sw_hweight{32/64} functions are built >>> without the usage of the scratch registers. >> >> __sw_hweight{32,64} are implemented in assembly, avoiding most >> scratch registers while pushing/popping the ones which do get >> altered. > > Oh right, I was looking at lib/hweight.c instead of the arch one. > > Would you agree to use the no_caller_saved_registers attribute (which > is available AFAICT for both gcc and clang) for generic_hweightXX and > then remove the asm prefix code in favour of the defines for > hweight{8/16}? At least for gcc no_caller_saved_registers isn't old enough to be used unconditionally (nor is its companion -mgeneral-regs-only). If you tell me it's fine to use unconditionally with clang, then I can see about making this the preferred variant, with the present one as a fallback. Jan
On Wed, May 20, 2020 at 03:12:25PM +0200, Jan Beulich wrote: > On 20.05.2020 13:43, Roger Pau Monné wrote: > > On Wed, May 20, 2020 at 12:57:27PM +0200, Jan Beulich wrote: > >> On 20.05.2020 12:28, Roger Pau Monné wrote: > >>> On Wed, May 20, 2020 at 12:17:15PM +0200, Jan Beulich wrote: > >>>> On 20.05.2020 11:31, Roger Pau Monné wrote: > >>>>> On Wed, May 20, 2020 at 10:31:38AM +0200, Jan Beulich wrote: > >>>>>> On 14.05.2020 16:05, Roger Pau Monné wrote: > >>>>>>> On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote: > >>>>>>>> @@ -251,6 +255,10 @@ boot/mkelf32: boot/mkelf32.c > >>>>>>>> efi/mkreloc: efi/mkreloc.c > >>>>>>>> $(HOSTCC) $(HOSTCFLAGS) -g -o $@ $< > >>>>>>>> > >>>>>>>> +nocov-y += hweight.o > >>>>>>>> +noubsan-y += hweight.o > >>>>>>>> +hweight.o: CFLAGS += $(foreach reg,cx dx si 8 9 10 11,-ffixed-r$(reg)) > >>>>>>> > >>>>>>> Why not use clobbers in the asm to list the scratch registers? Is it > >>>>>>> that much expensive? > >>>>>> > >>>>>> The goal is to disturb the call sites as little as possible. There's > >>>>>> no point avoiding the scratch registers when no call is made (i.e. > >>>>>> when the POPCNT insn can be used). Taking away from the compiler 7 > >>>>>> out of 15 registers (that it can hold live data in) seems quite a > >>>>>> lot to me. > >>>>> > >>>>> IMO using -ffixed-reg for all those registers is even worse, as that > >>>>> prevents the generated code in hweight from using any of those, thus > >>>>> greatly limiting the amount of registers and likely making the > >>>>> generated code rely heavily on pushing an popping from the stack? > >>>> > >>>> Okay, that's the other side of the idea behind all this: Virtually no > >>>> hardware we run on will lack POPCNT support, hence the quality of > >>>> these fallback routines matters only the very old hardware, where we > >>>> likely don't perform optimally already anyway. > >>>> > >>>>> This also has the side effect to limiting the usage of popcnt to gcc, > >>>>> which IMO is also not ideal. > >>>> > >>>> Agreed. I don't know enough about clang to be able to think of > >>>> possible alternatives. In any event there's no change to current > >>>> behavior for hypervisors built with clang. > >>>> > >>>>> I also wondered, since the in-place asm before patching is a call > >>>>> instruction, wouldn't inline asm at build time already assume that the > >>>>> scratch registers are clobbered? > >>>> > >>>> That would imply the compiler peeks into the string literal of the > >>>> asm(). At least gcc doesn't, and even if it did it couldn't infer an > >>>> ABI from seeing a CALL insn. > >>> > >>> Please bear with me, but then I don't understand what Linux is doing > >>> in arch/x86/include/asm/arch_hweight.h. I see no clobbers there, > >>> neither it seems like the __sw_hweight{32/64} functions are built > >>> without the usage of the scratch registers. > >> > >> __sw_hweight{32,64} are implemented in assembly, avoiding most > >> scratch registers while pushing/popping the ones which do get > >> altered. > > > > Oh right, I was looking at lib/hweight.c instead of the arch one. > > > > Would you agree to use the no_caller_saved_registers attribute (which > > is available AFAICT for both gcc and clang) for generic_hweightXX and > > then remove the asm prefix code in favour of the defines for > > hweight{8/16}? > > At least for gcc no_caller_saved_registers isn't old enough to be > used unconditionally (nor is its companion -mgeneral-regs-only). 
> If you tell me it's fine to use unconditionally with clang, then > I can see about making this the preferred variant, with the > present one as a fallback. Hm, so my suggestion was bad: no_caller_saved_registers is only implemented starting with clang 5, which is newer than the minimum we currently require (3.5). So apart from adding a clobber to the asm instance covering the scratch registers, the only option I can see as viable is using a bunch of dummy global variables assigned to the registers that we need to prevent the software generic_hweightXX functions from using, but that's ugly and will likely trigger warnings at least, since I'm not sure the compiler will find it safe to clobber a function call register with a global variable. Or adding a prologue / epilogue to the call instruction in order to push / pop the relevant registers. Neither seems like a very good option IMO. I also assume that using no_caller_saved_registers when available, or else keeping the current behavior, is not an acceptable solution? FWIW, from a FreeBSD PoV I would be OK with that, as I don't think there are any supported targets with clang < 5. Thanks, Roger.
On 20.05.2020 19:18, Roger Pau Monné wrote: > I also assume that using no_caller_saved_registers when available or > else keeping the current behavior is not an acceptable solution? FWIW, > from a FreeBSD PoV I would be OK with that, as I don't think there are > any supported targets with clang < 5. By "current behavior" do you mean what the patch currently does (matching my earlier response that I may try going this route) or the unpatched upstream tree? Jan
On Fri, May 22, 2020 at 11:58:40AM +0200, Jan Beulich wrote: > On 20.05.2020 19:18, Roger Pau Monné wrote: > > I also assume that using no_caller_saved_registers when available or > > else keeping the current behavior is not an acceptable solution? FWIW, > > from a FreeBSD PoV I would be OK with that, as I don't think there are > > any supported targets with clang < 5. > > By "current behavior" do you mean what the patch currently > does (matching my earlier response that I may try going this > route) or the unpatched upstream tree? Sorry this wasn't clear. By current I meant what's in the upstream tree. IMO having both no_caller_saved_registers, the -ffixed-reg stuff and the current upstream no popcnt implementation is kind of too much divergence? My preference would be to add popcnt support only when no_caller_saved_registers is supported by the compiler, as I think that's the cleanest solution. This is purely a performance optimization, so it doesn't seem that bad to me to request a newish compiler in order to enable it. Roger.
On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote: > This is faster than using the software implementation, and the insn is > available on all half-way recent hardware. Therefore convert > generic_hweight<N>() to out-of-line functions (without affecting Arm) > and use alternatives patching to replace the function calls. > > Note that the approach doesn#t work for clang, due to it not recognizing > -ffixed-*. I've been giving this a look, and I wonder if it would be fine to simply push and pop the scratch registers in the 'call' path of the alternative, as that won't require any specific compiler option. Thanks, Roger.
On 17/03/2023 11:22 am, Roger Pau Monné wrote: > On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote: >> This is faster than using the software implementation, and the insn is >> available on all half-way recent hardware. Therefore convert >> generic_hweight<N>() to out-of-line functions (without affecting Arm) >> and use alternatives patching to replace the function calls. >> >> Note that the approach doesn#t work for clang, due to it not recognizing >> -ffixed-*. > I've been giving this a look, and I wonder if it would be fine to > simply push and pop the scratch registers in the 'call' path of the > alternative, as that won't require any specific compiler option. It's been a long while, and in that time I've learnt a lot more about performance, but my root objection to the approach taken here still stands - it is penalising the common case to optimise some pointless corner cases. Yes - on the call path, an extra push/pop pair (or few) to get temp registers is basically free. Right now, the generic_hweight() helpers are static inline in xen/bitops.h and this is nonsense. The absolute best they should be is extern inline in our new lib/ and I'll bet that the compilers stop inlining them there and then. Given new ABIs like x86_64-v2 (which guarantees POPCNT as an available feature), it would be nice to arrange to use __builtin_popcnt() to let the compiler optimise to its heart's content, but outside of this case, it is actively damaging to try and optimise for memory operands or to inline the 8/16 case. So, for x86 specifically, we want: if ( CONFIG_POPCNT ) __builtin_popcnt() else ALT( "popcnt D -> a", "call arch_popcnt$N" ) and we can write arch_popcnt* in x86's lib/ and in assembly, because these are trivial enough and we do need to be careful with registers/etc. I'm not sure if a "+D" vs "D" will matter at the top level. Probably not, so it might be an easy way to get one temp register. Other temp registers can come from push/pop. While we're at it, we should split hweight out of bitops and write the common header in such a way that it defaults to the generic implementations in lib/, and that will subsume the ARM header and also make this work on RISC-V for free. ~Andrew
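Rendered as actual code, the shape Andrew sketches would look roughly like this (CONFIG_POPCNT is a hypothetical Kconfig option, arch_popcnt32 a hypothetical assembly helper with known register behaviour, and the builtin is spelled __builtin_popcount() in both gcc and clang):

    /* Illustrative only; not the patch under review. */
    static inline unsigned int hweight32(uint32_t x)
    {
    #ifdef CONFIG_POPCNT
        /* Baseline guarantees POPCNT, so the compiler may emit it directly
         * (it will only do so when allowed to assume the insn, e.g. via
         * -mpopcnt or an x86-64-v2 baseline). */
        return __builtin_popcount(x);
    #else
        unsigned int res;

        /* Patched at boot: the CALL is replaced by POPCNT when available. */
        alternative_io("call arch_popcnt32",
                       "popcnt %[val], %[res]",
                       X86_FEATURE_POPCNT,
                       [res] "=a" (res), [val] "D" (x));
        return res;
    #endif
    }

Whether %[val] needs to be "+D" rather than "D" depends on whether arch_popcnt32 preserves %rdi, which is the "+D" vs "D" question Andrew raises above.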
On 17.03.2023 13:26, Andrew Cooper wrote: > On 17/03/2023 11:22 am, Roger Pau Monné wrote: >> On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote: >>> This is faster than using the software implementation, and the insn is >>> available on all half-way recent hardware. Therefore convert >>> generic_hweight<N>() to out-of-line functions (without affecting Arm) >>> and use alternatives patching to replace the function calls. >>> >>> Note that the approach doesn#t work for clang, due to it not recognizing >>> -ffixed-*. >> I've been giving this a look, and I wonder if it would be fine to >> simply push and pop the scratch registers in the 'call' path of the >> alternative, as that won't require any specific compiler option. Hmm, ... > It's been a long while, and in that time I've learnt a lot more about > performance, but my root objection to the approach taken here still > stands - it is penalising the common case to optimise some pointless > corner cases. > > Yes - on the call path, an extra push/pop pair (or few) to get temp > registers is basically free. ... what is "a few"? We'd need to push/pop all call-clobbered registers except %rax, i.e. a total of eight. I consider this too much. Unless, as you suggest further down, we wrote the fallback in assembly. Which I have to admit I'm surprised you propose when we strive to reduce the amount of assembly we have to maintain. > Right now, the generic_hweight() helpers are static inline in > xen/bitops.h and this is nonsense. The absolute best they should be is > extern inline in our new lib/ and I'll bet that the compilers stop > inlining them there and then. That would be an orthogonal move, wouldn't it? I'm also not really willing to go as far as calling the present way of it working "nonsense". I could be talked into doing such a transformation in a separate patch, but only if it is halfway certain that this won't again be effort invested just to then face further opposition (other maintainers may not agree with the movement, as we've seen for other remotely similar changes to "extern inline"). > Given new abi's like x86_64-v2 (which guarantees POPCNT as an available > feature), it would be nice to arrange to use __builtin_popcnt() to let > the compiler optimise to its hearts content, but outside of this case, > it is actively damaging to try and optimise for memory operands or to > inline the 8/16 case. > > So, for x86 specifically, we want: > > if ( CONFIG_POPCNT ) > __builtin_popcnt() > else > ALT( "popcnt D -> a", > "call arch_popcnt$N" ) > > and we can write arch_popcnt* in x86's lib/ and in assembly, because > these are trivial enough and we do need to be careful with registers/etc. How does x86_64-v2 matter here? And how does using __builtin_popcnt() help, which would use the "popcnt" insn only if we passed -march=popcnt (or any wider option implying this one) to the compiler? As to the 8-/16-bit case - I've already accepted to drop that special casing. The main reason I had it separate was because the generic code also has them special cased. In any event I would like to make all agreed upon changes in one go, hence why I didn't submit a new version yet. > I'm not sure if a "+D" vs "D" will matter at the top level. Probably > not, so it might be an easy way to get one tempt register. Other temp > registers can come from push/pop. 
> > > While we're at it, we should split hweight out of bitops and write the > common header in such a way that it defaults to the generic > implementations in lib/, and that will subsume the ARM header and also > make this work on RISC-V for free. Yet another independent change you're asking for. I've taken note of both of these separate requests, but without any guarantee (yet) that I'm going to actually carry them out. Jan
On Mon, Mar 20, 2023 at 10:48:45AM +0100, Jan Beulich wrote: > On 17.03.2023 13:26, Andrew Cooper wrote: > > On 17/03/2023 11:22 am, Roger Pau Monné wrote: > >> On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote: > >>> This is faster than using the software implementation, and the insn is > >>> available on all half-way recent hardware. Therefore convert > >>> generic_hweight<N>() to out-of-line functions (without affecting Arm) > >>> and use alternatives patching to replace the function calls. > >>> > >>> Note that the approach doesn#t work for clang, due to it not recognizing > >>> -ffixed-*. > >> I've been giving this a look, and I wonder if it would be fine to > >> simply push and pop the scratch registers in the 'call' path of the > >> alternative, as that won't require any specific compiler option. > > Hmm, ... > > > It's been a long while, and in that time I've learnt a lot more about > > performance, but my root objection to the approach taken here still > > stands - it is penalising the common case to optimise some pointless > > corner cases. > > > > Yes - on the call path, an extra push/pop pair (or few) to get temp > > registers is basically free. > > ... what is "a few"? We'd need to push/pop all call-clobbered registers > except %rax, i.e. a total of eight. I consider this too much. Unless, > as you suggest further down, we wrote the fallback in assembly. Which I > have to admit I'm surprised you propose when we strive to reduce the > amount of assembly we have to maintain. AMD added popcnt in 2007 and Intel in 2008. While we shouldn't mandate popcnt support, I think we also shouldn't overly worry about the non-popcnt path. Also, how can you assert that the code generated without the scratch registers being usable won't be worse than the penalty of pushing and popping such registers on the stack and letting the routines use all registers freely? I very much prefer to have a non-optimal non-popcnt path, but have popcnt support for both gcc and clang, and without requiring any compiler options. Thanks, Roger.
On 21.03.2023 15:57, Roger Pau Monné wrote: > On Mon, Mar 20, 2023 at 10:48:45AM +0100, Jan Beulich wrote: >> On 17.03.2023 13:26, Andrew Cooper wrote: >>> On 17/03/2023 11:22 am, Roger Pau Monné wrote: >>>> On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote: >>>>> This is faster than using the software implementation, and the insn is >>>>> available on all half-way recent hardware. Therefore convert >>>>> generic_hweight<N>() to out-of-line functions (without affecting Arm) >>>>> and use alternatives patching to replace the function calls. >>>>> >>>>> Note that the approach doesn#t work for clang, due to it not recognizing >>>>> -ffixed-*. >>>> I've been giving this a look, and I wonder if it would be fine to >>>> simply push and pop the scratch registers in the 'call' path of the >>>> alternative, as that won't require any specific compiler option. >> >> Hmm, ... >> >>> It's been a long while, and in that time I've learnt a lot more about >>> performance, but my root objection to the approach taken here still >>> stands - it is penalising the common case to optimise some pointless >>> corner cases. >>> >>> Yes - on the call path, an extra push/pop pair (or few) to get temp >>> registers is basically free. >> >> ... what is "a few"? We'd need to push/pop all call-clobbered registers >> except %rax, i.e. a total of eight. I consider this too much. Unless, >> as you suggest further down, we wrote the fallback in assembly. Which I >> have to admit I'm surprised you propose when we strive to reduce the >> amount of assembly we have to maintain. > > AMD added popcnt in 2007 and Intel in 2008. While we shouldn't > mandate popcnt support, I think we also shouldn't overly worry about > the non-popcnt path. We agree here. > Also, how can you assert that the code generated without the scratch > registers being usable won't be worse than the penalty of pushing and > popping such registers on the stack and letting the routines use all > registers freely? Irrespective of the -ffixed-* the compiler can make itself sufficiently many registers available to use, by preserving just the ones it actually uses. So that's pretty certainly not more PUSH/POP than when we would blindly preserve all which may need preserving (no matter whether they're actually used). Yet then both this question and ... > I very much prefer to have a non-optimal non-popcnt path, but have > popcnt support for both gcc and clang, and without requiring any > compiler options. ... this makes me wonder whether we're thinking of fundamentally different generated code that we would end up with. Since the (apparently agreed upon) goal is to optimize for the "popcnt available" case, mainline code should be not significantly longer that what's needed for the single instruction itself, or alternatively a CALL insn. Adding pushes/pops of all call clobbered registers around the CALL would grow mainline code too much (for my taste), i.e. irrespective of these PUSH/POP all getting NOP-ed out by alternatives patching. (As an aside I consider fiddling with the stack pointer in inline asm() risky, and hence I would prefer to avoid such whenever possible. Yes, it can be written so it's independent of what the compiler thinks the stack pointer points at, but such constructs are fragile as soon as one wants to change them a little later on.) Are you perhaps thinking of indeed having the PUSH/POP in mainline code? Or some entirely different model? 
What I could see us doing to accommodate Clang is to have wrappers around the actual functions which do as you suggest and preserve all relevant registers, no matter whether they're used. That'll still be assembly code to maintain, but imo less of a concern than in Andrew's model of writing hweight<N>() themselves in assembly (provided I understood him correctly), for being purely mechanical operations. Jan
On 21.03.23 16:35, Jan Beulich wrote: > On 21.03.2023 15:57, Roger Pau Monné wrote: >> On Mon, Mar 20, 2023 at 10:48:45AM +0100, Jan Beulich wrote: >>> On 17.03.2023 13:26, Andrew Cooper wrote: >>>> On 17/03/2023 11:22 am, Roger Pau Monné wrote: >>>>> On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote: >>>>>> This is faster than using the software implementation, and the insn is >>>>>> available on all half-way recent hardware. Therefore convert >>>>>> generic_hweight<N>() to out-of-line functions (without affecting Arm) >>>>>> and use alternatives patching to replace the function calls. >>>>>> >>>>>> Note that the approach doesn#t work for clang, due to it not recognizing >>>>>> -ffixed-*. >>>>> I've been giving this a look, and I wonder if it would be fine to >>>>> simply push and pop the scratch registers in the 'call' path of the >>>>> alternative, as that won't require any specific compiler option. >>> >>> Hmm, ... >>> >>>> It's been a long while, and in that time I've learnt a lot more about >>>> performance, but my root objection to the approach taken here still >>>> stands - it is penalising the common case to optimise some pointless >>>> corner cases. >>>> >>>> Yes - on the call path, an extra push/pop pair (or few) to get temp >>>> registers is basically free. >>> >>> ... what is "a few"? We'd need to push/pop all call-clobbered registers >>> except %rax, i.e. a total of eight. I consider this too much. Unless, >>> as you suggest further down, we wrote the fallback in assembly. Which I >>> have to admit I'm surprised you propose when we strive to reduce the >>> amount of assembly we have to maintain. >> >> AMD added popcnt in 2007 and Intel in 2008. While we shouldn't >> mandate popcnt support, I think we also shouldn't overly worry about >> the non-popcnt path. > > We agree here. > >> Also, how can you assert that the code generated without the scratch >> registers being usable won't be worse than the penalty of pushing and >> popping such registers on the stack and letting the routines use all >> registers freely? > > Irrespective of the -ffixed-* the compiler can make itself sufficiently > many registers available to use, by preserving just the ones it actually > uses. So that's pretty certainly not more PUSH/POP than when we would > blindly preserve all which may need preserving (no matter whether > they're actually used). > > Yet then both this question and ... > >> I very much prefer to have a non-optimal non-popcnt path, but have >> popcnt support for both gcc and clang, and without requiring any >> compiler options. > > ... this makes me wonder whether we're thinking of fundamentally > different generated code that we would end up with. Since the > (apparently agreed upon) goal is to optimize for the "popcnt > available" case, mainline code should be not significantly longer that > what's needed for the single instruction itself, or alternatively a > CALL insn. Adding pushes/pops of all call clobbered registers around > the CALL would grow mainline code too much (for my taste), i.e. > irrespective of these PUSH/POP all getting NOP-ed out by alternatives > patching. (As an aside I consider fiddling with the stack pointer in > inline asm() risky, and hence I would prefer to avoid such whenever > possible. Yes, it can be written so it's independent of what the > compiler thinks the stack pointer points at, but such constructs are > fragile as soon as one wants to change them a little later on.) 
> > Are you perhaps thinking of indeed having the PUSH/POP in mainline > code? Or some entirely different model? > > What I could see us doing to accommodate Clang is to have wrappers > around the actual functions which do as you suggest and preserve all > relevant registers, no matter whether they're used. You could just copy the PV_CALLEE_SAVE_REGS_THUNK() macro from the Linux kernel, which is doing exactly that. Juergen
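In that spirit, such a wrapper reduces to a small assembly thunk; a hedged sketch (names hypothetical, not the Linux macro verbatim), keeping the %rdi-in/%rax-out convention of the patch:

    /* Preserve every call-clobbered register except %rax (result) and %rdi
     * (already marked as modified at the call site), then defer to the C
     * fallback.  Seven pushes keep the stack 16-byte aligned at the call. */
    asm ( ".globl hweight32_thunk\n"
          "hweight32_thunk:\n\t"
          "push %rcx; push %rdx; push %rsi\n\t"
          "push %r8 ; push %r9 ; push %r10; push %r11\n\t"
          "call generic_hweight32\n\t"
          "pop  %r11; pop  %r10; pop  %r9 ; pop  %r8\n\t"
          "pop  %rsi; pop  %rdx; pop  %rcx\n\t"
          "ret\n" );

The call site would then patch "call hweight32_thunk" against POPCNT without needing any clobbers or -ffixed-* options, at the price of the extra pushes and pops on the (rare) non-POPCNT path.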
On Tue, Mar 21, 2023 at 04:35:54PM +0100, Jan Beulich wrote: > On 21.03.2023 15:57, Roger Pau Monné wrote: > > On Mon, Mar 20, 2023 at 10:48:45AM +0100, Jan Beulich wrote: > >> On 17.03.2023 13:26, Andrew Cooper wrote: > >>> On 17/03/2023 11:22 am, Roger Pau Monné wrote: > >>>> On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote: > >>>>> This is faster than using the software implementation, and the insn is > >>>>> available on all half-way recent hardware. Therefore convert > >>>>> generic_hweight<N>() to out-of-line functions (without affecting Arm) > >>>>> and use alternatives patching to replace the function calls. > >>>>> > >>>>> Note that the approach doesn#t work for clang, due to it not recognizing > >>>>> -ffixed-*. > >>>> I've been giving this a look, and I wonder if it would be fine to > >>>> simply push and pop the scratch registers in the 'call' path of the > >>>> alternative, as that won't require any specific compiler option. > >> > >> Hmm, ... > >> > >>> It's been a long while, and in that time I've learnt a lot more about > >>> performance, but my root objection to the approach taken here still > >>> stands - it is penalising the common case to optimise some pointless > >>> corner cases. > >>> > >>> Yes - on the call path, an extra push/pop pair (or few) to get temp > >>> registers is basically free. > >> > >> ... what is "a few"? We'd need to push/pop all call-clobbered registers > >> except %rax, i.e. a total of eight. I consider this too much. Unless, > >> as you suggest further down, we wrote the fallback in assembly. Which I > >> have to admit I'm surprised you propose when we strive to reduce the > >> amount of assembly we have to maintain. > > > > AMD added popcnt in 2007 and Intel in 2008. While we shouldn't > > mandate popcnt support, I think we also shouldn't overly worry about > > the non-popcnt path. > > We agree here. > > > Also, how can you assert that the code generated without the scratch > > registers being usable won't be worse than the penalty of pushing and > > popping such registers on the stack and letting the routines use all > > registers freely? > > Irrespective of the -ffixed-* the compiler can make itself sufficiently > many registers available to use, by preserving just the ones it actually > uses. So that's pretty certainly not more PUSH/POP than when we would > blindly preserve all which may need preserving (no matter whether > they're actually used). > > Yet then both this question and ... > > > I very much prefer to have a non-optimal non-popcnt path, but have > > popcnt support for both gcc and clang, and without requiring any > > compiler options. > > ... this makes me wonder whether we're thinking of fundamentally > different generated code that we would end up with. Since the > (apparently agreed upon) goal is to optimize for the "popcnt > available" case, mainline code should be not significantly longer that > what's needed for the single instruction itself, or alternatively a > CALL insn. Adding pushes/pops of all call clobbered registers around > the CALL would grow mainline code too much (for my taste), i.e. > irrespective of these PUSH/POP all getting NOP-ed out by alternatives > patching. (As an aside I consider fiddling with the stack pointer in > inline asm() risky, and hence I would prefer to avoid such whenever > possible. Is this because we are likely to not end up with a proper trace if we mess up with the stack pointer before a function call? 
I would like to better understand your concerns with the stack pointer and inline asm(). Other options would be using no_caller_saved_registers, but that's not available in all supported gcc versions, likely the same with clang, but I wouldn't mind upping the minimal clang version. What about allocating the size of the registers that need saving as an on-stack variable visible to the compiler and then moving to/from there in the inline asm()? > Yes, it can be written so it's independent of what the > compiler thinks the stack pointer points at, but such constructs are > fragile as soon as one wants to change them a little later on.) > > Are you perhaps thinking of indeed having the PUSH/POP in mainline > code? Or some entirely different model? > > What I could see us doing to accommodate Clang is to have wrappers > around the actual functions which do as you suggest and preserve all > relevant registers, no matter whether they're used. That'll still be > assembly code to maintain, but imo less of a concern than in Andrew's > model of writing hweight<N>() themselves in assembly (provided I > understood him correctly), for being purely mechanical operations. If possible I would prefer to use the same model for both gcc and clang, or else things tend to get more complicated than strictly needed. I also wonder whether we could get away without disabling coverage and ubsan for the hweight object file. But I assume that as long as we do the function call in inline asm() (and thus keep it hidden from the compiler) we need to disable any instrumentation because of the changed function context. I don't think there's any way we can manually add the function call prefix/suffix in the inline asm()? Thanks, Roger.
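One possible reading of the "compiler-visible save area" suggestion, as a hypothetical sketch (this is also where Jan's code-growth concern comes from: all of the moves below sit in mainline code, even though alternatives patching would NOP them out together with the CALL on POPCNT-capable hardware):

    /* Hypothetical sketch: scratch registers are parked in an ordinary local
     * array, so %rsp itself is never moved inside the asm().  The pointer is
     * forced into %rbx (callee-saved) so it survives the call. */
    static inline unsigned int hweight32_saved(unsigned int x)
    {
        unsigned long save[7];
        unsigned int res, val = x;

        asm volatile ( "mov %%rcx,  0(%[s]); mov %%rdx,  8(%[s]); mov %%rsi, 16(%[s])\n\t"
                       "mov %%r8,  24(%[s]); mov %%r9,  32(%[s]); mov %%r10, 40(%[s])\n\t"
                       "mov %%r11, 48(%[s])\n\t"
                       "call generic_hweight32\n\t"
                       "mov  0(%[s]), %%rcx; mov  8(%[s]), %%rdx; mov 16(%[s]), %%rsi\n\t"
                       "mov 24(%[s]), %%r8 ; mov 32(%[s]), %%r9 ; mov 40(%[s]), %%r10\n\t"
                       "mov 48(%[s]), %%r11"
                       : "=a" (res), "+D" (val)
                       : [s] "b" (save)
                       : "memory" );
        return res;
    }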
On 21.03.2023 17:31, Roger Pau Monné wrote: > On Tue, Mar 21, 2023 at 04:35:54PM +0100, Jan Beulich wrote: >> On 21.03.2023 15:57, Roger Pau Monné wrote: >>> On Mon, Mar 20, 2023 at 10:48:45AM +0100, Jan Beulich wrote: >>>> On 17.03.2023 13:26, Andrew Cooper wrote: >>>>> On 17/03/2023 11:22 am, Roger Pau Monné wrote: >>>>>> On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote: >>>>>>> This is faster than using the software implementation, and the insn is >>>>>>> available on all half-way recent hardware. Therefore convert >>>>>>> generic_hweight<N>() to out-of-line functions (without affecting Arm) >>>>>>> and use alternatives patching to replace the function calls. >>>>>>> >>>>>>> Note that the approach doesn#t work for clang, due to it not recognizing >>>>>>> -ffixed-*. >>>>>> I've been giving this a look, and I wonder if it would be fine to >>>>>> simply push and pop the scratch registers in the 'call' path of the >>>>>> alternative, as that won't require any specific compiler option. >>>> >>>> Hmm, ... >>>> >>>>> It's been a long while, and in that time I've learnt a lot more about >>>>> performance, but my root objection to the approach taken here still >>>>> stands - it is penalising the common case to optimise some pointless >>>>> corner cases. >>>>> >>>>> Yes - on the call path, an extra push/pop pair (or few) to get temp >>>>> registers is basically free. >>>> >>>> ... what is "a few"? We'd need to push/pop all call-clobbered registers >>>> except %rax, i.e. a total of eight. I consider this too much. Unless, >>>> as you suggest further down, we wrote the fallback in assembly. Which I >>>> have to admit I'm surprised you propose when we strive to reduce the >>>> amount of assembly we have to maintain. >>> >>> AMD added popcnt in 2007 and Intel in 2008. While we shouldn't >>> mandate popcnt support, I think we also shouldn't overly worry about >>> the non-popcnt path. >> >> We agree here. >> >>> Also, how can you assert that the code generated without the scratch >>> registers being usable won't be worse than the penalty of pushing and >>> popping such registers on the stack and letting the routines use all >>> registers freely? >> >> Irrespective of the -ffixed-* the compiler can make itself sufficiently >> many registers available to use, by preserving just the ones it actually >> uses. So that's pretty certainly not more PUSH/POP than when we would >> blindly preserve all which may need preserving (no matter whether >> they're actually used). >> >> Yet then both this question and ... >> >>> I very much prefer to have a non-optimal non-popcnt path, but have >>> popcnt support for both gcc and clang, and without requiring any >>> compiler options. >> >> ... this makes me wonder whether we're thinking of fundamentally >> different generated code that we would end up with. Since the >> (apparently agreed upon) goal is to optimize for the "popcnt >> available" case, mainline code should be not significantly longer that >> what's needed for the single instruction itself, or alternatively a >> CALL insn. Adding pushes/pops of all call clobbered registers around >> the CALL would grow mainline code too much (for my taste), i.e. >> irrespective of these PUSH/POP all getting NOP-ed out by alternatives >> patching. (As an aside I consider fiddling with the stack pointer in >> inline asm() risky, and hence I would prefer to avoid such whenever >> possible. 
> > Is this because we are likely to not end up with a proper trace if we > mess up with the stack pointer before a function call? That's a related issue, but not what I was thinking about. > I would like > to better understand your concerns with the stack pointer and inline > asm(). You can't use local variables anymore with "m" or "rm" constraints, as they might be accessed via %rsp. > Other options would be using no_caller_saved_registers, but that's not > available in all supported gcc versions, likely the same with clang, > but I wouldn't mind upping the minimal clang version. Right, you did suggest using this attribute before. But we continue to support older gcc. > What about allocating the size of the registers that need saving as an > on-stack variable visible to the compiler and then moving to/from > there in the inline asm()? That would address the fiddling-with-%rsp concern, but would further grow mainline code size. >> Yes, it can be written so it's independent of what the >> compiler thinks the stack pointer points at, but such constructs are >> fragile as soon as one wants to change them a little later on.) >> >> Are you perhaps thinking of indeed having the PUSH/POP in mainline >> code? Or some entirely different model? >> >> What I could see us doing to accommodate Clang is to have wrappers >> around the actual functions which do as you suggest and preserve all >> relevant registers, no matter whether they're used. That'll still be >> assembly code to maintain, but imo less of a concern than in Andrew's >> model of writing hweight<N>() themselves in assembly (provided I >> understood him correctly), for being purely mechanical operations. > > If possible I would prefer to use the same model for both gcc and > clang, or else things tend to get more complicated than strictly > needed. We can always decide to accept the extra overhead even with gcc. > I also wonder whether we could also get away without disabling of > coverage and ubsan for the hweight object file. But I assume as long > ass we do the function call in inline asm() (and thus kind of hidden > from the compiler) we need to disable any instrumentation because the > changed function context. I don't think there's anyway we can > manually add the function call prefix/suffix in the inline asm()? I don't know whether that would be possible (and portable across versions). But what I'm more concerned about is that such functions may also end up clobbering certain call-clobbered registers. (Note that when writing these in assembly, as suggested by Andrew, no such hooks would be used either.) Jan
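A contrived illustration of the hazard Jan describes (hypothetical code, just to show the failure mode):

    /* If the asm() itself pushes before using a memory operand, an
     * %rsp-relative operand such as 8(%rsp) no longer points at 'local',
     * so the load reads the wrong stack slot. */
    static int rsp_hazard(void)
    {
        int local = 42, out;

        asm ( "push %%rcx\n\t"           /* shifts %rsp down by 8 ...          */
              "mov  %[src], %[dst]\n\t"  /* ... so an %rsp-based %[src] is off */
              "pop  %%rcx"
              : [dst] "=r" (out)
              : [src] "m" (local) );
        return out;
    }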
On Tue, Mar 21, 2023 at 05:41:52PM +0100, Jan Beulich wrote:
> On 21.03.2023 17:31, Roger Pau Monné wrote:
> > On Tue, Mar 21, 2023 at 04:35:54PM +0100, Jan Beulich wrote:
> >> On 21.03.2023 15:57, Roger Pau Monné wrote:
> >>> On Mon, Mar 20, 2023 at 10:48:45AM +0100, Jan Beulich wrote:
> >>>> On 17.03.2023 13:26, Andrew Cooper wrote:
> >>>>> On 17/03/2023 11:22 am, Roger Pau Monné wrote:
> >>>>>> On Mon, Jul 15, 2019 at 02:39:04PM +0000, Jan Beulich wrote:
> >>>>>>> This is faster than using the software implementation, and the insn is available on all half-way recent hardware. Therefore convert generic_hweight<N>() to out-of-line functions (without affecting Arm) and use alternatives patching to replace the function calls.
> >>>>>>>
> >>>>>>> Note that the approach doesn't work for clang, due to it not recognizing -ffixed-*.
> >>>>>> I've been giving this a look, and I wonder if it would be fine to simply push and pop the scratch registers in the 'call' path of the alternative, as that won't require any specific compiler option.
> >>>>
> >>>> Hmm, ...
> >>>>
> >>>>> It's been a long while, and in that time I've learnt a lot more about performance, but my root objection to the approach taken here still stands - it is penalising the common case to optimise some pointless corner cases.
> >>>>>
> >>>>> Yes - on the call path, an extra push/pop pair (or few) to get temp registers is basically free.
> >>>>
> >>>> ... what is "a few"? We'd need to push/pop all call-clobbered registers except %rax, i.e. a total of eight. I consider this too much. Unless, as you suggest further down, we wrote the fallback in assembly. Which I have to admit I'm surprised you propose when we strive to reduce the amount of assembly we have to maintain.
> >>>
> >>> AMD added popcnt in 2007 and Intel in 2008. While we shouldn't mandate popcnt support, I think we also shouldn't overly worry about the non-popcnt path.
> >>
> >> We agree here.
> >>
> >>> Also, how can you assert that the code generated without the scratch registers being usable won't be worse than the penalty of pushing and popping such registers on the stack and letting the routines use all registers freely?
> >>
> >> Irrespective of the -ffixed-* the compiler can make itself sufficiently many registers available to use, by preserving just the ones it actually uses. So that's pretty certainly not more PUSH/POP than when we would blindly preserve all which may need preserving (no matter whether they're actually used).
> >>
> >> Yet then both this question and ...
> >>
> >>> I very much prefer to have a non-optimal non-popcnt path, but have popcnt support for both gcc and clang, and without requiring any compiler options.
> >>
> >> ... this makes me wonder whether we're thinking of fundamentally different generated code that we would end up with. Since the (apparently agreed-upon) goal is to optimize for the "popcnt available" case, mainline code should not be significantly longer than what's needed for the single instruction itself, or alternatively a CALL insn. Adding pushes/pops of all call-clobbered registers around the CALL would grow mainline code too much (for my taste), i.e. irrespective of these PUSH/POP all getting NOP-ed out by alternatives patching. (As an aside, I consider fiddling with the stack pointer in inline asm() risky, and hence I would prefer to avoid such whenever possible.
> >
> > Is this because we are likely to not end up with a proper trace if we mess up with the stack pointer before a function call?
>
> That's a related issue, but not what I was thinking about.
>
> > I would like to better understand your concerns with the stack pointer and inline asm().
>
> You can't use local variables anymore with "m" or "rm" constraints, as they might be accessed via %rsp.
>
> > Another option would be using no_caller_saved_registers, but that's not available in all supported gcc versions, likely the same with clang, but I wouldn't mind upping the minimal clang version.
>
> Right, you did suggest using this attribute before. But we continue to support older gcc.

FWIW, I would be fine with not enabling the optimization if the attribute is not available, but then again this means adding more logic. In the end this is just an optimization, so I think it's fine to require a gcc version newer than the supported baseline.

> > What about allocating an on-stack variable, visible to the compiler, big enough for the registers that need saving, and then moving to/from there in the inline asm()?
>
> That would address the fiddling-with-%rsp concern, but would further grow mainline code size.

I'm fine with such growth, unless you can prove that the growth in code size outweighs the performance win from using popcnt.

> >> Yes, it can be written so it's independent of what the compiler thinks the stack pointer points at, but such constructs are fragile as soon as one wants to change them a little later on.)
> >>
> >> Are you perhaps thinking of indeed having the PUSH/POP in mainline code? Or some entirely different model?
> >>
> >> What I could see us doing to accommodate Clang is to have wrappers around the actual functions which do as you suggest and preserve all relevant registers, no matter whether they're used. That'll still be assembly code to maintain, but imo less of a concern than in Andrew's model of writing hweight<N>() themselves in assembly (provided I understood him correctly), for being purely mechanical operations.
> >
> > If possible I would prefer to use the same model for both gcc and clang, or else things tend to get more complicated than strictly needed.
>
> We can always decide to accept the extra overhead even with gcc.
>
> > I also wonder whether we could get away without disabling coverage and UBSAN instrumentation for the hweight object file. But I assume that, as long as we do the function call in inline asm() (and thus keep it kind of hidden from the compiler), we need to disable any instrumentation because of the changed function context. I don't think there's any way we can manually add the function call prefix/suffix in the inline asm()?
>
> I don't know whether that would be possible (and portable across versions). But what I'm more concerned about is that such functions may also end up clobbering certain call-clobbered registers. (Note that when writing these in assembly, as suggested by Andrew, no such hooks would be used either.)

Right, we would just pick up the Linux assembly for those helpers. I have to admit it feels weird, because I generally try to avoid growing our usage of assembly, and hence this would seem like a step backwards relative to that goal.

Thanks, Roger.
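For reference, a hypothetical sketch of the attribute-based variant mentioned above (illustrative function name, not from the patch; only usable with a gcc/clang new enough to support no_caller_saved_registers, which is exactly the baseline question in this thread). With the attribute the callee preserves every register it touches, so the call site needs neither -ffixed-* nor manual push/pop; the body is the usual SWAR population count, roughly what generic_hweight32() computes.

    /*
     * Hypothetical out-of-line fallback that preserves all registers it
     * uses, so an inline-asm CALL to it needs no extra register saving.
     */
    unsigned int __attribute__((no_caller_saved_registers))
    hweight32_fallback(unsigned int x)
    {
        /* SWAR population count: sum bit pairs, nibbles, then bytes. */
        x -= (x >> 1) & 0x55555555U;
        x  = (x & 0x33333333U) + ((x >> 2) & 0x33333333U);
        x  = (x + (x >> 4)) & 0x0f0f0f0fU;
        return (x * 0x01010101U) >> 24;   /* top byte holds the total */
    }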
--- a/xen/arch/x86/Makefile
+++ b/xen/arch/x86/Makefile
@@ -31,6 +31,10 @@ obj-y += emul-i8254.o
 obj-y += extable.o
 obj-y += flushtlb.o
 obj-$(CONFIG_CRASH_DEBUG) += gdbstub.o
+# clang doesn't appear to know of -ffixed-*
+hweight-$(gcc) := hweight.o
+hweight-$(clang) :=
+obj-y += $(hweight-y)
 obj-y += hypercall.o
 obj-y += i387.o
 obj-y += i8259.o
@@ -251,6 +255,10 @@ boot/mkelf32: boot/mkelf32.c
 efi/mkreloc: efi/mkreloc.c
 	$(HOSTCC) $(HOSTCFLAGS) -g -o $@ $<
 
+nocov-y += hweight.o
+noubsan-y += hweight.o
+hweight.o: CFLAGS += $(foreach reg,cx dx si 8 9 10 11,-ffixed-r$(reg))
+
 .PHONY: clean
 clean::
 	rm -f asm-offsets.s *.lds boot/*.o boot/*~ boot/core boot/mkelf32
--- /dev/null
+++ b/xen/arch/x86/hweight.c
@@ -0,0 +1,21 @@
+#define generic_hweight64 _hweight64
+#define generic_hweight32 _hweight32
+#define generic_hweight16 _hweight16
+#define generic_hweight8 _hweight8
+
+#include <xen/compiler.h>
+
+#undef inline
+#define inline always_inline
+
+#include <xen/bitops.h>
+
+#undef generic_hweight8
+#undef generic_hweight16
+#undef generic_hweight32
+#undef generic_hweight64
+
+unsigned int generic_hweight8 (unsigned int x) { return _hweight8 (x); }
+unsigned int generic_hweight16(unsigned int x) { return _hweight16(x); }
+unsigned int generic_hweight32(unsigned int x) { return _hweight32(x); }
+unsigned int generic_hweight64(uint64_t x) { return _hweight64(x); }
--- a/xen/include/asm-x86/bitops.h
+++ b/xen/include/asm-x86/bitops.h
@@ -475,9 +475,36 @@ static inline int fls(unsigned int x)
  *
  * The Hamming Weight of a number is the total number of bits set in it.
  */
+#ifndef __clang__
+/* POPCNT encodings with %{r,e}di input and %{r,e}ax output: */
+#define POPCNT_64 ".byte 0xF3, 0x48, 0x0F, 0xB8, 0xC7"
+#define POPCNT_32 ".byte 0xF3, 0x0F, 0xB8, 0xC7"
+
+#define hweight_(n, x, insn, setup, cout, cin) ({                          \
+    unsigned int res_;                                                     \
+    /*                                                                     \
+     * For the function call the POPCNT input register needs to be marked \
+     * modified as well. Set up a local variable of appropriate type      \
+     * for this purpose.                                                   \
+     */                                                                    \
+    typeof((uint##n##_t)(x) + 0U) val_ = (x);                              \
+    alternative_io(setup "; call generic_hweight" #n,                      \
+                   insn, X86_FEATURE_POPCNT,                               \
+                   ASM_OUTPUT2([res] "=a" (res_), [val] cout (val_)),      \
+                   [src] cin (val_));                                      \
+    res_;                                                                  \
+})
+#define hweight64(x) hweight_(64, x, POPCNT_64, "", "+D", "g")
+#define hweight32(x) hweight_(32, x, POPCNT_32, "", "+D", "g")
+#define hweight16(x) hweight_(16, x, "movzwl %w[src], %[val]; " POPCNT_32, \
+                              "mov %[src], %[val]", "=&D", "rm")
+#define hweight8(x)  hweight_( 8, x, "movzbl %b[src], %[val]; " POPCNT_32, \
+                              "mov %[src], %[val]", "=&D", "rm")
+#else
 #define hweight64(x) generic_hweight64(x)
 #define hweight32(x) generic_hweight32(x)
 #define hweight16(x) generic_hweight16(x)
 #define hweight8(x) generic_hweight8(x)
+#endif
 
 #endif /* _X86_BITOPS_H */
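As a sanity check for whichever variant ends up being committed, a hypothetical self-test along these lines (not part of the patch; it would still need to be invoked from suitable init code) could confirm that the path selected by the alternative agrees with the compiler's builtin population count:

    /* Hypothetical sanity check, not part of the patch. */
    #include <xen/bitops.h>
    #include <xen/init.h>
    #include <xen/lib.h>

    static void __init check_hweight(void)
    {
        /* 0xa5 has 4 bits set, 0xf00f has 8. */
        ASSERT(hweight8(0xa5U) == 4);
        ASSERT(hweight16(0xf00fU) == 8);
        /* Compare the patched paths against the compiler's builtins. */
        ASSERT(hweight32(0xdeadbeefU) == __builtin_popcount(0xdeadbeefU));
        ASSERT(hweight64(0xdeadbeefcafef00dULL) ==
               __builtin_popcountll(0xdeadbeefcafef00dULL));
    }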