@@ -6,6 +6,7 @@
*/
#include <asm/alternative.h>
+#include <asm/asm_defns.h>
#include <asm/cpufeatureset.h>
/*
@@ -475,4 +476,26 @@ static always_inline unsigned int arch_flsl(unsigned long x)
}
#define arch_flsl arch_flsl
+unsigned int arch_generic_hweightl(unsigned long x);
+
+static always_inline unsigned int arch_hweightl(unsigned long x)
+{
+ unsigned int r;
+
+ /*
+ * arch_generic_hweightl() is written in ASM in order to preserve all
+ * registers, as the compiler can't see the call.
+ *
+ * This limits the POPCNT instruction to using the same ABI as a function
+ * call (input in %rdi, output in %eax) but that's fine.
+ */
+ alternative_io("call arch_generic_hweightl",
+ "popcnt %[val], %q[res]", X86_FEATURE_POPCNT,
+ ASM_OUTPUT2([res] "=a" (r) ASM_CALL_CONSTRAINT),
+ [val] "D" (x));
+
+ return r;
+}
+#define arch_hweightl arch_hweightl
+
#endif /* _X86_BITOPS_H */
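The "#define arch_hweightl arch_hweightl" marker is what lets common code
detect that an arch override exists.  A minimal sketch of the dispatch,
assuming the usual #ifdef pattern (the wrapper itself is hypothetical and
not part of this patch), looks like:

    /* Hypothetical common-code wrapper, keying off the #define above. */
    static always_inline unsigned int hweightl(unsigned long x)
    {
        /* Constants fold at compile time without any patching. */
        if ( __builtin_constant_p(x) )
            return __builtin_popcountl(x);

    #ifdef arch_hweightl
        return arch_hweightl(x);    /* POPCNT, or the ASM helper above */
    #else
        return generic_hweightl(x); /* plain-C fallback on other arches */
    #endif
    }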
@@ -36,6 +36,7 @@ lib-y += strtol.o
lib-y += strtoll.o
lib-y += strtoul.o
lib-y += strtoull.o
+lib-$(CONFIG_X86) += x86-generic-hweightl.o
lib-$(CONFIG_X86) += xxhash32.o
lib-$(CONFIG_X86) += xxhash64.o
new file mode 100644
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <xen/bitops.h>
+#include <xen/init.h>
+#include <xen/self-tests.h>
+
+/*
+ * An implementation of generic_hweightl() used on hardware without the POPCNT
+ * instruction.
+ *
+ * This function is called from within an ALTERNATIVE in arch_hweightl(),
+ * i.e. behind the back of the compiler.  Therefore all registers are
+ * callee preserved.
+ *
+ * The ASM is what GCC-12 emits for generic_hweightl() in a release build of
+ * Xen, with spilling of %rdi/%rdx to preserve the caller's registers.
+ *
+ * Note: When we can use __attribute__((no_caller_saved_registers))
+ * unconditionally (GCC 7, Clang 5), we can implement this in plain C.
+ */
+asm (
+ ".type arch_generic_hweightl, STT_FUNC\n\t"
+ ".globl arch_generic_hweightl\n\t"
+ ".hidden arch_generic_hweightl\n\t"
+ ".balign " STR(CONFIG_FUNCTION_ALIGNMENT) ", 0x90\n" /* CODE_FILL */
+ "arch_generic_hweightl:\n\t"
+
+ "push %rdi\n\t"
+ "push %rdx\n\t"
+
+ "movabs $0x5555555555555555, %rdx\n\t"
+ "mov %rdi, %rax\n\t"
+ "shr $1, %rax\n\t"
+ "and %rdx, %rax\n\t"
+ "sub %rax, %rdi\n\t"
+ "movabs $0x3333333333333333, %rax\n\t"
+ "mov %rdi, %rdx\n\t"
+ "shr $2, %rdi\n\t"
+ "and %rax, %rdx\n\t"
+ "and %rax, %rdi\n\t"
+ "add %rdi, %rdx\n\t"
+ "mov %rdx, %rax\n\t"
+ "shr $4, %rax\n\t"
+ "add %rdx, %rax\n\t"
+ "movabs $0x0f0f0f0f0f0f0f0f, %rdx\n\t"
+ "and %rdx, %rax\n\t"
+ "movabs $0x0101010101010101, %rdx\n\t"
+ "imul %rdx, %rax\n\t"
+ "shr $" STR(BITS_PER_LONG) "- 8, %rax\n\t"
+
+ "pop %rdx\n\t"
+ "pop %rdi\n\t"
+
+ "ret\n\t"
+
+ ".size arch_generic_hweightl, . - arch_generic_hweightl\n\t"
+);
+
+#ifdef CONFIG_SELF_TESTS
+static void __init __constructor test_arch_generic_hweightl(void)
+{
+ RUNTIME_CHECK(arch_generic_hweightl, 0, 0);
+ RUNTIME_CHECK(arch_generic_hweightl, 1, 1);
+ RUNTIME_CHECK(arch_generic_hweightl, 3, 2);
+ RUNTIME_CHECK(arch_generic_hweightl, 7, 3);
+ RUNTIME_CHECK(arch_generic_hweightl, 0xff, 8);
+
+ RUNTIME_CHECK(arch_generic_hweightl, 1 | (1UL << (BITS_PER_LONG - 1)), 2);
+ RUNTIME_CHECK(arch_generic_hweightl, -1UL, BITS_PER_LONG);
+}
+#endif
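For reference, the ASM above is a register-preserving transcription of the
standard SWAR population count.  A C sketch of the same steps (the masks
match the movabs constants; illustrative only, not the actual
generic_hweightl() source):

    static unsigned int hweightl_sketch(unsigned long x)
    {
        x -= (x >> 1) & 0x5555555555555555UL;       /* per-2-bit sums */
        x  = (x & 0x3333333333333333UL) +
             ((x >> 2) & 0x3333333333333333UL);     /* per-nibble sums */
        x  = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fUL; /* per-byte sums */

        /* The multiply adds every byte into the top byte; shift it down. */
        return (x * 0x0101010101010101UL) >> (BITS_PER_LONG - 8);
    }

Each line corresponds to one group of shr/and/add instructions above, with
%rdx and %rax standing in for the temporaries.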
The POPCNT instruction has existed in x86 CPUs since 2008, so we're only 16
years late adding support.  With all the other scaffolding in place,
implement arch_hweightl() for x86.

The only complication is that the call to arch_generic_hweightl() is behind
the compiler's back.  Address this by writing it in ASM and ensuring that it
preserves all registers.

Copy the code generation from generic_hweightl().  It's not a complicated
algorithm, and is easy to regenerate if needs be, but cover it with the same
unit tests as test_generic_hweightl() just for peace of mind.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>

v3:
 * Reinstate dropped CONFIG_SELF_TESTS
 * Leave grep fodder for CODE_FILL until we can find a nicer way of doing
   this.
v2:
 * Fix MISRA 8.2 (parameter name) and 8.5 (single declaration) regressions.
 * Rename {arch->x86}-generic-hweightl.{S->c}
 * Adjust ASM formatting
---
 xen/arch/x86/include/asm/bitops.h | 23 ++++++++++
 xen/lib/Makefile                  |  1 +
 xen/lib/x86-generic-hweightl.c    | 71 +++++++++++++++++++++++++++++++
 3 files changed, 95 insertions(+)
 create mode 100644 xen/lib/x86-generic-hweightl.c
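The self-test vectors are also easy to exercise outside the Xen build.  A
hypothetical standalone harness, reusing hweightl_sketch() from the note
above and substituting assert() for RUNTIME_CHECK():

    #include <assert.h>
    #include <limits.h>

    #define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

    /* hweightl_sketch() as given in the earlier note. */

    int main(void)
    {
        assert(hweightl_sketch(0) == 0);
        assert(hweightl_sketch(1) == 1);
        assert(hweightl_sketch(3) == 2);
        assert(hweightl_sketch(7) == 3);
        assert(hweightl_sketch(0xff) == 8);

        assert(hweightl_sketch(1 | (1UL << (BITS_PER_LONG - 1))) == 2);
        assert(hweightl_sketch(-1UL) == BITS_PER_LONG);

        return 0;
    }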