diff mbox series

[v3,5/5] x86/bitops: Use the POPCNT instruction when available

Message ID 20240904225530.3888315-6-andrew.cooper3@citrix.com (mailing list archive)
State New
Headers show
Series xen/bitops: hweight() cleanup/improvements | expand

Commit Message

Andrew Cooper Sept. 4, 2024, 10:55 p.m. UTC
It has existed in x86 CPUs since 2008, so we're only 16 years late adding
support.  With all the other scafolding in place, implement arch_hweightl()
for x86.

The only complication is that the call to arch_generic_hweightl() is behind
the compilers back.  Address this by writing it in ASM and ensure that it
preserves all registers.

Copy the code generation from generic_hweightl().  It's not a complicated
algorithm, and is easy to regenerate if needs be, but cover it with the same
unit tests as test_generic_hweightl() just for piece of mind.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>

v3:
 * Reinstate dropped CONFIG_SELF_TESTS
 * Leave grep fodder for CODE_FILL until we can find a nicer way of doing this.

v2:
 * Fix MISRA 8.2 (parameter name) and 8.5 (single declaration) regressions.
 * Rename {arch->x86}-generic-hweightl.{S->c}
 * Adjust ASM formating
---
 xen/arch/x86/include/asm/bitops.h | 23 ++++++++++
 xen/lib/Makefile                  |  1 +
 xen/lib/x86-generic-hweightl.c    | 71 +++++++++++++++++++++++++++++++
 3 files changed, 95 insertions(+)
 create mode 100644 xen/lib/x86-generic-hweightl.c

Comments

Jan Beulich Sept. 5, 2024, 9:55 a.m. UTC | #1
On 05.09.2024 00:55, Andrew Cooper wrote:
> It has existed in x86 CPUs since 2008, so we're only 16 years late adding
> support.  With all the other scafolding in place, implement arch_hweightl()
> for x86.
> 
> The only complication is that the call to arch_generic_hweightl() is behind
> the compilers back.  Address this by writing it in ASM and ensure that it
> preserves all registers.
> 
> Copy the code generation from generic_hweightl().  It's not a complicated
> algorithm, and is easy to regenerate if needs be, but cover it with the same
> unit tests as test_generic_hweightl() just for piece of mind.
> 
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>

Acked-by: Jan Beulich <jbeulich@suse.com>
diff mbox series

Patch

diff --git a/xen/arch/x86/include/asm/bitops.h b/xen/arch/x86/include/asm/bitops.h
index 642d8e58b288..39e37f1cbe55 100644
--- a/xen/arch/x86/include/asm/bitops.h
+++ b/xen/arch/x86/include/asm/bitops.h
@@ -6,6 +6,7 @@ 
  */
 
 #include <asm/alternative.h>
+#include <asm/asm_defns.h>
 #include <asm/cpufeatureset.h>
 
 /*
@@ -475,4 +476,26 @@  static always_inline unsigned int arch_flsl(unsigned long x)
 }
 #define arch_flsl arch_flsl
 
+unsigned int arch_generic_hweightl(unsigned long x);
+
+static always_inline unsigned int arch_hweightl(unsigned long x)
+{
+    unsigned int r;
+
+    /*
+     * arch_generic_hweightl() is written in ASM in order to preserve all
+     * registers, as the compiler can't see the call.
+     *
+     * This limits the POPCNT instruction to using the same ABI as a function
+     * call (input in %rdi, output in %eax) but that's fine.
+     */
+    alternative_io("call arch_generic_hweightl",
+                   "popcnt %[val], %q[res]", X86_FEATURE_POPCNT,
+                   ASM_OUTPUT2([res] "=a" (r) ASM_CALL_CONSTRAINT),
+                   [val] "D" (x));
+
+    return r;
+}
+#define arch_hweightl arch_hweightl
+
 #endif /* _X86_BITOPS_H */
diff --git a/xen/lib/Makefile b/xen/lib/Makefile
index b6558e108bd9..54440f628aae 100644
--- a/xen/lib/Makefile
+++ b/xen/lib/Makefile
@@ -36,6 +36,7 @@  lib-y += strtol.o
 lib-y += strtoll.o
 lib-y += strtoul.o
 lib-y += strtoull.o
+lib-$(CONFIG_X86) += x86-generic-hweightl.o
 lib-$(CONFIG_X86) += xxhash32.o
 lib-$(CONFIG_X86) += xxhash64.o
 
diff --git a/xen/lib/x86-generic-hweightl.c b/xen/lib/x86-generic-hweightl.c
new file mode 100644
index 000000000000..123a5b43928d
--- /dev/null
+++ b/xen/lib/x86-generic-hweightl.c
@@ -0,0 +1,71 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <xen/bitops.h>
+#include <xen/init.h>
+#include <xen/self-tests.h>
+
+/*
+ * An implementation of generic_hweightl() used on hardware without the POPCNT
+ * instruction.
+ *
+ * This function is called from within an ALTERNATIVE in arch_hweightl().
+ * i.e. behind the back of the compiler.  Therefore all registers are callee
+ * preserved.
+ *
+ * The ASM is what GCC-12 emits for generic_hweightl() in a release build of
+ * Xen, with spilling of %rdi/%rdx to preserve the callers registers.
+ *
+ * Note: When we can use __attribute__((no_caller_saved_registers))
+ *       unconditionally (GCC 7, Clang 5), we can implement this in plain C.
+ */
+asm (
+    ".type arch_generic_hweightl, STT_FUNC\n\t"
+    ".globl arch_generic_hweightl\n\t"
+    ".hidden arch_generic_hweightl\n\t"
+    ".balign " STR(CONFIG_FUNCTION_ALIGNMENT) ", 0x90\n" /* CODE_FILL */
+    "arch_generic_hweightl:\n\t"
+
+    "push   %rdi\n\t"
+    "push   %rdx\n\t"
+
+    "movabs $0x5555555555555555, %rdx\n\t"
+    "mov    %rdi, %rax\n\t"
+    "shr    $1, %rax\n\t"
+    "and    %rdx, %rax\n\t"
+    "sub    %rax, %rdi\n\t"
+    "movabs $0x3333333333333333, %rax\n\t"
+    "mov    %rdi, %rdx\n\t"
+    "shr    $2, %rdi\n\t"
+    "and    %rax, %rdx\n\t"
+    "and    %rax, %rdi\n\t"
+    "add    %rdi, %rdx\n\t"
+    "mov    %rdx, %rax\n\t"
+    "shr    $4, %rax\n\t"
+    "add    %rdx, %rax\n\t"
+    "movabs $0x0f0f0f0f0f0f0f0f, %rdx\n\t"
+    "and    %rdx, %rax\n\t"
+    "movabs $0x0101010101010101, %rdx\n\t"
+    "imul   %rdx, %rax\n\t"
+    "shr    $" STR(BITS_PER_LONG) "- 8, %rax\n\t"
+
+    "pop    %rdx\n\t"
+    "pop    %rdi\n\t"
+
+    "ret\n\t"
+
+    ".size arch_generic_hweightl, . - arch_generic_hweightl\n\t"
+);
+
+#ifdef CONFIG_SELF_TESTS
+static void __init __constructor test_arch_generic_hweightl(void)
+{
+    RUNTIME_CHECK(arch_generic_hweightl, 0, 0);
+    RUNTIME_CHECK(arch_generic_hweightl, 1, 1);
+    RUNTIME_CHECK(arch_generic_hweightl, 3, 2);
+    RUNTIME_CHECK(arch_generic_hweightl, 7, 3);
+    RUNTIME_CHECK(arch_generic_hweightl, 0xff, 8);
+
+    RUNTIME_CHECK(arch_generic_hweightl, 1 | (1UL << (BITS_PER_LONG - 1)), 2);
+    RUNTIME_CHECK(arch_generic_hweightl, -1UL, BITS_PER_LONG);
+}
+#endif