
[2/7] xen/bitops: Implement ffs() in common logic

Message ID 20240313172716.2325427-3-andrew.cooper3@citrix.com (mailing list archive)
State New
Series xen/bitops: Reduce the mess, starting with ffs()

Commit Message

Andrew Cooper March 13, 2024, 5:27 p.m. UTC
Allow the optimiser to eliminate the call completely, and use the compiler
builtin by default.  Architectures should only provide arch_ffs() if they think
they can do better than the compiler.

Confirm the expected behaviour with compile time and boot time tests.

For x86, correct the prototype, and simplify the asm() with the statement
given by the Intel architects to Linux about the behaviour on processors newer
than the 486.

For PPC, __builtin_ffs() is 1/3 of the size of the transform to
generic_fls().  Drop the definition entirely.

For ARM, simply rename ffs() to arch_ffs().  It appears that the
transformation to __builtin_clz() still makes better code than
__builtin_ffs().

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
CC: Wei Liu <wl@xen.org>
CC: Stefano Stabellini <sstabellini@kernel.org>
CC: Julien Grall <julien@xen.org>
CC: Volodymyr Babchuk <Volodymyr_Babchuk@epam.com>
CC: Bertrand Marquis <bertrand.marquis@arm.com>
CC: Michal Orzel <michal.orzel@amd.com>
CC: Oleksii Kurochko <oleksii.kurochko@gmail.com>
CC: Shawn Anastasio <sanastasio@raptorengineering.com>
CC: consulting@bugseng.com
CC: Simone Ballarin <simone.ballarin@bugseng.com>
CC: Federico Serafini <federico.serafini@bugseng.com>
CC: Nicola Vetrini <nicola.vetrini@bugseng.com>
---
 xen/arch/arm/include/asm/bitops.h |  2 +-
 xen/arch/ppc/include/asm/bitops.h |  1 -
 xen/arch/x86/include/asm/bitops.h | 19 +++++++++++++------
 xen/common/bitops.c               | 10 ++++++++++
 xen/include/xen/bitops.h          | 15 +++++++++++++++
 5 files changed, 39 insertions(+), 8 deletions(-)

Comments

Jan Beulich March 14, 2024, 2:16 p.m. UTC | #1
On 13.03.2024 18:27, Andrew Cooper wrote:
> --- a/xen/arch/arm/include/asm/bitops.h
> +++ b/xen/arch/arm/include/asm/bitops.h
> @@ -157,7 +157,7 @@ static inline int fls(unsigned int x)
>  }
>  
>  
> -#define ffs(x) ({ unsigned int __t = (x); fls(ISOLATE_LSB(__t)); })
> +#define arch_ffs(x) ({ unsigned int __t = (x); fls(ISOLATE_LSB(__t)); })

The way the macro is invoked, I don't think the helper local variable
is then needed anymore?

> --- a/xen/arch/x86/include/asm/bitops.h
> +++ b/xen/arch/x86/include/asm/bitops.h
> @@ -430,16 +430,23 @@ static inline int ffsl(unsigned long x)
>      return (int)r+1;
>  }
>  
> -static inline int ffs(unsigned int x)
> +static inline unsigned int arch_ffs(unsigned int x)
>  {
> -    int r;
> +    int r = -1;
> +
> +    /*
> +     * The AMD manual states that BSF won't modify the destination register if
> +     * x=0.  The Intel manual states that the result is undefined, but the
> +     * architects have said that the register is written back with it's old
> +     * value, possibly zero extended above 32 bits.
> +     */
> +    asm ( "bsf %[val], %[res]"
> +          : [res] "+r" (r)
> +          : [val] "rm" (x) );

And this isn't what the compiler would be doing anyway?

Also, just to mention it: I take it that you/we are sure that disallowing
both operands to be the same register is still better than ...

> -    asm ( "bsf %1,%0\n\t"
> -          "jnz 1f\n\t"
> -          "mov $-1,%0\n"
> -          "1:" : "=r" (r) : "rm" (x));

... the original form?

> --- a/xen/common/bitops.c
> +++ b/xen/common/bitops.c
> @@ -34,8 +34,18 @@
>          RUNTIME_CHECK(fn, val, res);            \
>      } while ( 0 )
>  
> +static void test_ffs(void)

Nit: __init please, even if there ought to be no reason for the compiler
to not inline this function.

> --- a/xen/include/xen/bitops.h
> +++ b/xen/include/xen/bitops.h
> @@ -110,6 +110,21 @@ static inline int generic_flsl(unsigned long x)
>  
>  #include <asm/bitops.h>
>  
> +/*
> + * Find First Set bit.  Bits are labelled from 1.
> + */
> +static always_inline __pure unsigned int ffs(unsigned int x)

Why always_inline?

> +{
> +    if ( __builtin_constant_p(x) )
> +        return __builtin_ffs(x);
> +
> +#ifndef arch_ffs
> +#define arch_ffs __builtin_ffs
> +#endif
> +
> +    return arch_ffs(x);
> +}

Just to mention it: __builtin_ffs() takes and returns plain int. I'm
happy about our own helper being unsigned-correct, but anything like
this has a Misra angle too.

Jan
Andrew Cooper March 14, 2024, 4:23 p.m. UTC | #2
On 14/03/2024 2:16 pm, Jan Beulich wrote:
> On 13.03.2024 18:27, Andrew Cooper wrote:
>> --- a/xen/arch/arm/include/asm/bitops.h
>> +++ b/xen/arch/arm/include/asm/bitops.h
>> @@ -157,7 +157,7 @@ static inline int fls(unsigned int x)
>>  }
>>  
>>  
>> -#define ffs(x) ({ unsigned int __t = (x); fls(ISOLATE_LSB(__t)); })
>> +#define arch_ffs(x) ({ unsigned int __t = (x); fls(ISOLATE_LSB(__t)); })
> The way the macro is invoked, I don't think the helper local variable
> is then needed anymore?

I strongly suspect it is still needed.  ISOLATE_LSB() double-expands its
parameter.

Either way, I'm not reopening the can of worms that led to this form.
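
A minimal sketch of the double-evaluation concern, assuming ISOLATE_LSB(x)
expands to ((x) & -(x)) (the names below are illustrative, not the real
headers):

    static inline int fls(unsigned int x)      /* 1-based index of highest set bit */
    {
        return x ? 32 - __builtin_clz(x) : 0;
    }

    #define ISOLATE_LSB(x) ((x) & -(x))        /* assumed definition */

    /* Without the helper local, a side-effecting argument is evaluated twice: */
    #define ffs_unsafe(x)  fls(ISOLATE_LSB(x)) /* ffs_unsafe(*p++) advances p twice */

    /* With it, the argument is evaluated exactly once; fls() of the isolated
     * lowest bit is its 1-based index, i.e. ffs(): */
    #define ffs_safe(x)    ({ unsigned int t = (x); fls(ISOLATE_LSB(t)); })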
>> --- a/xen/arch/x86/include/asm/bitops.h
>> +++ b/xen/arch/x86/include/asm/bitops.h
>> @@ -430,16 +430,23 @@ static inline int ffsl(unsigned long x)
>>      return (int)r+1;
>>  }
>>  
>> -static inline int ffs(unsigned int x)
>> +static inline unsigned int arch_ffs(unsigned int x)
>>  {
>> -    int r;
>> +    int r = -1;
>> +
>> +    /*
>> +     * The AMD manual states that BSF won't modify the destination register if
>> +     * x=0.  The Intel manual states that the result is undefined, but the
>> +     * architects have said that the register is written back with it's old
>> +     * value, possibly zero extended above 32 bits.
>> +     */
>> +    asm ( "bsf %[val], %[res]"
>> +          : [res] "+r" (r)
>> +          : [val] "rm" (x) );
> And this isn't what the compiler would be doing anyway?

No.  The builtin avoids all undefined behaviour, and is quite a lot of
asm as a result.

With some help from the gcc mailing list
https://gcc.gnu.org/pipermail/gcc/2024-March/243465.html I've found a
solution which improves things in the common case.
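
For illustration, a sketch of where the extra code comes from (my
approximation, not the builtin's actual expansion): ffs(0) must return 0, so
a portable version needs an explicit zero check around a count-trailing-zeros
step, whereas the single-BSF form leans on the architect-confirmed behaviour
instead.

    static inline unsigned int ffs_portable(unsigned int x)
    {
        return x ? (unsigned int)__builtin_ctz(x) + 1 : 0;
    }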

> Also, just to mention it: I take it that you/we are sure that disallowing
> both operands to be the same register is still better than ...
>
>> -    asm ( "bsf %1,%0\n\t"
>> -          "jnz 1f\n\t"
>> -          "mov $-1,%0\n"
>> -          "1:" : "=r" (r) : "rm" (x));
> ... the original form?

Yes.  Without any doubt, on a 64bit CPU.

This transformation isn't safe on a 486, but I expect even the later
32bit CPUs lacking register renaming would still be better with the
non-branch form.
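
A C model of the property being relied on here (a sketch, not the generated
code): with the destination preloaded to -1 and BSF leaving it untouched for
a zero input, the trailing +1 yields ffs(0) == 0 with no branch at all.

    static inline unsigned int arch_ffs_model(unsigned int x)
    {
        int r = -1;               /* value BSF is trusted to preserve when x == 0 */

        if ( x )                  /* models "destination only written for x != 0" */
            r = __builtin_ctz(x); /* index of the lowest set bit */

        return r + 1;             /* 0 for x == 0, 1-based index otherwise */
    }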


>> --- a/xen/include/xen/bitops.h
>> +++ b/xen/include/xen/bitops.h
>> @@ -110,6 +110,21 @@ static inline int generic_flsl(unsigned long x)
>>  
>>  #include <asm/bitops.h>
>>  
>> +/*
>> + * Find First Set bit.  Bits are labelled from 1.
>> + */
>> +static always_inline __pure unsigned int ffs(unsigned int x)
> Why always_inline?

For all the normal reasons to counter Clang and GCC doing stupid things
with inlines that contain assembly.

>
>> +{
>> +    if ( __builtin_constant_p(x) )
>> +        return __builtin_ffs(x);
>> +
>> +#ifndef arch_ffs
>> +#define arch_ffs __builtin_ffs
>> +#endif
>> +
>> +    return arch_ffs(x);
>> +}
> Just to mention it: __builtin_ffs() takes and returns plain int. I'm
> happy about our own helper being unsigned-correct, but anything like
> this has a Misra angle too.

I did note that, and decided it could wait until some other point.

~Andrew
Jan Beulich March 14, 2024, 4:35 p.m. UTC | #3
On 14.03.2024 17:23, Andrew Cooper wrote:
> On 14/03/2024 2:16 pm, Jan Beulich wrote:
>> On 13.03.2024 18:27, Andrew Cooper wrote:
>>> --- a/xen/arch/arm/include/asm/bitops.h
>>> +++ b/xen/arch/arm/include/asm/bitops.h
>>> @@ -157,7 +157,7 @@ static inline int fls(unsigned int x)
>>>  }
>>>  
>>>  
>>> -#define ffs(x) ({ unsigned int __t = (x); fls(ISOLATE_LSB(__t)); })
>>> +#define arch_ffs(x) ({ unsigned int __t = (x); fls(ISOLATE_LSB(__t)); })
>> The way the macro is invoked, I don't think the helper local variable
>> is then needed anymore?
> 
> I strongly suspect It is still needed.  ISOLATE_LSB() double-expands its
> parameter.

Even that double evaluation doesn't matter when the invoking entity is an
inline function, and it doesn't use any non-trivial expression as argument.

>>> --- a/xen/include/xen/bitops.h
>>> +++ b/xen/include/xen/bitops.h
>>> @@ -110,6 +110,21 @@ static inline int generic_flsl(unsigned long x)
>>>  
>>>  #include <asm/bitops.h>
>>>  
>>> +/*
>>> + * Find First Set bit.  Bits are labelled from 1.
>>> + */
>>> +static always_inline __pure unsigned int ffs(unsigned int x)
>> Why always_inline?
> 
> For all the normal reasons to counter Clang and GCC doing stupid things
> with inlines that contain assembly.

Hmm, there are issues when the asm() looks "complex" to the compiler, but
that's not the case here.  I was asking because, as you imply by how you
responded, we may need to gain many more always_inline annotations, whereas
at some point even you were arguing against overriding compiler decisions
like this (unless I'm mis-remembering).

Jan
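
A short usage sketch of the resulting wrapper (illustrative, not part of the
patch): constant arguments fold at compile time via __builtin_ffs(), while
runtime arguments reach arch_ffs() where an architecture provides one, and
__builtin_ffs() otherwise.

    unsigned int example(unsigned int runtime_mask)
    {
        unsigned int a = ffs(0x00010000);    /* constant-folds to 17, no BSF emitted */
        unsigned int b = ffs(runtime_mask);  /* single BSF on x86, builtin elsewhere */

        return a + b;
    }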

Patch

diff --git a/xen/arch/arm/include/asm/bitops.h b/xen/arch/arm/include/asm/bitops.h
index ab030b6cb032..09c6064274a7 100644
--- a/xen/arch/arm/include/asm/bitops.h
+++ b/xen/arch/arm/include/asm/bitops.h
@@ -157,7 +157,7 @@  static inline int fls(unsigned int x)
 }
 
 
-#define ffs(x) ({ unsigned int __t = (x); fls(ISOLATE_LSB(__t)); })
+#define arch_ffs(x) ({ unsigned int __t = (x); fls(ISOLATE_LSB(__t)); })
 #define ffsl(x) ({ unsigned long __t = (x); flsl(ISOLATE_LSB(__t)); })
 
 /**
diff --git a/xen/arch/ppc/include/asm/bitops.h b/xen/arch/ppc/include/asm/bitops.h
index 5820b9ce7bb5..635a3b4e3e33 100644
--- a/xen/arch/ppc/include/asm/bitops.h
+++ b/xen/arch/ppc/include/asm/bitops.h
@@ -173,7 +173,6 @@  static inline int __test_and_clear_bit(int nr, volatile void *addr)
 
 #define flsl(x) generic_flsl(x)
 #define fls(x) generic_fls(x)
-#define ffs(x) ({ unsigned int t_ = (x); fls(t_ & -t_); })
 #define ffsl(x) ({ unsigned long t_ = (x); flsl(t_ & -t_); })
 
 /* Based on linux/include/asm-generic/bitops/ffz.h */
diff --git a/xen/arch/x86/include/asm/bitops.h b/xen/arch/x86/include/asm/bitops.h
index 5a71afbc89d5..2c5b103cbbd9 100644
--- a/xen/arch/x86/include/asm/bitops.h
+++ b/xen/arch/x86/include/asm/bitops.h
@@ -430,16 +430,23 @@  static inline int ffsl(unsigned long x)
     return (int)r+1;
 }
 
-static inline int ffs(unsigned int x)
+static inline unsigned int arch_ffs(unsigned int x)
 {
-    int r;
+    int r = -1;
+
+    /*
+     * The AMD manual states that BSF won't modify the destination register if
+     * x=0.  The Intel manual states that the result is undefined, but the
+     * architects have said that the register is written back with it's old
+     * value, possibly zero extended above 32 bits.
+     */
+    asm ( "bsf %[val], %[res]"
+          : [res] "+r" (r)
+          : [val] "rm" (x) );
 
-    asm ( "bsf %1,%0\n\t"
-          "jnz 1f\n\t"
-          "mov $-1,%0\n"
-          "1:" : "=r" (r) : "rm" (x));
     return r + 1;
 }
+#define arch_ffs arch_ffs
 
 /**
  * fls - find last bit set
diff --git a/xen/common/bitops.c b/xen/common/bitops.c
index 4c07191b4030..484df68768ad 100644
--- a/xen/common/bitops.c
+++ b/xen/common/bitops.c
@@ -34,8 +34,18 @@ 
         RUNTIME_CHECK(fn, val, res);            \
     } while ( 0 )
 
+static void test_ffs(void)
+{
+    /* unsigned int ffs(unsigned int) */
+    CHECK(ffs, 0, 0);
+    CHECK(ffs, 1, 1);
+    CHECK(ffs, 0x80000000U, 32);
+}
+
 static int __init cf_check test_bitops(void)
 {
+    test_ffs();
+
     return 0;
 }
 __initcall(test_bitops);
diff --git a/xen/include/xen/bitops.h b/xen/include/xen/bitops.h
index 9b40f20381a2..fb3645d9cf87 100644
--- a/xen/include/xen/bitops.h
+++ b/xen/include/xen/bitops.h
@@ -110,6 +110,21 @@  static inline int generic_flsl(unsigned long x)
 
 #include <asm/bitops.h>
 
+/*
+ * Find First Set bit.  Bits are labelled from 1.
+ */
+static always_inline __pure unsigned int ffs(unsigned int x)
+{
+    if ( __builtin_constant_p(x) )
+        return __builtin_ffs(x);
+
+#ifndef arch_ffs
+#define arch_ffs __builtin_ffs
+#endif
+
+    return arch_ffs(x);
+}
+
 /* --------------------- Please tidy below here --------------------- */
 
 #ifndef find_next_bit