diff mbox series

[v2,02/70] xen/sort: Switch to an extern inline implementation

Message ID 20220214125127.17985-3-andrew.cooper3@citrix.com (mailing list archive)
State New, archived
Headers show
Series x86: Support for CET Indirect Branch Tracking | expand

Commit Message

Andrew Cooper Feb. 14, 2022, 12:50 p.m. UTC
There are exactly 3 callers of sort() in the hypervisor.  Callbacks in a tight
loop like this are problematic for performance, especially with Spectre v2
protections, which is why extern inline is used commonly by libraries.

Both ARM callers pass in NULL for the swap function, and while this might seem
like an attractive option at first, it causes generic_swap() to be used, which
forces a byte-wise copy.  Provide real swap functions so the compiler can
optimise properly, which is very important for ARM downstreams where
milliseconds until the system is up matters.

No functional change.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
CC: Wei Liu <wl@xen.org>
CC: Stefano Stabellini <sstabellini@kernel.org>
CC: Julien Grall <julien@xen.org>
CC: Volodymyr Babchuk <Volodymyr_Babchuk@epam.com>
CC: Bertrand Marquis <bertrand.marquis@arm.com>

v2:
 * Adjust commit message
---
 xen/arch/arm/bootfdt.c |  9 +++++-
 xen/arch/arm/io.c      |  9 +++++-
 xen/include/xen/sort.h | 55 +++++++++++++++++++++++++++++++++-
 xen/lib/sort.c         | 80 ++------------------------------------------------
 4 files changed, 72 insertions(+), 81 deletions(-)

Comments

Bertrand Marquis Feb. 14, 2022, 1:13 p.m. UTC | #1
Hi Andrew,

> On 14 Feb 2022, at 12:50, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
> 
> There are exactly 3 callers of sort() in the hypervisor.  Callbacks in a tight
> loop like this are problematic for performance, especially with Spectre v2
> protections, which is why extern inline is used commonly by libraries.
> 
> Both ARM callers pass in NULL for the swap function, and while this might seem
> like an attractive option at first, it causes generic_swap() to be used, which
> forced a byte-wise copy.  Provide real swap functions so the compiler can
> optimise properly, which is very important for ARM downstreams where
> milliseconds until the system is up matters.
> 
> No functional change.
> 
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Reviewed-by: Jan Beulich <jbeulich@suse.com>

Just one comment fix after, with it fixed for the arm part:

Reviewed-by: Bertrand Marquis <bertrand.marquis@arm.com>

> ---
> CC: Jan Beulich <JBeulich@suse.com>
> CC: Roger Pau Monné <roger.pau@citrix.com>
> CC: Wei Liu <wl@xen.org>
> CC: Stefano Stabellini <sstabellini@kernel.org>
> CC: Julien Grall <julien@xen.org>
> CC: Volodymyr Babchuk <Volodymyr_Babchuk@epam.com>
> CC: Bertrand Marquis <bertrand.marquis@arm.com>
> 
> v2:
> * Adjust commit message
> ---
> xen/arch/arm/bootfdt.c |  9 +++++-
> xen/arch/arm/io.c      |  9 +++++-
> xen/include/xen/sort.h | 55 +++++++++++++++++++++++++++++++++-
> xen/lib/sort.c         | 80 ++------------------------------------------------
> 4 files changed, 72 insertions(+), 81 deletions(-)
> 
> diff --git a/xen/arch/arm/bootfdt.c b/xen/arch/arm/bootfdt.c
> index afaa0e249b71..e318ef960386 100644
> --- a/xen/arch/arm/bootfdt.c
> +++ b/xen/arch/arm/bootfdt.c
> @@ -448,6 +448,13 @@ static int __init cmp_memory_node(const void *key, const void *elem)
>     return 0;
> }
> 
> +static void __init swap_memory_node(void *_a, void *_b, size_t size)
> +{
> +    struct membank *a = _a, *b = _b;
> +
> +    SWAP(*a, *b);
> +}
> +
> /**
>  * boot_fdt_info - initialize bootinfo from a DTB
>  * @fdt: flattened device tree binary
> @@ -472,7 +479,7 @@ size_t __init boot_fdt_info(const void *fdt, paddr_t paddr)
>      * the banks sorted in ascending order. So sort them through.
>      */
>     sort(bootinfo.mem.bank, bootinfo.mem.nr_banks, sizeof(struct membank),
> -         cmp_memory_node, NULL);
> +         cmp_memory_node, swap_memory_node);
> 
>     early_print_info();
> 
> diff --git a/xen/arch/arm/io.c b/xen/arch/arm/io.c
> index 729287e37c59..1a066f9ae502 100644
> --- a/xen/arch/arm/io.c
> +++ b/xen/arch/arm/io.c
> @@ -80,6 +80,13 @@ static int cmp_mmio_handler(const void *key, const void *elem)
>     return 0;
> }
> 
> +static void swap_mmio_handler(void *_a, void *_b, size_t size)
> +{
> +    struct mmio_handler *a = _a, *b = _b;
> +
> +    SWAP(*a, *b);
> +}
> +
> static const struct mmio_handler *find_mmio_handler(struct domain *d,
>                                                     paddr_t gpa)
> {
> @@ -170,7 +177,7 @@ void register_mmio_handler(struct domain *d,
> 
>     /* Sort mmio handlers in ascending order based on base address */
>     sort(vmmio->handlers, vmmio->num_entries, sizeof(struct mmio_handler),
> -         cmp_mmio_handler, NULL);
> +         cmp_mmio_handler, swap_mmio_handler);
> 
>     write_unlock(&vmmio->lock);
> }
> diff --git a/xen/include/xen/sort.h b/xen/include/xen/sort.h
> index a403652948e7..01479ea44606 100644
> --- a/xen/include/xen/sort.h
> +++ b/xen/include/xen/sort.h
> @@ -3,8 +3,61 @@
> 
> #include <xen/types.h>
> 
> +/*
> + * sort - sort an array of elements
> + * @base: pointer to data to sort
> + * @num: number of elements
> + * @size: size of each element
> + * @cmp: pointer to comparison function
> + * @swap: pointer to swap function or NULL

The function no longer accepts NULL as a parameter.
The comment should be fixed here.

Bertrand

> + *
> + * This function does a heapsort on the given array. You may provide a
> + * swap function optimized to your element type.
> + *
> + * Sorting time is O(n log n) both on average and worst-case. While
> + * qsort is about 20% faster on average, it suffers from exploitable
> + * O(n*n) worst-case behavior and extra memory requirements that make
> + * it less suitable for kernel use.
> + */
> +#ifndef SORT_IMPLEMENTATION
> +extern gnu_inline
> +#endif
> void sort(void *base, size_t num, size_t size,
>           int (*cmp)(const void *, const void *),
> -          void (*swap)(void *, void *, size_t));
> +          void (*swap)(void *, void *, size_t))
> +{
> +    /* pre-scale counters for performance */
> +    size_t i = (num / 2) * size, n = num * size, c, r;
> +
> +    /* heapify */
> +    while ( i > 0 )
> +    {
> +        for ( r = i -= size; r * 2 + size < n; r = c )
> +        {
> +            c = r * 2 + size;
> +            if ( (c < n - size) && (cmp(base + c, base + c + size) < 0) )
> +                c += size;
> +            if ( cmp(base + r, base + c) >= 0 )
> +                break;
> +            swap(base + r, base + c, size);
> +        }
> +    }
> +
> +    /* sort */
> +    for ( i = n; i > 0; )
> +    {
> +        i -= size;
> +        swap(base, base + i, size);
> +        for ( r = 0; r * 2 + size < i; r = c )
> +        {
> +            c = r * 2 + size;
> +            if ( (c < i - size) && (cmp(base + c, base + c + size) < 0) )
> +                c += size;
> +            if ( cmp(base + r, base + c) >= 0 )
> +                break;
> +            swap(base + r, base + c, size);
> +        }
> +    }
> +}
> 
> #endif /* __XEN_SORT_H__ */
> diff --git a/xen/lib/sort.c b/xen/lib/sort.c
> index 35ce0d7abdec..b7e78cc0e8d2 100644
> --- a/xen/lib/sort.c
> +++ b/xen/lib/sort.c
> @@ -4,81 +4,5 @@
>  * Jan 23 2005  Matt Mackall <mpm@selenic.com>
>  */
> 
> -#include <xen/types.h>
> -
> -static void u32_swap(void *a, void *b, size_t size)
> -{
> -    uint32_t t = *(uint32_t *)a;
> -
> -    *(uint32_t *)a = *(uint32_t *)b;
> -    *(uint32_t *)b = t;
> -}
> -
> -static void generic_swap(void *a, void *b, size_t size)
> -{
> -    char t;
> -
> -    do {
> -        t = *(char *)a;
> -        *(char *)a++ = *(char *)b;
> -        *(char *)b++ = t;
> -    } while ( --size > 0 );
> -}
> -
> -/*
> - * sort - sort an array of elements
> - * @base: pointer to data to sort
> - * @num: number of elements
> - * @size: size of each element
> - * @cmp: pointer to comparison function
> - * @swap: pointer to swap function or NULL
> - *
> - * This function does a heapsort on the given array. You may provide a
> - * swap function optimized to your element type.
> - *
> - * Sorting time is O(n log n) both on average and worst-case. While
> - * qsort is about 20% faster on average, it suffers from exploitable
> - * O(n*n) worst-case behavior and extra memory requirements that make
> - * it less suitable for kernel use.
> - */
> -
> -void sort(void *base, size_t num, size_t size,
> -          int (*cmp)(const void *, const void *),
> -          void (*swap)(void *, void *, size_t size))
> -{
> -    /* pre-scale counters for performance */
> -    size_t i = (num / 2) * size, n = num * size, c, r;
> -
> -    if ( !swap )
> -        swap = (size == 4 ? u32_swap : generic_swap);
> -
> -    /* heapify */
> -    while ( i > 0 )
> -    {
> -        for ( r = i -= size; r * 2 + size < n; r = c )
> -        {
> -            c = r * 2 + size;
> -            if ( (c < n - size) && (cmp(base + c, base + c + size) < 0) )
> -                c += size;
> -            if ( cmp(base + r, base + c) >= 0 )
> -                break;
> -            swap(base + r, base + c, size);
> -        }
> -    }
> -
> -    /* sort */
> -    for ( i = n; i > 0; )
> -    {
> -        i -= size;
> -        swap(base, base + i, size);
> -        for ( r = 0; r * 2 + size < i; r = c )
> -        {
> -            c = r * 2 + size;
> -            if ( (c < i - size) && (cmp(base + c, base + c + size) < 0) )
> -                c += size;
> -            if ( cmp(base + r, base + c) >= 0 )
> -                break;
> -            swap(base + r, base + c, size);
> -        }
> -    }
> -}
> +#define SORT_IMPLEMENTATION
> +#include <xen/sort.h>
> -- 
> 2.11.0
>
Julien Grall Feb. 14, 2022, 1:17 p.m. UTC | #2
Hi,

On 14/02/2022 12:50, Andrew Cooper wrote:
> There are exactly 3 callers of sort() in the hypervisor.  Callbacks in a tight
> loop like this are problematic for performance, especially with Spectre v2
> protections, which is why extern inline is used commonly by libraries.
> 
> Both ARM callers pass in NULL for the swap function, and while this might seem
> like an attractive option at first, it causes generic_swap() to be used, which
> forced a byte-wise copy.  Provide real swap functions so the compiler can
> optimise properly, which is very important for ARM downstreams where
> milliseconds until the system is up matters.

Did you actually benchmark it? Both those lists will have < 128 elements 
in them. So I would be extremely surprised if you save more than a few 
hundreds microseconds with this approach.

So, my opinion on this approach hasn't changed. On v1, we discussed an 
approach that would suit both Stefano and I. Jan seemed to confirm that 
would also suit x86.

Therefore, for this approach:

Nacked-by: Julien Grall <jgrall@amazon.com>

Cheers,
Andrew Cooper Feb. 14, 2022, 6:30 p.m. UTC | #3
On 14/02/2022 13:13, Bertrand Marquis wrote:
> Hi Andrew,
>
>> On 14 Feb 2022, at 12:50, Andrew Cooper <andrew.cooper3@citrix.com> wrote:
>>
>> There are exactly 3 callers of sort() in the hypervisor.  Callbacks in a tight
>> loop like this are problematic for performance, especially with Spectre v2
>> protections, which is why extern inline is used commonly by libraries.
>>
>> Both ARM callers pass in NULL for the swap function, and while this might seem
>> like an attractive option at first, it causes generic_swap() to be used, which
>> forced a byte-wise copy.  Provide real swap functions so the compiler can
>> optimise properly, which is very important for ARM downstreams where
>> milliseconds until the system is up matters.
>>
>> No functional change.
>>
>> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
>> Reviewed-by: Jan Beulich <jbeulich@suse.com>
> Just one comment fix after, with it fixed for the arm part:
>
> Reviewed-by: Bertrand Marquis <bertrand.marquis@arm.com>

Thanks.

>> diff --git a/xen/include/xen/sort.h b/xen/include/xen/sort.h
>> index a403652948e7..01479ea44606 100644
>> --- a/xen/include/xen/sort.h
>> +++ b/xen/include/xen/sort.h
>> @@ -3,8 +3,61 @@
>>
>> #include <xen/types.h>
>>
>> +/*
>> + * sort - sort an array of elements
>> + * @base: pointer to data to sort
>> + * @num: number of elements
>> + * @size: size of each element
>> + * @cmp: pointer to comparison function
>> + * @swap: pointer to swap function or NULL
> The function is not accepting anymore to have NULL as parameter.
> The comment should be fixed here.

Will fix.

~Andrew
Stefano Stabellini Feb. 16, 2022, 3:46 a.m. UTC | #4
On Mon, 14 Feb 2022, Julien Grall wrote:
> On 14/02/2022 12:50, Andrew Cooper wrote:
> > There are exactly 3 callers of sort() in the hypervisor.  Callbacks in a
> > tight
> > loop like this are problematic for performance, especially with Spectre v2
> > protections, which is why extern inline is used commonly by libraries.
> > 
> > Both ARM callers pass in NULL for the swap function, and while this might
> > seem
> > like an attractive option at first, it causes generic_swap() to be used,
> > which
> > forced a byte-wise copy.  Provide real swap functions so the compiler can
> > optimise properly, which is very important for ARM downstreams where
> > milliseconds until the system is up matters.
> 
> Did you actually benchmark it? Both those lists will have < 128 elements in
> them. So I would be extremely surprised if you save more than a few hundreds
> microseconds with this approach.
> 
> So, my opinion on this approach hasn't changed. On v1, we discussed an
> approach that would suit both Stefano and I. Jan seemed to confirm that would
> also suit x86.


This patch series has become 70 patches and for the sake of helping
Andrew move forward in the quickest and most painless way possible, I
append the following using generic_swap as static inline.

Julien, Bertrand, is that acceptable to you?

Andrew, I know this is not your favorite approach but you have quite a
lot of changes to handle -- probably not worth focussing on one detail
which is pretty minor?


---
xen/sort: Switch to an extern inline implementation

There are exactly 3 callers of sort() in the hypervisor.  Callbacks in a tight
loop like this are problematic for performance, especially with Spectre v2
protections, which is why extern inline is used commonly by libraries.

Make generic_swap() a static inline and use it at the two ARM call
sites.

No functional change.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Stefano Stabellini <stefano.stabellini@xilinx.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
 xen/arch/arm/bootfdt.c |  2 +-
 xen/arch/arm/io.c      |  2 +-
 xen/include/xen/sort.h | 67 ++++++++++++++++++++++++++++++++++-
 xen/lib/sort.c         | 80 ++----------------------------------------
 4 files changed, 70 insertions(+), 81 deletions(-)

diff --git a/xen/arch/arm/bootfdt.c b/xen/arch/arm/bootfdt.c
index afaa0e249b..0d62945d56 100644
--- a/xen/arch/arm/bootfdt.c
+++ b/xen/arch/arm/bootfdt.c
@@ -472,7 +472,7 @@ size_t __init boot_fdt_info(const void *fdt, paddr_t paddr)
      * the banks sorted in ascending order. So sort them through.
      */
     sort(bootinfo.mem.bank, bootinfo.mem.nr_banks, sizeof(struct membank),
-         cmp_memory_node, NULL);
+         cmp_memory_node, generic_swap);
 
     early_print_info();
 
diff --git a/xen/arch/arm/io.c b/xen/arch/arm/io.c
index 729287e37c..1f35aaeea6 100644
--- a/xen/arch/arm/io.c
+++ b/xen/arch/arm/io.c
@@ -170,7 +170,7 @@ void register_mmio_handler(struct domain *d,
 
     /* Sort mmio handlers in ascending order based on base address */
     sort(vmmio->handlers, vmmio->num_entries, sizeof(struct mmio_handler),
-         cmp_mmio_handler, NULL);
+         cmp_mmio_handler, generic_swap);
 
     write_unlock(&vmmio->lock);
 }
diff --git a/xen/include/xen/sort.h b/xen/include/xen/sort.h
index a403652948..f6065eda58 100644
--- a/xen/include/xen/sort.h
+++ b/xen/include/xen/sort.h
@@ -3,8 +3,73 @@
 
 #include <xen/types.h>
 
+extern gnu_inline
+void generic_swap(void *a, void *b, size_t size)
+{
+    char t;
+
+    do {
+        t = *(char *)a;
+        *(char *)a++ = *(char *)b;
+        *(char *)b++ = t;
+    } while ( --size > 0 );
+}
+
+/*
+ * sort - sort an array of elements
+ * @base: pointer to data to sort
+ * @num: number of elements
+ * @size: size of each element
+ * @cmp: pointer to comparison function
+ * @swap: pointer to swap function or NULL
+ *
+ * This function does a heapsort on the given array. You may provide a
+ * swap function optimized to your element type.
+ *
+ * Sorting time is O(n log n) both on average and worst-case. While
+ * qsort is about 20% faster on average, it suffers from exploitable
+ * O(n*n) worst-case behavior and extra memory requirements that make
+ * it less suitable for kernel use.
+ */
+#ifndef SORT_IMPLEMENTATION
+extern gnu_inline
+#endif
 void sort(void *base, size_t num, size_t size,
           int (*cmp)(const void *, const void *),
-          void (*swap)(void *, void *, size_t));
+          void (*swap)(void *, void *, size_t))
+{
+    /* pre-scale counters for performance */
+    size_t i = (num / 2) * size, n = num * size, c, r;
+
+    /* heapify */
+    while ( i > 0 )
+    {
+        for ( r = i -= size; r * 2 + size < n; r = c )
+        {
+            c = r * 2 + size;
+            if ( (c < n - size) && (cmp(base + c, base + c + size) < 0) )
+                c += size;
+            if ( cmp(base + r, base + c) >= 0 )
+                break;
+            swap(base + r, base + c, size);
+        }
+    }
+
+    /* sort */
+    for ( i = n; i > 0; )
+    {
+        i -= size;
+        swap(base, base + i, size);
+        for ( r = 0; r * 2 + size < i; r = c )
+        {
+            c = r * 2 + size;
+            if ( (c < i - size) && (cmp(base + c, base + c + size) < 0) )
+                c += size;
+            if ( cmp(base + r, base + c) >= 0 )
+                break;
+            swap(base + r, base + c, size);
+        }
+    }
+}
 
 #endif /* __XEN_SORT_H__ */
diff --git a/xen/lib/sort.c b/xen/lib/sort.c
index 35ce0d7abd..b7e78cc0e8 100644
--- a/xen/lib/sort.c
+++ b/xen/lib/sort.c
@@ -4,81 +4,5 @@
  * Jan 23 2005  Matt Mackall <mpm@selenic.com>
  */
 
-#include <xen/types.h>
-
-static void u32_swap(void *a, void *b, size_t size)
-{
-    uint32_t t = *(uint32_t *)a;
-
-    *(uint32_t *)a = *(uint32_t *)b;
-    *(uint32_t *)b = t;
-}
-
-static void generic_swap(void *a, void *b, size_t size)
-{
-    char t;
-
-    do {
-        t = *(char *)a;
-        *(char *)a++ = *(char *)b;
-        *(char *)b++ = t;
-    } while ( --size > 0 );
-}
-
-/*
- * sort - sort an array of elements
- * @base: pointer to data to sort
- * @num: number of elements
- * @size: size of each element
- * @cmp: pointer to comparison function
- * @swap: pointer to swap function or NULL
- *
- * This function does a heapsort on the given array. You may provide a
- * swap function optimized to your element type.
- *
- * Sorting time is O(n log n) both on average and worst-case. While
- * qsort is about 20% faster on average, it suffers from exploitable
- * O(n*n) worst-case behavior and extra memory requirements that make
- * it less suitable for kernel use.
- */
-
-void sort(void *base, size_t num, size_t size,
-          int (*cmp)(const void *, const void *),
-          void (*swap)(void *, void *, size_t size))
-{
-    /* pre-scale counters for performance */
-    size_t i = (num / 2) * size, n = num * size, c, r;
-
-    if ( !swap )
-        swap = (size == 4 ? u32_swap : generic_swap);
-
-    /* heapify */
-    while ( i > 0 )
-    {
-        for ( r = i -= size; r * 2 + size < n; r = c )
-        {
-            c = r * 2 + size;
-            if ( (c < n - size) && (cmp(base + c, base + c + size) < 0) )
-                c += size;
-            if ( cmp(base + r, base + c) >= 0 )
-                break;
-            swap(base + r, base + c, size);
-        }
-    }
-
-    /* sort */
-    for ( i = n; i > 0; )
-    {
-        i -= size;
-        swap(base, base + i, size);
-        for ( r = 0; r * 2 + size < i; r = c )
-        {
-            c = r * 2 + size;
-            if ( (c < i - size) && (cmp(base + c, base + c + size) < 0) )
-                c += size;
-            if ( cmp(base + r, base + c) >= 0 )
-                break;
-            swap(base + r, base + c, size);
-        }
-    }
-}
+#define SORT_IMPLEMENTATION
+#include <xen/sort.h>
Bertrand Marquis Feb. 16, 2022, 9:29 a.m. UTC | #5
Hi Stefano,

> On 16 Feb 2022, at 03:46, Stefano Stabellini <sstabellini@kernel.org> wrote:
> 
> On Mon, 14 Feb 2022, Julien Grall wrote:
>> On 14/02/2022 12:50, Andrew Cooper wrote:
>>> There are exactly 3 callers of sort() in the hypervisor.  Callbacks in a
>>> tight
>>> loop like this are problematic for performance, especially with Spectre v2
>>> protections, which is why extern inline is used commonly by libraries.
>>> 
>>> Both ARM callers pass in NULL for the swap function, and while this might
>>> seem
>>> like an attractive option at first, it causes generic_swap() to be used,
>>> which
>>> forced a byte-wise copy.  Provide real swap functions so the compiler can
>>> optimise properly, which is very important for ARM downstreams where
>>> milliseconds until the system is up matters.
>> 
>> Did you actually benchmark it? Both those lists will have < 128 elements in
>> them. So I would be extremely surprised if you save more than a few hundreds
>> microseconds with this approach.
>> 
>> So, my opinion on this approach hasn't changed. On v1, we discussed an
>> approach that would suit both Stefano and I. Jan seemed to confirm that would
>> also suit x86.
> 
> 
> This patch series has become 70 patches and for the sake of helping
> Andrew move forward in the quickest and most painless way possible, I
> append the following using generic_swap as static inline.
> 
> Julien, Bertrand, is that acceptable to you?

Any reason why we cannot, in this case, keep the NULL parameter in the
existing code and do the swap == NULL handling in the sort code?

Other than that this is acceptable for me, but I will let Julien say if he is
ok or not as I had no objections before.

Regards
Bertrand

> 
> Andrew, I know this is not your favorite approach but you have quite a
> lot of changes to handle -- probably not worth focussing on one detail
> which is pretty minor?
> 
> 
> ---
> xen/sort: Switch to an extern inline implementation
> 
> There are exactly 3 callers of sort() in the hypervisor.  Callbacks in a tight
> loop like this are problematic for performance, especially with Spectre v2
> protections, which is why extern inline is used commonly by libraries.
> 
> Make generic_swap() a static inline and used it at the two ARM call
> sites.
> 
> No functional change.
> 
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Signed-off-by: Stefano Stabellini <stefano.stabellini@xilinx.com>
> Reviewed-by: Jan Beulich <jbeulich@suse.com>
> ---
> xen/arch/arm/bootfdt.c |  2 +-
> xen/arch/arm/io.c      |  2 +-
> xen/include/xen/sort.h | 67 ++++++++++++++++++++++++++++++++++-
> xen/lib/sort.c         | 80 ++----------------------------------------
> 4 files changed, 70 insertions(+), 81 deletions(-)
> 
> diff --git a/xen/arch/arm/bootfdt.c b/xen/arch/arm/bootfdt.c
> index afaa0e249b..0d62945d56 100644
> --- a/xen/arch/arm/bootfdt.c
> +++ b/xen/arch/arm/bootfdt.c
> @@ -472,7 +472,7 @@ size_t __init boot_fdt_info(const void *fdt, paddr_t paddr)
>      * the banks sorted in ascending order. So sort them through.
>      */
>     sort(bootinfo.mem.bank, bootinfo.mem.nr_banks, sizeof(struct membank),
> -         cmp_memory_node, NULL);
> +         cmp_memory_node, generic_swap);
> 
>     early_print_info();
> 
> diff --git a/xen/arch/arm/io.c b/xen/arch/arm/io.c
> index 729287e37c..1f35aaeea6 100644
> --- a/xen/arch/arm/io.c
> +++ b/xen/arch/arm/io.c
> @@ -170,7 +170,7 @@ void register_mmio_handler(struct domain *d,
> 
>     /* Sort mmio handlers in ascending order based on base address */
>     sort(vmmio->handlers, vmmio->num_entries, sizeof(struct mmio_handler),
> -         cmp_mmio_handler, NULL);
> +         cmp_mmio_handler, generic_swap);
> 
>     write_unlock(&vmmio->lock);
> }
> diff --git a/xen/include/xen/sort.h b/xen/include/xen/sort.h
> index a403652948..f6065eda58 100644
> --- a/xen/include/xen/sort.h
> +++ b/xen/include/xen/sort.h
> @@ -3,8 +3,73 @@
> 
> #include <xen/types.h>
> 
> +extern gnu_inline
> +void generic_swap(void *a, void *b, size_t size)
> +{
> +    char t;
> +
> +    do {
> +        t = *(char *)a;
> +        *(char *)a++ = *(char *)b;
> +        *(char *)b++ = t;
> +    } while ( --size > 0 );
> +}
> +
> +/*
> + * sort - sort an array of elements
> + * @base: pointer to data to sort
> + * @num: number of elements
> + * @size: size of each element
> + * @cmp: pointer to comparison function
> + * @swap: pointer to swap function or NULL
> + *
> + * This function does a heapsort on the given array. You may provide a
> + * swap function optimized to your element type.
> + *
> + * Sorting time is O(n log n) both on average and worst-case. While
> + * qsort is about 20% faster on average, it suffers from exploitable
> + * O(n*n) worst-case behavior and extra memory requirements that make
> + * it less suitable for kernel use.
> + */
> +#ifndef SORT_IMPLEMENTATION
> +extern gnu_inline
> +#endif
> void sort(void *base, size_t num, size_t size,
>           int (*cmp)(const void *, const void *),
> -          void (*swap)(void *, void *, size_t));
> +          void (*swap)(void *, void *, size_t))
> +{
> +    /* pre-scale counters for performance */
> +    size_t i = (num / 2) * size, n = num * size, c, r;
> +
> +    /* heapify */
> +    while ( i > 0 )
> +    {
> +        for ( r = i -= size; r * 2 + size < n; r = c )
> +        {
> +            c = r * 2 + size;
> +            if ( (c < n - size) && (cmp(base + c, base + c + size) < 0) )
> +                c += size;
> +            if ( cmp(base + r, base + c) >= 0 )
> +                break;
> +            swap(base + r, base + c, size);
> +        }
> +    }
> +
> +    /* sort */
> +    for ( i = n; i > 0; )
> +    {
> +        i -= size;
> +        swap(base, base + i, size);
> +        for ( r = 0; r * 2 + size < i; r = c )
> +        {
> +            c = r * 2 + size;
> +            if ( (c < i - size) && (cmp(base + c, base + c + size) < 0) )
> +                c += size;
> +            if ( cmp(base + r, base + c) >= 0 )
> +                break;
> +            swap(base + r, base + c, size);
> +        }
> +    }
> +}
> 
> #endif /* __XEN_SORT_H__ */
> diff --git a/xen/lib/sort.c b/xen/lib/sort.c
> index 35ce0d7abd..b7e78cc0e8 100644
> --- a/xen/lib/sort.c
> +++ b/xen/lib/sort.c
> @@ -4,81 +4,5 @@
>  * Jan 23 2005  Matt Mackall <mpm@selenic.com>
>  */
> 
> -#include <xen/types.h>
> -
> -static void u32_swap(void *a, void *b, size_t size)
> -{
> -    uint32_t t = *(uint32_t *)a;
> -
> -    *(uint32_t *)a = *(uint32_t *)b;
> -    *(uint32_t *)b = t;
> -}
> -
> -static void generic_swap(void *a, void *b, size_t size)
> -{
> -    char t;
> -
> -    do {
> -        t = *(char *)a;
> -        *(char *)a++ = *(char *)b;
> -        *(char *)b++ = t;
> -    } while ( --size > 0 );
> -}
> -
> -/*
> - * sort - sort an array of elements
> - * @base: pointer to data to sort
> - * @num: number of elements
> - * @size: size of each element
> - * @cmp: pointer to comparison function
> - * @swap: pointer to swap function or NULL
> - *
> - * This function does a heapsort on the given array. You may provide a
> - * swap function optimized to your element type.
> - *
> - * Sorting time is O(n log n) both on average and worst-case. While
> - * qsort is about 20% faster on average, it suffers from exploitable
> - * O(n*n) worst-case behavior and extra memory requirements that make
> - * it less suitable for kernel use.
> - */
> -
> -void sort(void *base, size_t num, size_t size,
> -          int (*cmp)(const void *, const void *),
> -          void (*swap)(void *, void *, size_t size))
> -{
> -    /* pre-scale counters for performance */
> -    size_t i = (num / 2) * size, n = num * size, c, r;
> -
> -    if ( !swap )
> -        swap = (size == 4 ? u32_swap : generic_swap);
> -
> -    /* heapify */
> -    while ( i > 0 )
> -    {
> -        for ( r = i -= size; r * 2 + size < n; r = c )
> -        {
> -            c = r * 2 + size;
> -            if ( (c < n - size) && (cmp(base + c, base + c + size) < 0) )
> -                c += size;
> -            if ( cmp(base + r, base + c) >= 0 )
> -                break;
> -            swap(base + r, base + c, size);
> -        }
> -    }
> -
> -    /* sort */
> -    for ( i = n; i > 0; )
> -    {
> -        i -= size;
> -        swap(base, base + i, size);
> -        for ( r = 0; r * 2 + size < i; r = c )
> -        {
> -            c = r * 2 + size;
> -            if ( (c < i - size) && (cmp(base + c, base + c + size) < 0) )
> -                c += size;
> -            if ( cmp(base + r, base + c) >= 0 )
> -                break;
> -            swap(base + r, base + c, size);
> -        }
> -    }
> -}
> +#define SORT_IMPLEMENTATION
> +#include <xen/sort.h>
> -- 
> 2.25.1
>
Andrew Cooper Feb. 16, 2022, 10:44 a.m. UTC | #6
On 16/02/2022 03:46, Stefano Stabellini wrote:
> On Mon, 14 Feb 2022, Julien Grall wrote:
>> On 14/02/2022 12:50, Andrew Cooper wrote:
>>> There are exactly 3 callers of sort() in the hypervisor.  Callbacks in a
>>> tight
>>> loop like this are problematic for performance, especially with Spectre v2
>>> protections, which is why extern inline is used commonly by libraries.
>>>
>>> Both ARM callers pass in NULL for the swap function, and while this might
>>> seem
>>> like an attractive option at first, it causes generic_swap() to be used,
>>> which
>>> forced a byte-wise copy.  Provide real swap functions so the compiler can
>>> optimise properly, which is very important for ARM downstreams where
>>> milliseconds until the system is up matters.
>> Did you actually benchmark it? Both those lists will have < 128 elements in
>> them. So I would be extremely surprised if you save more than a few hundreds
>> microseconds with this approach.
>>
>> So, my opinion on this approach hasn't changed. On v1, we discussed an
>> approach that would suit both Stefano and I. Jan seemed to confirm that would
>> also suit x86.
> This patch series has become 70 patches and for the sake of helping
> Andrew move forward in the quickest and most painless way possible, I
> append the following using generic_swap as static inline.
>
> Julien, Bertrand, is that acceptable to you?
>
> Andrew, I know this is not your favorite approach but you have quite a
> lot of changes to handle -- probably not worth focussing on one detail
> which is pretty minor?

It's not pretty minor.  My version really is the best thing for ARM.

The perf improvement alone, marginal as it may be in practice, is
justification alone for the patch, and Bertrand's R-by is testament to this.

But the reasons why getting rid the swap functions is important for
CET-IBT on x86 are exactly the same as why getting rid of them on ARM
will be important for BTI support.  A tagged function doing an arbitrary
bytewise swap from two parameters controlled by the third is far more
valuable to an attackers gadget library than a typical function.

i.e. this proposed intermediary, if it compiles, is just busywork which
someone else is going to have to revert in the future, along with having
this argument again.

~Andrew
Julien Grall Feb. 16, 2022, 11:46 a.m. UTC | #7
Hi,

On 16/02/2022 10:44, Andrew Cooper wrote:
> On 16/02/2022 03:46, Stefano Stabellini wrote:
>> On Mon, 14 Feb 2022, Julien Grall wrote:
>>> On 14/02/2022 12:50, Andrew Cooper wrote:
>>>> There are exactly 3 callers of sort() in the hypervisor.  Callbacks in a
>>>> tight
>>>> loop like this are problematic for performance, especially with Spectre v2
>>>> protections, which is why extern inline is used commonly by libraries.
>>>>
>>>> Both ARM callers pass in NULL for the swap function, and while this might
>>>> seem
>>>> like an attractive option at first, it causes generic_swap() to be used,
>>>> which
>>>> forced a byte-wise copy.  Provide real swap functions so the compiler can
>>>> optimise properly, which is very important for ARM downstreams where
>>>> milliseconds until the system is up matters.
>>> Did you actually benchmark it? Both those lists will have < 128 elements in
>>> them. So I would be extremely surprised if you save more than a few hundreds
>>> microseconds with this approach.
>>>
>>> So, my opinion on this approach hasn't changed. On v1, we discussed an
>>> approach that would suit both Stefano and I. Jan seemed to confirm that would
>>> also suit x86.
>> This patch series has become 70 patches and for the sake of helping
>> Andrew move forward in the quickest and most painless way possible, I
>> append the following using generic_swap as static inline.
>>
>> Julien, Bertrand, is that acceptable to you?
>>
>> Andrew, I know this is not your favorite approach but you have quite a
>> lot of changes to handle -- probably not worth focussing on one detail
>> which is pretty minor?
> 
> It's not pretty minor.  My version really is the best thing for ARM.
 >
> The perf improvement alone, marginal as it may be in practice, is

Our take on Arm has always been to avoid micro-optimization when the 
resulting code is more difficult to maintain.

> justification alone for the patch, and Bertrand's R-by is testament to this.

I am not sure why calling out that Bertrand agreed means that everyone 
else should accept your approach.

This reminds me of other series that have been blocked for a long time by 
you. Yet you made no effort to compromise... How ironic.

> 
> But the reasons why getting rid the swap functions is important for
> CET-IBT on x86 are exactly the same as why getting rid of them on ARM
> will be important for BTI support.  A tagged function doing an arbitrary
> bytewise swap from two parameters controlled by the third is far more
> valuable to an attackers gadget library than a typical function.

This is a more compelling reason. However, I am a bit puzzled why it 
took you so long to come up with this reason.

> i.e. this proposed intermediary, if it compiles, is just busywork which
> someone else is going to have to revert in the future, along with having
> this argument again.

Well, this argument would have never happened if your commit message 
contained information about BTI.

Instead you decided to mention only the performance part, despite my 
objecting to it and requesting a different rationale in v1 (see [1]).

Anyway, I will wait for a reword of the commit message before lifting my 
Nacked-by.

Cheers,

[1] 
https://lore.kernel.org/xen-devel/f7bb7a08-4721-f2a8-8602-0cd1baf1f083@xen.org/
Bertrand Marquis Feb. 16, 2022, 11:55 a.m. UTC | #8
Hi,

> On 16 Feb 2022, at 11:46, Julien Grall <julien@xen.org> wrote:
> 
> Hi,
> 
> On 16/02/2022 10:44, Andrew Cooper wrote:
>> On 16/02/2022 03:46, Stefano Stabellini wrote:
>>> On Mon, 14 Feb 2022, Julien Grall wrote:
>>>> On 14/02/2022 12:50, Andrew Cooper wrote:
>>>>> There are exactly 3 callers of sort() in the hypervisor.  Callbacks in a
>>>>> tight
>>>>> loop like this are problematic for performance, especially with Spectre v2
>>>>> protections, which is why extern inline is used commonly by libraries.
>>>>> 
>>>>> Both ARM callers pass in NULL for the swap function, and while this might
>>>>> seem
>>>>> like an attractive option at first, it causes generic_swap() to be used,
>>>>> which
>>>>> forced a byte-wise copy.  Provide real swap functions so the compiler can
>>>>> optimise properly, which is very important for ARM downstreams where
>>>>> milliseconds until the system is up matters.
>>>> Did you actually benchmark it? Both those lists will have < 128 elements in
>>>> them. So I would be extremely surprised if you save more than a few hundreds
>>>> microseconds with this approach.
>>>> 
>>>> So, my opinion on this approach hasn't changed. On v1, we discussed an
>>>> approach that would suit both Stefano and I. Jan seemed to confirm that would
>>>> also suit x86.
>>> This patch series has become 70 patches and for the sake of helping
>>> Andrew move forward in the quickest and most painless way possible, I
>>> append the following using generic_swap as static inline.
>>> 
>>> Julien, Bertrand, is that acceptable to you?
>>> 
>>> Andrew, I know this is not your favorite approach but you have quite a
>>> lot of changes to handle -- probably not worth focussing on one detail
>>> which is pretty minor?
>> It's not pretty minor.  My version really is the best thing for ARM.
> >
>> The perf improvement alone, marginal as it may be in practice, is
> 
> Our take on Arm has always been to avoid micro-optimization when the resulting code is more difficult to maintain.
> 
>> justification alone for the patch, and Bertrand's R-by is testament to this.
> 
> I am not sure why calling out that Bertrand agreed means that everyone else should accept your approach.
> 
> This reminds me other series that have been blocked for a long time by you. Yet you made no effort to compromise... How ironic.
> 
>> But the reasons why getting rid the swap functions is important for
>> CET-IBT on x86 are exactly the same as why getting rid of them on ARM
>> will be important for BTI support.  A tagged function doing an arbitrary
>> bytewise swap from two parameters controlled by the third is far more
>> valuable to an attackers gadget library than a typical function.
> 
> This is a more compelling reason. However, I am a bit puzzled why it took you so long to come up with this reason.
> 
>> i.e. this proposed intermediary, if it compiles, is just busywork which
>> someone else is going to have to revert in the future, along with having
>> this argument again.
> 
> Well, this argument would have never happened if your commit message contained information about BTI.

I agree that this would be nice to have in the commit message as a justification for the change.

Cheers
Bertrand

> 
> Instead you decided to just mention the performance part despite me objecting it and requesting for a different reason in v1 (see [1]).
> 
> Anyway, I will wait for a reword of the commit message before lifting my Nacked-by.
> 
> Cheers,
> 
> [1] https://lore.kernel.org/xen-devel/f7bb7a08-4721-f2a8-8602-0cd1baf1f083@xen.org/
> 
> -- 
> Julien Grall
diff mbox series

Patch

diff --git a/xen/arch/arm/bootfdt.c b/xen/arch/arm/bootfdt.c
index afaa0e249b71..e318ef960386 100644
--- a/xen/arch/arm/bootfdt.c
+++ b/xen/arch/arm/bootfdt.c
@@ -448,6 +448,13 @@  static int __init cmp_memory_node(const void *key, const void *elem)
     return 0;
 }
 
+static void __init swap_memory_node(void *_a, void *_b, size_t size)
+{
+    struct membank *a = _a, *b = _b;
+
+    SWAP(*a, *b);
+}
+
 /**
  * boot_fdt_info - initialize bootinfo from a DTB
  * @fdt: flattened device tree binary
@@ -472,7 +479,7 @@  size_t __init boot_fdt_info(const void *fdt, paddr_t paddr)
      * the banks sorted in ascending order. So sort them through.
      */
     sort(bootinfo.mem.bank, bootinfo.mem.nr_banks, sizeof(struct membank),
-         cmp_memory_node, NULL);
+         cmp_memory_node, swap_memory_node);
 
     early_print_info();
 
diff --git a/xen/arch/arm/io.c b/xen/arch/arm/io.c
index 729287e37c59..1a066f9ae502 100644
--- a/xen/arch/arm/io.c
+++ b/xen/arch/arm/io.c
@@ -80,6 +80,13 @@  static int cmp_mmio_handler(const void *key, const void *elem)
     return 0;
 }
 
+static void swap_mmio_handler(void *_a, void *_b, size_t size)
+{
+    struct mmio_handler *a = _a, *b = _b;
+
+    SWAP(*a, *b);
+}
+
 static const struct mmio_handler *find_mmio_handler(struct domain *d,
                                                     paddr_t gpa)
 {
@@ -170,7 +177,7 @@  void register_mmio_handler(struct domain *d,
 
     /* Sort mmio handlers in ascending order based on base address */
     sort(vmmio->handlers, vmmio->num_entries, sizeof(struct mmio_handler),
-         cmp_mmio_handler, NULL);
+         cmp_mmio_handler, swap_mmio_handler);
 
     write_unlock(&vmmio->lock);
 }
diff --git a/xen/include/xen/sort.h b/xen/include/xen/sort.h
index a403652948e7..01479ea44606 100644
--- a/xen/include/xen/sort.h
+++ b/xen/include/xen/sort.h
@@ -3,8 +3,61 @@ 
 
 #include <xen/types.h>
 
+/*
+ * sort - sort an array of elements
+ * @base: pointer to data to sort
+ * @num: number of elements
+ * @size: size of each element
+ * @cmp: pointer to comparison function
+ * @swap: pointer to swap function or NULL
+ *
+ * This function does a heapsort on the given array. You may provide a
+ * swap function optimized to your element type.
+ *
+ * Sorting time is O(n log n) both on average and worst-case. While
+ * qsort is about 20% faster on average, it suffers from exploitable
+ * O(n*n) worst-case behavior and extra memory requirements that make
+ * it less suitable for kernel use.
+ */
+#ifndef SORT_IMPLEMENTATION
+extern gnu_inline
+#endif
 void sort(void *base, size_t num, size_t size,
           int (*cmp)(const void *, const void *),
-          void (*swap)(void *, void *, size_t));
+          void (*swap)(void *, void *, size_t))
+{
+    /* pre-scale counters for performance */
+    size_t i = (num / 2) * size, n = num * size, c, r;
+
+    /* heapify */
+    while ( i > 0 )
+    {
+        for ( r = i -= size; r * 2 + size < n; r = c )
+        {
+            c = r * 2 + size;
+            if ( (c < n - size) && (cmp(base + c, base + c + size) < 0) )
+                c += size;
+            if ( cmp(base + r, base + c) >= 0 )
+                break;
+            swap(base + r, base + c, size);
+        }
+    }
+
+    /* sort */
+    for ( i = n; i > 0; )
+    {
+        i -= size;
+        swap(base, base + i, size);
+        for ( r = 0; r * 2 + size < i; r = c )
+        {
+            c = r * 2 + size;
+            if ( (c < i - size) && (cmp(base + c, base + c + size) < 0) )
+                c += size;
+            if ( cmp(base + r, base + c) >= 0 )
+                break;
+            swap(base + r, base + c, size);
+        }
+    }
+}
 
 #endif /* __XEN_SORT_H__ */
diff --git a/xen/lib/sort.c b/xen/lib/sort.c
index 35ce0d7abdec..b7e78cc0e8d2 100644
--- a/xen/lib/sort.c
+++ b/xen/lib/sort.c
@@ -4,81 +4,5 @@ 
  * Jan 23 2005  Matt Mackall <mpm@selenic.com>
  */
 
-#include <xen/types.h>
-
-static void u32_swap(void *a, void *b, size_t size)
-{
-    uint32_t t = *(uint32_t *)a;
-
-    *(uint32_t *)a = *(uint32_t *)b;
-    *(uint32_t *)b = t;
-}
-
-static void generic_swap(void *a, void *b, size_t size)
-{
-    char t;
-
-    do {
-        t = *(char *)a;
-        *(char *)a++ = *(char *)b;
-        *(char *)b++ = t;
-    } while ( --size > 0 );
-}
-
-/*
- * sort - sort an array of elements
- * @base: pointer to data to sort
- * @num: number of elements
- * @size: size of each element
- * @cmp: pointer to comparison function
- * @swap: pointer to swap function or NULL
- *
- * This function does a heapsort on the given array. You may provide a
- * swap function optimized to your element type.
- *
- * Sorting time is O(n log n) both on average and worst-case. While
- * qsort is about 20% faster on average, it suffers from exploitable
- * O(n*n) worst-case behavior and extra memory requirements that make
- * it less suitable for kernel use.
- */
-
-void sort(void *base, size_t num, size_t size,
-          int (*cmp)(const void *, const void *),
-          void (*swap)(void *, void *, size_t size))
-{
-    /* pre-scale counters for performance */
-    size_t i = (num / 2) * size, n = num * size, c, r;
-
-    if ( !swap )
-        swap = (size == 4 ? u32_swap : generic_swap);
-
-    /* heapify */
-    while ( i > 0 )
-    {
-        for ( r = i -= size; r * 2 + size < n; r = c )
-        {
-            c = r * 2 + size;
-            if ( (c < n - size) && (cmp(base + c, base + c + size) < 0) )
-                c += size;
-            if ( cmp(base + r, base + c) >= 0 )
-                break;
-            swap(base + r, base + c, size);
-        }
-    }
-
-    /* sort */
-    for ( i = n; i > 0; )
-    {
-        i -= size;
-        swap(base, base + i, size);
-        for ( r = 0; r * 2 + size < i; r = c )
-        {
-            c = r * 2 + size;
-            if ( (c < i - size) && (cmp(base + c, base + c + size) < 0) )
-                c += size;
-            if ( cmp(base + r, base + c) >= 0 )
-                break;
-            swap(base + r, base + c, size);
-        }
-    }
-}
+#define SORT_IMPLEMENTATION
+#include <xen/sort.h>