diff mbox

[v3,02/16] x86: zero BSS using stosl instead of stosb

Message ID 1460723596-13261-3-git-send-email-daniel.kiper@oracle.com (mailing list archive)
State New, archived
Headers show

Commit Message

Daniel Kiper April 15, 2016, 12:33 p.m. UTC
Speed up BSS initialization by using stosl instead of stosb.

Some may argue that Intel Ivy Bridge and later provide the ERMSB feature,
which means that "rep stosb" gives better throughput than "rep stosl" on
those CPUs. However, this feature is only available on newer Intel
processors; AMD, for example, does not provide it at all. So stosb gives
a real benefit, and beats stosl, only on a limited number of machines.
On the other hand, stosl speeds up BSS initialization on all x86
platforms. Hence, use stosl instead of stosb.

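Since the byte count is now converted to a count of 4-byte words
(shr $2), also align the BSS start and end to 4 bytes in the linker
script so that no trailing bytes are left uninitialized.

Additionally, bring the relevant comment in line with the coding style.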

Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Daniel Kiper <daniel.kiper@oracle.com>
---
v3 - suggestions/fixes:
   - improve comments
     (suggested by Konrad Rzeszutek Wilk),
   - improve commit message
     (suggested by Jan Beulich).
---
 xen/arch/x86/boot/head.S |    5 +++--
 xen/arch/x86/xen.lds.S   |    3 +++
 2 files changed, 6 insertions(+), 2 deletions(-)

Comments

Konrad Rzeszutek Wilk April 15, 2016, 1:57 p.m. UTC | #1
On Fri, Apr 15, 2016 at 02:33:02PM +0200, Daniel Kiper wrote:
> Speedup BSS initialization by using stosl instead of stosb.
> 
> Some may argue that Intel Ivy Bridge and later provide ERMSB feature.
> This means that "rep stosb" gives better throughput than "rep stosl" on
> above mentioned CPUs. However, this feature is only available on newer
> Intel processors and e.g. AMD does not provide it at all. So, stosb will
> just give real benefits and even beat stosl only on limited number of
> machines. On the other hand stosl will speedup BSS initialization on
> all x86 platforms. Hence, use stosl instead of stosb.
> 
> Additionally, align relevant comment to coding style.
> 
> Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Signed-off-by: Daniel Kiper <daniel.kiper@oracle.com>

Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
> ---
> v3 - suggestions/fixes:
>    - improve comments
>      (suggested by Konrad Rzeszutek Wilk),
>    - improve commit message
>      (suggested by Jan Beulich).
> ---
>  xen/arch/x86/boot/head.S |    5 +++--
>  xen/arch/x86/xen.lds.S   |    3 +++
>  2 files changed, 6 insertions(+), 2 deletions(-)
> 
> diff --git a/xen/arch/x86/boot/head.S b/xen/arch/x86/boot/head.S
> index f3501fd..32a54a0 100644
> --- a/xen/arch/x86/boot/head.S
> +++ b/xen/arch/x86/boot/head.S
> @@ -123,12 +123,13 @@ __start:
>          call    reloc
>          mov     %eax,sym_phys(multiboot_ptr)
>  
> -        /* Initialize BSS (no nasty surprises!) */
> +        /* Initialize BSS (no nasty surprises!). */
>          mov     $sym_phys(__bss_start),%edi
>          mov     $sym_phys(__bss_end),%ecx
>          sub     %edi,%ecx
> +        shr     $2,%ecx
>          xor     %eax,%eax
> -        rep     stosb
> +        rep     stosl
>  
>          /* Interrogate CPU extended features via CPUID. */
>          mov     $0x80000000,%eax
> diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S
> index 961f48f..6802da1 100644
> --- a/xen/arch/x86/xen.lds.S
> +++ b/xen/arch/x86/xen.lds.S
> @@ -191,6 +191,8 @@ SECTIONS
>         CONSTRUCTORS
>    } :text
>  
> +  /* Align BSS to speedup its initialization. */
> +  . = ALIGN(4);
>    .bss : {                     /* BSS */
>         . = ALIGN(STACK_SIZE);
>         __bss_start = .;
> @@ -205,6 +207,7 @@ SECTIONS
>         *(.bss.percpu.read_mostly)
>         . = ALIGN(SMP_CACHE_BYTES);
>         __per_cpu_data_end = .;
> +       . = ALIGN(4);
>         __bss_end = .;
>    } :text
>    _end = . ;
> -- 
> 1.7.10.4
>
Andrew Cooper April 15, 2016, 3:48 p.m. UTC | #2
On 15/04/16 13:33, Daniel Kiper wrote:
> Speedup BSS initialization by using stosl instead of stosb.
>
> Some may argue that Intel Ivy Bridge and later provide ERMSB feature.
> This means that "rep stosb" gives better throughput than "rep stosl" on
> above mentioned CPUs. However, this feature is only available on newer
> Intel processors and e.g. AMD does not provide it at all. So, stosb will
> just give real benefits and even beat stosl only on limited number of
> machines. On the other hand stosl will speedup BSS initialization on
> all x86 platforms. Hence, use stosl instead of stosb.
>
> Additionally, align relevant comment to coding style.
>
> Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Signed-off-by: Daniel Kiper <daniel.kiper@oracle.com>
> ---
> v3 - suggestions/fixes:
>    - improve comments
>      (suggested by Konrad Rzeszutek Wilk),
>    - improve commit message
>      (suggested by Jan Beulich).
> ---
>  xen/arch/x86/boot/head.S |    5 +++--
>  xen/arch/x86/xen.lds.S   |    3 +++
>  2 files changed, 6 insertions(+), 2 deletions(-)
>
> diff --git a/xen/arch/x86/boot/head.S b/xen/arch/x86/boot/head.S
> index f3501fd..32a54a0 100644
> --- a/xen/arch/x86/boot/head.S
> +++ b/xen/arch/x86/boot/head.S
> @@ -123,12 +123,13 @@ __start:
>          call    reloc
>          mov     %eax,sym_phys(multiboot_ptr)
>  
> -        /* Initialize BSS (no nasty surprises!) */
> +        /* Initialize BSS (no nasty surprises!). */
>          mov     $sym_phys(__bss_start),%edi
>          mov     $sym_phys(__bss_end),%ecx
>          sub     %edi,%ecx
> +        shr     $2,%ecx
>          xor     %eax,%eax
> -        rep     stosb
> +        rep     stosl
>  
>          /* Interrogate CPU extended features via CPUID. */
>          mov     $0x80000000,%eax
> diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S
> index 961f48f..6802da1 100644
> --- a/xen/arch/x86/xen.lds.S
> +++ b/xen/arch/x86/xen.lds.S
> @@ -191,6 +191,8 @@ SECTIONS
>         CONSTRUCTORS
>    } :text
>  
> +  /* Align BSS to speedup its initialization. */
> +  . = ALIGN(4);

This is not needed.  There is already appropriate alignment before
__bss_start.

Also, you need to rebase this series onto staging - there are a lot of
changes you are missing.

~Andrew

>    .bss : {                     /* BSS */
>         . = ALIGN(STACK_SIZE);
>         __bss_start = .;
> @@ -205,6 +207,7 @@ SECTIONS
>         *(.bss.percpu.read_mostly)
>         . = ALIGN(SMP_CACHE_BYTES);
>         __per_cpu_data_end = .;
> +       . = ALIGN(4);
>         __bss_end = .;
>    } :text
>    _end = . ;
diff mbox

Patch

diff --git a/xen/arch/x86/boot/head.S b/xen/arch/x86/boot/head.S
index f3501fd..32a54a0 100644
--- a/xen/arch/x86/boot/head.S
+++ b/xen/arch/x86/boot/head.S
@@ -123,12 +123,13 @@  __start:
         call    reloc
         mov     %eax,sym_phys(multiboot_ptr)
 
-        /* Initialize BSS (no nasty surprises!) */
+        /* Initialize BSS (no nasty surprises!). */
         mov     $sym_phys(__bss_start),%edi
         mov     $sym_phys(__bss_end),%ecx
         sub     %edi,%ecx
+        shr     $2,%ecx
         xor     %eax,%eax
-        rep     stosb
+        rep     stosl
 
         /* Interrogate CPU extended features via CPUID. */
         mov     $0x80000000,%eax
diff --git a/xen/arch/x86/xen.lds.S b/xen/arch/x86/xen.lds.S
index 961f48f..6802da1 100644
--- a/xen/arch/x86/xen.lds.S
+++ b/xen/arch/x86/xen.lds.S
@@ -191,6 +191,8 @@  SECTIONS
        CONSTRUCTORS
   } :text
 
+  /* Align BSS to speedup its initialization. */
+  . = ALIGN(4);
   .bss : {                     /* BSS */
        . = ALIGN(STACK_SIZE);
        __bss_start = .;
@@ -205,6 +207,7 @@  SECTIONS
        *(.bss.percpu.read_mostly)
        . = ALIGN(SMP_CACHE_BYTES);
        __per_cpu_data_end = .;
+       . = ALIGN(4);
        __bss_end = .;
   } :text
   _end = . ;