diff mbox series

[XEN,1/9] x86/boot: choose AP stack based on APIC ID

Message ID 0e7dd957b6f26fa7b752bdce1ef6ebe97c825903.1699982111.git.krystian.hebel@3mdeb.com (mailing list archive)
State New, archived
Headers show
Series x86: parallelize AP bring-up during boot | expand

Commit Message

Krystian Hebel Nov. 14, 2023, 5:49 p.m. UTC
This is made as first step of making parallel AP bring-up possible. It
should be enough for pre-C code.

Signed-off-by: Krystian Hebel <krystian.hebel@3mdeb.com>
---
 xen/arch/x86/boot/trampoline.S | 20 ++++++++++++++++++++
 xen/arch/x86/boot/x86_64.S     | 28 +++++++++++++++++++++++++++-
 xen/arch/x86/setup.c           |  7 +++++++
 3 files changed, 54 insertions(+), 1 deletion(-)

Comments

Julien Grall Jan. 26, 2024, 6:30 p.m. UTC | #1
Hi,

I am not too familiary with the x86 boot code. But I will give a try to 
review :).

On 14/11/2023 17:49, Krystian Hebel wrote:
> This is made as first step of making parallel AP bring-up possible. It
> should be enough for pre-C code.
> 
> Signed-off-by: Krystian Hebel <krystian.hebel@3mdeb.com>
> ---
>   xen/arch/x86/boot/trampoline.S | 20 ++++++++++++++++++++
>   xen/arch/x86/boot/x86_64.S     | 28 +++++++++++++++++++++++++++-
>   xen/arch/x86/setup.c           |  7 +++++++
>   3 files changed, 54 insertions(+), 1 deletion(-)
> 
> diff --git a/xen/arch/x86/boot/trampoline.S b/xen/arch/x86/boot/trampoline.S
> index b8ab0ffdcbb0..ec254125016d 100644
> --- a/xen/arch/x86/boot/trampoline.S
> +++ b/xen/arch/x86/boot/trampoline.S
> @@ -72,6 +72,26 @@ trampoline_protmode_entry:
>           mov     $X86_CR4_PAE,%ecx
>           mov     %ecx,%cr4
>   
> +        /*
> +         * Get APIC ID while we're in non-paged mode. Start by checking if
> +         * x2APIC is enabled.
> +         */
> +        mov     $MSR_APIC_BASE, %ecx
> +        rdmsr
> +        and     $APIC_BASE_EXTD, %eax
> +        jnz     .Lx2apic
> +
> +        /* Not x2APIC, read from MMIO */
> +        mov     0xfee00020, %esp
> +        shr     $24, %esp
> +        jmp     1f
> +
> +.Lx2apic:
> +        mov     $(MSR_X2APIC_FIRST + (0x20 >> 4)), %ecx
> +        rdmsr
> +        mov     %eax, %esp
> +1:
> +
>           /* Load pagetable base register. */
>           mov     $sym_offs(idle_pg_table),%eax
>           add     bootsym_rel(trampoline_xen_phys_start,4,%eax)
> diff --git a/xen/arch/x86/boot/x86_64.S b/xen/arch/x86/boot/x86_64.S
> index 04bb62ae8680..b85b47b5c1a0 100644
> --- a/xen/arch/x86/boot/x86_64.S
> +++ b/xen/arch/x86/boot/x86_64.S
> @@ -15,7 +15,33 @@ ENTRY(__high_start)
>           mov     $XEN_MINIMAL_CR4,%rcx
>           mov     %rcx,%cr4
>   
> -        mov     stack_start(%rip),%rsp
> +        test    %ebx,%ebx
> +        cmovz   stack_start(%rip), %rsp
> +        jz      .L_stack_set
> +
> +        /* APs only: get stack base from APIC ID saved in %esp. */
> +        mov     $-1, %rax
> +        lea     x86_cpu_to_apicid(%rip), %rcx
I would consider to move this patch after #2 and #3, so the logic is not 
modified again. This would help the review.

> +1:
> +        add     $1, %rax
> +        cmp     $NR_CPUS, %eax
> +        jb      2f
I think we can get rid of this jump by reworking the loop so %eax is 
tested as the end of the loop. But this is boot code, so it is possibly 
not worth it. I will leave the x86 maintainers commenting.

> +        hlt
> +2:
> +        cmp     %esp, (%rcx, %rax, 4)
> +        jne     1b
> +
> +        /* %eax is now Xen CPU index. */
> +        lea     stack_base(%rip), %rcx
> +        mov     (%rcx, %rax, 8), %rsp
> +
> +        test    %rsp,%rsp
> +        jnz     1f
> +        hlt
> +1:
NIT: Can you use 3? This makes the code easier to read and less prone to 
error (you have two very close 1).

> +        add     $(STACK_SIZE - CPUINFO_sizeof), %rsp
> +
> +.L_stack_set:
>   
>           /* Reset EFLAGS (subsumes CLI and CLD). */
>           pushq   $0
> diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
> index a3d3f797bb1e..1285969901e0 100644
> --- a/xen/arch/x86/setup.c
> +++ b/xen/arch/x86/setup.c
> @@ -1951,6 +1951,7 @@ void __init noreturn __start_xen(unsigned long mbi_p)
>        */
>       if ( !pv_shim )
>       {
> +        /* Separate loop to make parallel AP bringup possible. */

The loop split seems to be unrelated to this patch. Actually, I was 
expecting that only the assembly code would be modified.

>           for_each_present_cpu ( i )
>           {
>               /* Set up cpu_to_node[]. */
> @@ -1958,6 +1959,12 @@ void __init noreturn __start_xen(unsigned long mbi_p)
>               /* Set up node_to_cpumask based on cpu_to_node[]. */
>               numa_add_cpu(i);
>   
> +            if ( stack_base[i] == NULL )
> +                stack_base[i] = cpu_alloc_stack(i);

I don't quite understand this change at least in the context of this 
patch. AFAICT the stack will be currently allocated in 
cpu_smpboot_callback() which is called while the CPU is prepared. So you 
should not need this allocation right now.

Looking at the rest of the series, it seems you allocate the stack 
earlier so you start the CPU bring-up earlier. But they will still be 
held in assembly code until cpu_up() is called.

So effectively, part of the C part of the CPUs bring-up is still 
serialized. Did I understand the logic correctly?

If so, I would suggest to clarify it in the series because this wasn't 
obvious to me (I was expecting start_secondary() would also run in 
parallell).

Regarding the change in setup.c, I think it would make more sense in 
patch #9.

Cheers,
Jan Beulich Feb. 7, 2024, 4:11 p.m. UTC | #2
On 14.11.2023 18:49, Krystian Hebel wrote:
> --- a/xen/arch/x86/boot/trampoline.S
> +++ b/xen/arch/x86/boot/trampoline.S
> @@ -72,6 +72,26 @@ trampoline_protmode_entry:
>          mov     $X86_CR4_PAE,%ecx
>          mov     %ecx,%cr4
>  
> +        /*
> +         * Get APIC ID while we're in non-paged mode. Start by checking if
> +         * x2APIC is enabled.
> +         */
> +        mov     $MSR_APIC_BASE, %ecx
> +        rdmsr
> +        and     $APIC_BASE_EXTD, %eax

You don't use the result, in which case "test" is to be preferred over
"and".

> +        jnz     .Lx2apic
> +
> +        /* Not x2APIC, read from MMIO */
> +        mov     0xfee00020, %esp

Please don't open-code existing constants (APIC_ID here and below,
APIC_DEFAULT_PHYS_BASE just here, and ...

> +        shr     $24, %esp

... a to-be-introduced constant here (for {G,S}ET_xAPIC_ID() to use as
well then). This is the only way of being able to easily identify all
pieces of code accessing the same piece of hardware.

> +        jmp     1f
> +
> +.Lx2apic:
> +        mov     $(MSR_X2APIC_FIRST + (0x20 >> 4)), %ecx
> +        rdmsr
> +        mov     %eax, %esp
> +1:

Overall I'm worried of the use of %esp throughout here.

> --- a/xen/arch/x86/boot/x86_64.S
> +++ b/xen/arch/x86/boot/x86_64.S
> @@ -15,7 +15,33 @@ ENTRY(__high_start)
>          mov     $XEN_MINIMAL_CR4,%rcx
>          mov     %rcx,%cr4
>  
> -        mov     stack_start(%rip),%rsp
> +        test    %ebx,%ebx

Nit (style): Elsewhere you have blanks after the commas, just here
(and once again near the end of the hunk) you don't.

> +        cmovz   stack_start(%rip), %rsp
> +        jz      .L_stack_set
> +
> +        /* APs only: get stack base from APIC ID saved in %esp. */
> +        mov     $-1, %rax

Why a 64-bit insn here and ...

> +        lea     x86_cpu_to_apicid(%rip), %rcx
> +1:
> +        add     $1, %rax

... here, when you only use (far less than) 32-bit values?

> +        cmp     $NR_CPUS, %eax
> +        jb      2f
> +        hlt
> +2:
> +        cmp     %esp, (%rcx, %rax, 4)
> +        jne     1b

Aren't you open-coding REPNE SCASD here?

> +        /* %eax is now Xen CPU index. */
> +        lea     stack_base(%rip), %rcx
> +        mov     (%rcx, %rax, 8), %rsp
> +
> +        test    %rsp,%rsp
> +        jnz     1f
> +        hlt
> +1:
> +        add     $(STACK_SIZE - CPUINFO_sizeof), %rsp

Even this post-adjustment is worrying me. Imo the stack pointer is
better set exactly once, to its final value. Keeping it at its init
value of 0 until then yields more predictable results in case it
ends up being used somewhere.

> --- a/xen/arch/x86/setup.c
> +++ b/xen/arch/x86/setup.c
> @@ -1951,6 +1951,7 @@ void __init noreturn __start_xen(unsigned long mbi_p)
>       */
>      if ( !pv_shim )
>      {
> +        /* Separate loop to make parallel AP bringup possible. */
>          for_each_present_cpu ( i )
>          {
>              /* Set up cpu_to_node[]. */
> @@ -1958,6 +1959,12 @@ void __init noreturn __start_xen(unsigned long mbi_p)
>              /* Set up node_to_cpumask based on cpu_to_node[]. */
>              numa_add_cpu(i);
>  
> +            if ( stack_base[i] == NULL )
> +                stack_base[i] = cpu_alloc_stack(i);
> +        }

Imo this wants accompanying by removal of the allocation in
cpu_smpboot_alloc(). Which would then make more visible that there's
error checking there, but not here (I realize there effectively is
error checking in assembly code, but that'll end in HLT with no
useful indication of what the problem is). Provided, as Julien has
pointed out, that the change is necessary in the first place.

Jan
Krystian Hebel March 12, 2024, 2:14 p.m. UTC | #3
Hi,

On 26.01.2024 19:30, Julien Grall wrote:
> Hi,
>
> I am not too familiary with the x86 boot code. But I will give a try 
> to review :).
>
> On 14/11/2023 17:49, Krystian Hebel wrote:
>> This is made as first step of making parallel AP bring-up possible. It
>> should be enough for pre-C code.
>>
>> Signed-off-by: Krystian Hebel <krystian.hebel@3mdeb.com>
>> ---
>>   xen/arch/x86/boot/trampoline.S | 20 ++++++++++++++++++++
>>   xen/arch/x86/boot/x86_64.S     | 28 +++++++++++++++++++++++++++-
>>   xen/arch/x86/setup.c           |  7 +++++++
>>   3 files changed, 54 insertions(+), 1 deletion(-)
>>
>> diff --git a/xen/arch/x86/boot/trampoline.S 
>> b/xen/arch/x86/boot/trampoline.S
>> index b8ab0ffdcbb0..ec254125016d 100644
>> --- a/xen/arch/x86/boot/trampoline.S
>> +++ b/xen/arch/x86/boot/trampoline.S
>> @@ -72,6 +72,26 @@ trampoline_protmode_entry:
>>           mov     $X86_CR4_PAE,%ecx
>>           mov     %ecx,%cr4
>>   +        /*
>> +         * Get APIC ID while we're in non-paged mode. Start by 
>> checking if
>> +         * x2APIC is enabled.
>> +         */
>> +        mov     $MSR_APIC_BASE, %ecx
>> +        rdmsr
>> +        and     $APIC_BASE_EXTD, %eax
>> +        jnz     .Lx2apic
>> +
>> +        /* Not x2APIC, read from MMIO */
>> +        mov     0xfee00020, %esp
>> +        shr     $24, %esp
>> +        jmp     1f
>> +
>> +.Lx2apic:
>> +        mov     $(MSR_X2APIC_FIRST + (0x20 >> 4)), %ecx
>> +        rdmsr
>> +        mov     %eax, %esp
>> +1:
>> +
>>           /* Load pagetable base register. */
>>           mov     $sym_offs(idle_pg_table),%eax
>>           add     bootsym_rel(trampoline_xen_phys_start,4,%eax)
>> diff --git a/xen/arch/x86/boot/x86_64.S b/xen/arch/x86/boot/x86_64.S
>> index 04bb62ae8680..b85b47b5c1a0 100644
>> --- a/xen/arch/x86/boot/x86_64.S
>> +++ b/xen/arch/x86/boot/x86_64.S
>> @@ -15,7 +15,33 @@ ENTRY(__high_start)
>>           mov     $XEN_MINIMAL_CR4,%rcx
>>           mov     %rcx,%cr4
>>   -        mov     stack_start(%rip),%rsp
>> +        test    %ebx,%ebx
>> +        cmovz   stack_start(%rip), %rsp
>> +        jz      .L_stack_set
>> +
>> +        /* APs only: get stack base from APIC ID saved in %esp. */
>> +        mov     $-1, %rax
>> +        lea     x86_cpu_to_apicid(%rip), %rcx
> I would consider to move this patch after #2 and #3, so the logic is 
> not modified again. This would help the review.
I agree, maybe even after #4 after that patch is modified according to 
other comments.
>
>> +1:
>> +        add     $1, %rax
>> +        cmp     $NR_CPUS, %eax
>> +        jb      2f
> I think we can get rid of this jump by reworking the loop so %eax is 
> tested as the end of the loop. But this is boot code, so it is 
> possibly not worth it. I will leave the x86 maintainers commenting.
Not sure if I understood "end of the loop" correctly, but if I did, it 
would result in out-of-bounds read. Anyway, this is changed by further 
patches which I still have to reorder, I'll check if final form can be 
improved.
>
>> +        hlt
>> +2:
>> +        cmp     %esp, (%rcx, %rax, 4)
>> +        jne     1b
>> +
>> +        /* %eax is now Xen CPU index. */
>> +        lea     stack_base(%rip), %rcx
>> +        mov     (%rcx, %rax, 8), %rsp
>> +
>> +        test    %rsp,%rsp
>> +        jnz     1f
>> +        hlt
>> +1:
> NIT: Can you use 3? This makes the code easier to read and less prone 
> to error (you have two very close 1).
Ack
>
>> +        add     $(STACK_SIZE - CPUINFO_sizeof), %rsp
>> +
>> +.L_stack_set:
>>             /* Reset EFLAGS (subsumes CLI and CLD). */
>>           pushq   $0
>> diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
>> index a3d3f797bb1e..1285969901e0 100644
>> --- a/xen/arch/x86/setup.c
>> +++ b/xen/arch/x86/setup.c
>> @@ -1951,6 +1951,7 @@ void __init noreturn __start_xen(unsigned long 
>> mbi_p)
>>        */
>>       if ( !pv_shim )
>>       {
>> +        /* Separate loop to make parallel AP bringup possible. */
>
> The loop split seems to be unrelated to this patch. Actually, I was 
> expecting that only the assembly code would be modified.
Fair point, I originally left it here so the code can be bisected if 
needed, but code changed significantly since then. In current form it 
should be safe to do this in the last commit.
>
>>           for_each_present_cpu ( i )
>>           {
>>               /* Set up cpu_to_node[]. */
>> @@ -1958,6 +1959,12 @@ void __init noreturn __start_xen(unsigned long 
>> mbi_p)
>>               /* Set up node_to_cpumask based on cpu_to_node[]. */
>>               numa_add_cpu(i);
>>   +            if ( stack_base[i] == NULL )
>> +                stack_base[i] = cpu_alloc_stack(i);
>
> I don't quite understand this change at least in the context of this 
> patch. AFAICT the stack will be currently allocated in 
> cpu_smpboot_callback() which is called while the CPU is prepared. So 
> you should not need this allocation right now.
>
> Looking at the rest of the series, it seems you allocate the stack 
> earlier so you start the CPU bring-up earlier. But they will still be 
> held in assembly code until cpu_up() is called.
>
> So effectively, part of the C part of the CPUs bring-up is still 
> serialized. Did I understand the logic correctly?
>
> If so, I would suggest to clarify it in the series because this wasn't 
> obvious to me (I was expecting start_secondary() would also run in 
> parallell).

start_secondary() is started in parallel, CPUs are not held in assembly. 
Calling C (almost) always requires stack, and most of this series comes 
to making stack available for all APs at once, just so APs can loop 
early in start_secondary().

You are correct, most of C part is serialized. I tried to make it 
parallel as well but quickly gave up. Some of the notifiers callbacks 
are resistant against any attempts at parallelization, and this set of 
patches already gave satisfactory improvements in boot time (and was 
enough to go through peculiar SMP bring-up used by Intel TXT dynamic 
launch, which is the reason why I had to do it in the first place).

>
> Regarding the change in setup.c, I think it would make more sense in 
> patch #9.
>
> Cheers,
>
Best regards,
Krystian Hebel March 12, 2024, 3:11 p.m. UTC | #4
On 7.02.2024 17:11, Jan Beulich wrote:
> On 14.11.2023 18:49, Krystian Hebel wrote:
>> --- a/xen/arch/x86/boot/trampoline.S
>> +++ b/xen/arch/x86/boot/trampoline.S
>> @@ -72,6 +72,26 @@ trampoline_protmode_entry:
>>           mov     $X86_CR4_PAE,%ecx
>>           mov     %ecx,%cr4
>>   
>> +        /*
>> +         * Get APIC ID while we're in non-paged mode. Start by checking if
>> +         * x2APIC is enabled.
>> +         */
>> +        mov     $MSR_APIC_BASE, %ecx
>> +        rdmsr
>> +        and     $APIC_BASE_EXTD, %eax
> You don't use the result, in which case "test" is to be preferred over
> "and".
Ack
>
>> +        jnz     .Lx2apic
>> +
>> +        /* Not x2APIC, read from MMIO */
>> +        mov     0xfee00020, %esp
> Please don't open-code existing constants (APIC_ID here and below,
> APIC_DEFAULT_PHYS_BASE just here, and ...
>
>> +        shr     $24, %esp
> ... a to-be-introduced constant here (for {G,S}ET_xAPIC_ID() to use as
> well then). This is the only way of being able to easily identify all
> pieces of code accessing the same piece of hardware.

Yes, this was also caught in review done by Qubes OS team.

New constant and {G,S}ET_xAPIC_ID() should be in separate patch, I presume?

>> +        jmp     1f
>> +
>> +.Lx2apic:
>> +        mov     $(MSR_X2APIC_FIRST + (0x20 >> 4)), %ecx
>> +        rdmsr
>> +        mov     %eax, %esp
>> +1:
> Overall I'm worried of the use of %esp throughout here.
>
>> --- a/xen/arch/x86/boot/x86_64.S
>> +++ b/xen/arch/x86/boot/x86_64.S
>> @@ -15,7 +15,33 @@ ENTRY(__high_start)
>>           mov     $XEN_MINIMAL_CR4,%rcx
>>           mov     %rcx,%cr4
>>   
>> -        mov     stack_start(%rip),%rsp
>> +        test    %ebx,%ebx
> Nit (style): Elsewhere you have blanks after the commas, just here
> (and once again near the end of the hunk) you don't.
Is either style preferred?This file has both.
>> +        cmovz   stack_start(%rip), %rsp
>> +        jz      .L_stack_set
>> +
>> +        /* APs only: get stack base from APIC ID saved in %esp. */
>> +        mov     $-1, %rax
> Why a 64-bit insn here and ...
>
>> +        lea     x86_cpu_to_apicid(%rip), %rcx
>> +1:
>> +        add     $1, %rax
> ... here, when you only use (far less than) 32-bit values?
Fair question, no idea what I had in mind, will change.
>> +        cmp     $NR_CPUS, %eax
>> +        jb      2f
>> +        hlt
>> +2:
>> +        cmp     %esp, (%rcx, %rax, 4)
>> +        jne     1b
> Aren't you open-coding REPNE SCASD here?
Looks like I am, I missed that. I'm not sure if this will still apply after
changes from further patches, but I'll keep that in mind.
>
>> +        /* %eax is now Xen CPU index. */
>> +        lea     stack_base(%rip), %rcx
>> +        mov     (%rcx, %rax, 8), %rsp
>> +
>> +        test    %rsp,%rsp
>> +        jnz     1f
>> +        hlt
>> +1:
>> +        add     $(STACK_SIZE - CPUINFO_sizeof), %rsp
> Even this post-adjustment is worrying me. Imo the stack pointer is
> better set exactly once, to its final value. Keeping it at its init
> value of 0 until then yields more predictable results in case it
> ends up being used somewhere.
It really shouldn't be used anywhere, but I'll change it.
>> --- a/xen/arch/x86/setup.c
>> +++ b/xen/arch/x86/setup.c
>> @@ -1951,6 +1951,7 @@ void __init noreturn __start_xen(unsigned long mbi_p)
>>        */
>>       if ( !pv_shim )
>>       {
>> +        /* Separate loop to make parallel AP bringup possible. */
>>           for_each_present_cpu ( i )
>>           {
>>               /* Set up cpu_to_node[]. */
>> @@ -1958,6 +1959,12 @@ void __init noreturn __start_xen(unsigned long mbi_p)
>>               /* Set up node_to_cpumask based on cpu_to_node[]. */
>>               numa_add_cpu(i);
>>   
>> +            if ( stack_base[i] == NULL )
>> +                stack_base[i] = cpu_alloc_stack(i);
>> +        }
> Imo this wants accompanying by removal of the allocation in
> cpu_smpboot_alloc(). Which would then make more visible that there's
> error checking there, but not here (I realize there effectively is
> error checking in assembly code, but that'll end in HLT with no
> useful indication of what the problem is). Provided, as Julien has
> pointed out, that the change is necessary in the first place.

The allocation in cpu_smpboot_alloc() was left for hot-plug. This loops
over present CPUs, not possible ones.

>
> Jan
Best regards,
Jan Beulich March 12, 2024, 3:40 p.m. UTC | #5
On 12.03.2024 16:11, Krystian Hebel wrote:
> On 7.02.2024 17:11, Jan Beulich wrote:
>> On 14.11.2023 18:49, Krystian Hebel wrote:
>>> --- a/xen/arch/x86/boot/trampoline.S
>>> +++ b/xen/arch/x86/boot/trampoline.S
>>> +        /* Not x2APIC, read from MMIO */
>>> +        mov     0xfee00020, %esp
>> Please don't open-code existing constants (APIC_ID here and below,
>> APIC_DEFAULT_PHYS_BASE just here, and ...
>>
>>> +        shr     $24, %esp
>> ... a to-be-introduced constant here (for {G,S}ET_xAPIC_ID() to use as
>> well then). This is the only way of being able to easily identify all
>> pieces of code accessing the same piece of hardware.
> 
> Yes, this was also caught in review done by Qubes OS team.
> 
> New constant and {G,S}ET_xAPIC_ID() should be in separate patch, I presume?

Preferably, yes.

>>> --- a/xen/arch/x86/boot/x86_64.S
>>> +++ b/xen/arch/x86/boot/x86_64.S
>>> @@ -15,7 +15,33 @@ ENTRY(__high_start)
>>>           mov     $XEN_MINIMAL_CR4,%rcx
>>>           mov     %rcx,%cr4
>>>   
>>> -        mov     stack_start(%rip),%rsp
>>> +        test    %ebx,%ebx
>> Nit (style): Elsewhere you have blanks after the commas, just here
>> (and once again near the end of the hunk) you don't.
> Is either style preferred?This file has both.

Conversion takes time, so in new code we aim at having those blanks.
Over time we hope to have them nearly everywhere, at which point it
may make sense to to a final cleanup sweep.

>>> --- a/xen/arch/x86/setup.c
>>> +++ b/xen/arch/x86/setup.c
>>> @@ -1951,6 +1951,7 @@ void __init noreturn __start_xen(unsigned long mbi_p)
>>>        */
>>>       if ( !pv_shim )
>>>       {
>>> +        /* Separate loop to make parallel AP bringup possible. */
>>>           for_each_present_cpu ( i )
>>>           {
>>>               /* Set up cpu_to_node[]. */
>>> @@ -1958,6 +1959,12 @@ void __init noreturn __start_xen(unsigned long mbi_p)
>>>               /* Set up node_to_cpumask based on cpu_to_node[]. */
>>>               numa_add_cpu(i);
>>>   
>>> +            if ( stack_base[i] == NULL )
>>> +                stack_base[i] = cpu_alloc_stack(i);
>>> +        }
>> Imo this wants accompanying by removal of the allocation in
>> cpu_smpboot_alloc(). Which would then make more visible that there's
>> error checking there, but not here (I realize there effectively is
>> error checking in assembly code, but that'll end in HLT with no
>> useful indication of what the problem is). Provided, as Julien has
>> pointed out, that the change is necessary in the first place.
> 
> The allocation in cpu_smpboot_alloc() was left for hot-plug. This loops
> over present CPUs, not possible ones.

Ah, right. Yet better error checking / reporting is going to be needed
anyway.

Jan
diff mbox series

Patch

diff --git a/xen/arch/x86/boot/trampoline.S b/xen/arch/x86/boot/trampoline.S
index b8ab0ffdcbb0..ec254125016d 100644
--- a/xen/arch/x86/boot/trampoline.S
+++ b/xen/arch/x86/boot/trampoline.S
@@ -72,6 +72,26 @@  trampoline_protmode_entry:
         mov     $X86_CR4_PAE,%ecx
         mov     %ecx,%cr4
 
+        /*
+         * Get APIC ID while we're in non-paged mode. Start by checking if
+         * x2APIC is enabled.
+         */
+        mov     $MSR_APIC_BASE, %ecx
+        rdmsr
+        and     $APIC_BASE_EXTD, %eax
+        jnz     .Lx2apic
+
+        /* Not x2APIC, read from MMIO */
+        mov     0xfee00020, %esp
+        shr     $24, %esp
+        jmp     1f
+
+.Lx2apic:
+        mov     $(MSR_X2APIC_FIRST + (0x20 >> 4)), %ecx
+        rdmsr
+        mov     %eax, %esp
+1:
+
         /* Load pagetable base register. */
         mov     $sym_offs(idle_pg_table),%eax
         add     bootsym_rel(trampoline_xen_phys_start,4,%eax)
diff --git a/xen/arch/x86/boot/x86_64.S b/xen/arch/x86/boot/x86_64.S
index 04bb62ae8680..b85b47b5c1a0 100644
--- a/xen/arch/x86/boot/x86_64.S
+++ b/xen/arch/x86/boot/x86_64.S
@@ -15,7 +15,33 @@  ENTRY(__high_start)
         mov     $XEN_MINIMAL_CR4,%rcx
         mov     %rcx,%cr4
 
-        mov     stack_start(%rip),%rsp
+        test    %ebx,%ebx
+        cmovz   stack_start(%rip), %rsp
+        jz      .L_stack_set
+
+        /* APs only: get stack base from APIC ID saved in %esp. */
+        mov     $-1, %rax
+        lea     x86_cpu_to_apicid(%rip), %rcx
+1:
+        add     $1, %rax
+        cmp     $NR_CPUS, %eax
+        jb      2f
+        hlt
+2:
+        cmp     %esp, (%rcx, %rax, 4)
+        jne     1b
+
+        /* %eax is now Xen CPU index. */
+        lea     stack_base(%rip), %rcx
+        mov     (%rcx, %rax, 8), %rsp
+
+        test    %rsp,%rsp
+        jnz     1f
+        hlt
+1:
+        add     $(STACK_SIZE - CPUINFO_sizeof), %rsp
+
+.L_stack_set:
 
         /* Reset EFLAGS (subsumes CLI and CLD). */
         pushq   $0
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index a3d3f797bb1e..1285969901e0 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -1951,6 +1951,7 @@  void __init noreturn __start_xen(unsigned long mbi_p)
      */
     if ( !pv_shim )
     {
+        /* Separate loop to make parallel AP bringup possible. */
         for_each_present_cpu ( i )
         {
             /* Set up cpu_to_node[]. */
@@ -1958,6 +1959,12 @@  void __init noreturn __start_xen(unsigned long mbi_p)
             /* Set up node_to_cpumask based on cpu_to_node[]. */
             numa_add_cpu(i);
 
+            if ( stack_base[i] == NULL )
+                stack_base[i] = cpu_alloc_stack(i);
+        }
+
+        for_each_present_cpu ( i )
+        {
             if ( (park_offline_cpus || num_online_cpus() < max_cpus) &&
                  !cpu_online(i) )
             {