diff mbox series

[XEN,8/9] x86/smp: make cpu_state per-CPU

Message ID 52083114d4cbbc75f021e8c61763ad0e166cf05b.1699982111.git.krystian.hebel@3mdeb.com (mailing list archive)
State New, archived
Headers show
Series x86: parallelize AP bring-up during boot | expand

Commit Message

Krystian Hebel Nov. 14, 2023, 5:50 p.m. UTC
This will be used for parallel AP bring-up.

CPU_STATE_INIT changed direction. It was previously set by BSP and never
consumed by AP. Now it signals that AP got through assembly part of
initialization and waits for BSP to call notifiers that set up data
structures required for further initialization.

Signed-off-by: Krystian Hebel <krystian.hebel@3mdeb.com>
---
 xen/arch/x86/include/asm/cpufeature.h |  1 +
 xen/arch/x86/smpboot.c                | 80 ++++++++++++++++-----------
 2 files changed, 49 insertions(+), 32 deletions(-)

Comments

Jan Beulich Feb. 8, 2024, 12:13 p.m. UTC | #1
On 14.11.2023 18:50, Krystian Hebel wrote:
> This will be used for parallel AP bring-up.
> 
> CPU_STATE_INIT changed direction.

Nit: I think you mean "changes" as you describe what the patch does, not
what has happened before. But ...

> It was previously set by BSP and never
> consumed by AP. Now it signals that AP got through assembly part of
> initialization and waits for BSP to call notifiers that set up data
> structures required for further initialization.

... all of this is, afaict, independent of what the title says the
purpose of this patch is. Since the correctness of the state change
adjustments doesn't look straightforward to prove, please split the
mechanical change from the change to the actual logic.

> --- a/xen/arch/x86/include/asm/cpufeature.h
> +++ b/xen/arch/x86/include/asm/cpufeature.h
> @@ -38,6 +38,7 @@ struct cpuinfo_x86 {
>      unsigned int cpu_core_id;          /* core ID of each logical CPU */
>      unsigned int compute_unit_id;      /* AMD compute unit ID of each logical CPU */
>      void *stack_base;
> +    unsigned int cpu_state;
>      unsigned short x86_clflush_size;
>  } __cacheline_aligned;

Is there any reason this cannot be ordinary per-CPU data?

> --- a/xen/arch/x86/smpboot.c
> +++ b/xen/arch/x86/smpboot.c
> @@ -65,15 +65,18 @@ struct cpuinfo_x86 cpu_data[NR_CPUS] =
>          { [0 ... NR_CPUS-1] .apicid = BAD_APICID };
>  
>  static int cpu_error;
> -static enum cpu_state {
> +enum cpu_state {
>      CPU_STATE_DYING,    /* slave -> master: I am dying */
>      CPU_STATE_DEAD,     /* slave -> master: I am completely dead */
> -    CPU_STATE_INIT,     /* master -> slave: Early bringup phase 1 */
> -    CPU_STATE_CALLOUT,  /* master -> slave: Early bringup phase 2 */
> +    CPU_STATE_INIT,     /* slave -> master: Early bringup phase 1 completed */
> +    CPU_STATE_CALLOUT,  /* master -> slave: Start early bringup phase 2 */

It's not really clear to me whether the adding of "Start" on the 2nd line
really adds value.

>      CPU_STATE_CALLIN,   /* slave -> master: Completed phase 2 */
>      CPU_STATE_ONLINE    /* master -> slave: Go fully online now. */
> -} cpu_state;
> -#define set_cpu_state(state) do { smp_mb(); cpu_state = (state); } while (0)
> +};
> +#define set_cpu_state(cpu, state) do { \
> +    smp_mb(); \
> +    cpu_data[cpu].cpu_state = (state); \
> +} while (0)

While you merely re-arrange it, I'd still like to ask: Does this really
need to be smp_mb(), not just smp_wmb()?

> @@ -320,6 +317,10 @@ void start_secondary(unsigned int cpu)
>  
>      /* Critical region without IDT or TSS.  Any fault is deadly! */
>  
> +    /* Wait until data set up by CPU_UP_PREPARE notifiers is ready. */
> +    while ( cpu_data[cpu].cpu_state != CPU_STATE_CALLOUT )
> +        cpu_relax();

I'm afraid I don't understand the comment (and hence whether this loop
is actually needed here): __cpu_up() is called only after those
notifiers completed.

> @@ -1161,6 +1171,12 @@ void __init smp_prepare_cpus(void)
>      cpu_data[0].stack_base = (void *)
>               ((unsigned long)stack_start & ~(STACK_SIZE - 1));
>  
> +    /* Set state as CALLOUT so APs won't change it in initialize_cpu_data() */
> +    boot_cpu_data.cpu_state = CPU_STATE_CALLOUT;

This is actually one of the reasons I don't like you putting the item
as a new field in struct cpuinfo_x86. Otherwise imo initialize_cpu_data()
ought to gain a respective assertion.

Jan
Krystian Hebel March 12, 2024, 4:38 p.m. UTC | #2
On 8.02.2024 13:13, Jan Beulich wrote:
> On 14.11.2023 18:50, Krystian Hebel wrote:
>> This will be used for parallel AP bring-up.
>>
>> CPU_STATE_INIT changed direction.
> Nit: I think you mean "changes" as you describe what the patch does, not
> what has happened before. But ...
>
>> It was previously set by BSP and never
>> consumed by AP. Now it signals that AP got through assembly part of
>> initialization and waits for BSP to call notifiers that set up data
>> structures required for further initialization.
> ... all of this is, afaict, independent of what the title says the
> purpose of this patch is. Since the correctness of the state change
> adjustments doesn't look straightforward to prove, please split the
> mechanical change from the change to the actual logic.
Ack
>
>> --- a/xen/arch/x86/include/asm/cpufeature.h
>> +++ b/xen/arch/x86/include/asm/cpufeature.h
>> @@ -38,6 +38,7 @@ struct cpuinfo_x86 {
>>       unsigned int cpu_core_id;          /* core ID of each logical CPU */
>>       unsigned int compute_unit_id;      /* AMD compute unit ID of each logical CPU */
>>       void *stack_base;
>> +    unsigned int cpu_state;
>>       unsigned short x86_clflush_size;
>>   } __cacheline_aligned;
> Is there any reason this cannot be ordinary per-CPU data?
Probably not, will move it away.
>
>> --- a/xen/arch/x86/smpboot.c
>> +++ b/xen/arch/x86/smpboot.c
>> @@ -65,15 +65,18 @@ struct cpuinfo_x86 cpu_data[NR_CPUS] =
>>           { [0 ... NR_CPUS-1] .apicid = BAD_APICID };
>>   
>>   static int cpu_error;
>> -static enum cpu_state {
>> +enum cpu_state {
>>       CPU_STATE_DYING,    /* slave -> master: I am dying */
>>       CPU_STATE_DEAD,     /* slave -> master: I am completely dead */
>> -    CPU_STATE_INIT,     /* master -> slave: Early bringup phase 1 */
>> -    CPU_STATE_CALLOUT,  /* master -> slave: Early bringup phase 2 */
>> +    CPU_STATE_INIT,     /* slave -> master: Early bringup phase 1 completed */
>> +    CPU_STATE_CALLOUT,  /* master -> slave: Start early bringup phase 2 */
> It's not really clear to me whether the adding of "Start" on the 2nd line
> really adds value.
Ack
>
>>       CPU_STATE_CALLIN,   /* slave -> master: Completed phase 2 */
>>       CPU_STATE_ONLINE    /* master -> slave: Go fully online now. */
>> -} cpu_state;
>> -#define set_cpu_state(state) do { smp_mb(); cpu_state = (state); } while (0)
>> +};
>> +#define set_cpu_state(cpu, state) do { \
>> +    smp_mb(); \
>> +    cpu_data[cpu].cpu_state = (state); \
>> +} while (0)
> While you merely re-arrange it, I'd still like to ask: Does this really
> need to be smp_mb(), not just smp_wmb()?
Probably not, but I didn't want to change it, assuming there was a reason
that it used smp_mb() in the first place.
>
>> @@ -320,6 +317,10 @@ void start_secondary(unsigned int cpu)
>>   
>>       /* Critical region without IDT or TSS.  Any fault is deadly! */
>>   
>> +    /* Wait until data set up by CPU_UP_PREPARE notifiers is ready. */
>> +    while ( cpu_data[cpu].cpu_state != CPU_STATE_CALLOUT )
>> +        cpu_relax();
> I'm afraid I don't understand the comment (and hence whether this loop
> is actually needed here): __cpu_up() is called only after those
> notifiers completed.
Yes, but broadcasted INIT-SIPI-SIPI sequence added in next patch will be
sent before that call is made, and consequently APs potentially can get
to this point before that data is set up.
>
>> @@ -1161,6 +1171,12 @@ void __init smp_prepare_cpus(void)
>>       cpu_data[0].stack_base = (void *)
>>                ((unsigned long)stack_start & ~(STACK_SIZE - 1));
>>   
>> +    /* Set state as CALLOUT so APs won't change it in initialize_cpu_data() */
>> +    boot_cpu_data.cpu_state = CPU_STATE_CALLOUT;
> This is actually one of the reasons I don't like you putting the item
> as a new field in struct cpuinfo_x86. Otherwise imo initialize_cpu_data()
> ought to gain a respective assertion.
I'll move it out.
>
> Jan
Best regards,
Jan Beulich March 13, 2024, 1:21 p.m. UTC | #3
On 12.03.2024 17:38, Krystian Hebel wrote:
> On 8.02.2024 13:13, Jan Beulich wrote:
>> On 14.11.2023 18:50, Krystian Hebel wrote:
>>> @@ -320,6 +317,10 @@ void start_secondary(unsigned int cpu)
>>>   
>>>       /* Critical region without IDT or TSS.  Any fault is deadly! */
>>>   
>>> +    /* Wait until data set up by CPU_UP_PREPARE notifiers is ready. */
>>> +    while ( cpu_data[cpu].cpu_state != CPU_STATE_CALLOUT )
>>> +        cpu_relax();
>> I'm afraid I don't understand the comment (and hence whether this loop
>> is actually needed here): __cpu_up() is called only after those
>> notifiers completed.
> Yes, but broadcasted INIT-SIPI-SIPI sequence added in next patch will be
> sent before that call is made, and consequently APs potentially can get
> to this point before that data is set up.

That's fine, and I was able to conclude this once having read that following
patch. But the patch here, including its description, wants to be self-
contained.

Jan
diff mbox series

Patch

diff --git a/xen/arch/x86/include/asm/cpufeature.h b/xen/arch/x86/include/asm/cpufeature.h
index ff0e18864cc7..1831b5fb368f 100644
--- a/xen/arch/x86/include/asm/cpufeature.h
+++ b/xen/arch/x86/include/asm/cpufeature.h
@@ -38,6 +38,7 @@  struct cpuinfo_x86 {
     unsigned int cpu_core_id;          /* core ID of each logical CPU */
     unsigned int compute_unit_id;      /* AMD compute unit ID of each logical CPU */
     void *stack_base;
+    unsigned int cpu_state;
     unsigned short x86_clflush_size;
 } __cacheline_aligned;
 
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index 39ffd356dbbc..cbea2d45f70d 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -65,15 +65,18 @@  struct cpuinfo_x86 cpu_data[NR_CPUS] =
         { [0 ... NR_CPUS-1] .apicid = BAD_APICID };
 
 static int cpu_error;
-static enum cpu_state {
+enum cpu_state {
     CPU_STATE_DYING,    /* slave -> master: I am dying */
     CPU_STATE_DEAD,     /* slave -> master: I am completely dead */
-    CPU_STATE_INIT,     /* master -> slave: Early bringup phase 1 */
-    CPU_STATE_CALLOUT,  /* master -> slave: Early bringup phase 2 */
+    CPU_STATE_INIT,     /* slave -> master: Early bringup phase 1 completed */
+    CPU_STATE_CALLOUT,  /* master -> slave: Start early bringup phase 2 */
     CPU_STATE_CALLIN,   /* slave -> master: Completed phase 2 */
     CPU_STATE_ONLINE    /* master -> slave: Go fully online now. */
-} cpu_state;
-#define set_cpu_state(state) do { smp_mb(); cpu_state = (state); } while (0)
+};
+#define set_cpu_state(cpu, state) do { \
+    smp_mb(); \
+    cpu_data[cpu].cpu_state = (state); \
+} while (0)
 
 void initialize_cpu_data(unsigned int cpu)
 {
@@ -168,16 +171,7 @@  static void synchronize_tsc_slave(unsigned int slave)
 static void smp_callin(void)
 {
     unsigned int cpu = smp_processor_id();
-    int i, rc;
-
-    /* Wait 2s total for startup. */
-    Dprintk("Waiting for CALLOUT.\n");
-    for ( i = 0; cpu_state != CPU_STATE_CALLOUT; i++ )
-    {
-        BUG_ON(i >= 200);
-        cpu_relax();
-        mdelay(10);
-    }
+    int rc;
 
     /*
      * The boot CPU has finished the init stage and is spinning on cpu_state
@@ -213,12 +207,12 @@  static void smp_callin(void)
     }
 
     /* Allow the master to continue. */
-    set_cpu_state(CPU_STATE_CALLIN);
+    set_cpu_state(cpu, CPU_STATE_CALLIN);
 
     synchronize_tsc_slave(cpu);
 
     /* And wait for our final Ack. */
-    while ( cpu_state != CPU_STATE_ONLINE )
+    while ( cpu_data[cpu].cpu_state != CPU_STATE_ONLINE )
         cpu_relax();
 }
 
@@ -313,6 +307,9 @@  void start_secondary(unsigned int cpu)
 {
     struct cpu_info *info = get_cpu_info();
 
+    /* Tell BSP that we are awake. */
+    set_cpu_state(cpu, CPU_STATE_INIT);
+
     /*
      * Don't put anything before smp_callin(), SMP booting is so fragile that we
      * want to limit the things done here to the most necessary things.
@@ -320,6 +317,10 @@  void start_secondary(unsigned int cpu)
 
     /* Critical region without IDT or TSS.  Any fault is deadly! */
 
+    /* Wait until data set up by CPU_UP_PREPARE notifiers is ready. */
+    while ( cpu_data[cpu].cpu_state != CPU_STATE_CALLOUT )
+        cpu_relax();
+
     set_current(idle_vcpu[cpu]);
     this_cpu(curr_vcpu) = idle_vcpu[cpu];
     rdmsrl(MSR_EFER, this_cpu(efer));
@@ -585,26 +586,35 @@  static int do_boot_cpu(int apicid, int cpu)
 
     /* This grunge runs the startup process for the targeted processor. */
 
-    set_cpu_state(CPU_STATE_INIT);
-
     /* Starting actual IPI sequence... */
     boot_error = wakeup_secondary_cpu(apicid, start_eip);
 
     if ( !boot_error )
     {
-        /* Allow AP to start initializing. */
-        set_cpu_state(CPU_STATE_CALLOUT);
-        Dprintk("After Callout %d.\n", cpu);
-
-        /* Wait 5s total for a response. */
-        for ( timeout = 0; timeout < 50000; timeout++ )
+        /* Wait 2s total for a response. */
+        for ( timeout = 0; timeout < 20000; timeout++ )
         {
-            if ( cpu_state != CPU_STATE_CALLOUT )
+            if ( cpu_data[cpu].cpu_state == CPU_STATE_INIT )
                 break;
             udelay(100);
         }
 
-        if ( cpu_state == CPU_STATE_CALLIN )
+        if ( cpu_data[cpu].cpu_state == CPU_STATE_INIT )
+        {
+            /* Allow AP to start initializing. */
+            set_cpu_state(cpu, CPU_STATE_CALLOUT);
+            Dprintk("After Callout %d.\n", cpu);
+
+            /* Wait 5s total for a response. */
+            for ( timeout = 0; timeout < 500000; timeout++ )
+            {
+                if ( cpu_data[cpu].cpu_state != CPU_STATE_CALLOUT )
+                    break;
+                udelay(10);
+            }
+        }
+
+        if ( cpu_data[cpu].cpu_state == CPU_STATE_CALLIN )
         {
             /* number CPUs logically, starting from 1 (BSP is 0) */
             Dprintk("OK.\n");
@@ -612,7 +622,7 @@  static int do_boot_cpu(int apicid, int cpu)
             synchronize_tsc_master(cpu);
             Dprintk("CPU has booted.\n");
         }
-        else if ( cpu_state == CPU_STATE_DEAD )
+        else if ( cpu_data[cpu].cpu_state == CPU_STATE_DEAD )
         {
             smp_rmb();
             rc = cpu_error;
@@ -683,7 +693,7 @@  unsigned long alloc_stub_page(unsigned int cpu, unsigned long *mfn)
 void cpu_exit_clear(unsigned int cpu)
 {
     cpu_uninit(cpu);
-    set_cpu_state(CPU_STATE_DEAD);
+    set_cpu_state(cpu, CPU_STATE_DEAD);
 }
 
 static int clone_mapping(const void *ptr, root_pgentry_t *rpt)
@@ -1161,6 +1171,12 @@  void __init smp_prepare_cpus(void)
     cpu_data[0].stack_base = (void *)
              ((unsigned long)stack_start & ~(STACK_SIZE - 1));
 
+    /* Set state as CALLOUT so APs won't change it in initialize_cpu_data() */
+    boot_cpu_data.cpu_state = CPU_STATE_CALLOUT;
+
+    /* Not really used anywhere, but set it just in case. */
+    set_cpu_state(0, CPU_STATE_ONLINE);
+
     set_nr_sockets();
 
     socket_cpumask = xzalloc_array(cpumask_t *, nr_sockets);
@@ -1267,7 +1283,7 @@  void __cpu_disable(void)
 {
     int cpu = smp_processor_id();
 
-    set_cpu_state(CPU_STATE_DYING);
+    set_cpu_state(cpu, CPU_STATE_DYING);
 
     local_irq_disable();
     clear_local_APIC();
@@ -1292,7 +1308,7 @@  void __cpu_die(unsigned int cpu)
     unsigned int i = 0;
     enum cpu_state seen_state;
 
-    while ( (seen_state = cpu_state) != CPU_STATE_DEAD )
+    while ( (seen_state = cpu_data[cpu].cpu_state) != CPU_STATE_DEAD )
     {
         BUG_ON(seen_state != CPU_STATE_DYING);
         mdelay(100);
@@ -1393,7 +1409,7 @@  int __cpu_up(unsigned int cpu)
 
     time_latch_stamps();
 
-    set_cpu_state(CPU_STATE_ONLINE);
+    set_cpu_state(cpu, CPU_STATE_ONLINE);
     while ( !cpu_online(cpu) )
     {
         cpu_relax();