diff mbox series

[v6,01/11] x86/apic/x2apic: Fix parallel handling of cluster_mask

Message ID 20230202215625.3248306-2-usama.arif@bytedance.com (mailing list archive)
State Superseded
Headers show
Series Parallel CPU bringup for x86_64 | expand

Commit Message

Usama Arif Feb. 2, 2023, 9:56 p.m. UTC
From: David Woodhouse <dwmw@amazon.co.uk>

For each CPU being brought up, the alloc_clustermask() function
allocates a new struct cluster_mask just in case it's needed. Then the
target CPU actually runs, and in init_x2apic_ldr() it either uses a
cluster_mask from a previous CPU in the same cluster, or consumes the
"spare" one and sets the global pointer to NULL.

That isn't going to parallelise stunningly well.

Ditch the global variable, let alloc_clustermask() install the struct
*directly* in the per_cpu data for the CPU being brought up. As an
optimisation, actually make it do so for *all* present CPUs in the same
cluster, which means only one iteration over for_each_present_cpu()
instead of doing so repeatedly, once for each CPU.

Now, in fact, there's no point in the 'node' or 'clusterid' members of
the struct cluster_mask, so just kill it and use struct cpumask instead.

This was a harmless "bug" while CPU bringup wasn't actually happening in
parallel. It's about to become less harmless...

Fixes: 023a611748fd5 ("x86/apic/x2apic: Simplify cluster management")
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 arch/x86/kernel/apic/x2apic_cluster.c | 108 +++++++++++++++-----------
 1 file changed, 62 insertions(+), 46 deletions(-)

Comments

Thomas Gleixner Feb. 6, 2023, 11:20 p.m. UTC | #1
Usama!

On Thu, Feb 02 2023 at 21:56, Usama Arif wrote:
> For each CPU being brought up, the alloc_clustermask() function
> allocates a new struct cluster_mask just in case it's needed. Then the
> target CPU actually runs, and in init_x2apic_ldr() it either uses a
> cluster_mask from a previous CPU in the same cluster, or consumes the
> "spare" one and sets the global pointer to NULL.
>
> That isn't going to parallelise stunningly well.
>
> Ditch the global variable, let alloc_clustermask() install the struct
> *directly* in the per_cpu data for the CPU being brought up. As an
> optimisation, actually make it do so for *all* present CPUs in the same
> cluster, which means only one iteration over for_each_present_cpu()
> instead of doing so repeatedly, once for each CPU.
>
> Now, in fact, there's no point in the 'node' or 'clusterid' members of
> the struct cluster_mask, so just kill it and use struct cpumask instead.
>
> This was a harmless "bug" while CPU bringup wasn't actually happening in
> parallel. It's about to become less harmless...

Just to be clear. There is no bug in todays code and therefore this:

> Fixes: 023a611748fd5 ("x86/apic/x2apic: Simplify cluster management")

tag is unjustified. It'll just cause the stable robots to backport it
for no reason.

> Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
> Signed-off-by: Paul E. McKenney <paulmck@kernel.org>

How is this SOB chain correct? It's unclear to me how Paul got involved
here, but let's assume he handed the patch over to you, then this still
lacks a SOB from you.

> +/*
> + * As an optimisation during boot, set the cluster_mask for *all*
> + * present CPUs at once, to prevent *each* of them having to iterate
> + * over the others to find the existing cluster_mask.
> + */
> +static void prefill_clustermask(struct cpumask *cmsk, u32 cluster)
> +{
> +	int cpu;
> +
> +	for_each_present_cpu(cpu) {
> +		u32 apicid = apic->cpu_present_to_apicid(cpu);

Lacks a newline between declaration and code.

> +		if (apicid != BAD_APICID && apic_cluster(apicid) == cluster) {
> +			struct cpumask **cpu_cmsk = &per_cpu(cluster_masks, cpu);
> +
> +			BUG_ON(*cpu_cmsk && *cpu_cmsk != cmsk);

While I agree that changing an in use mask pointer would be fatal, I
really have to ask why this code would be invoked on a partially
initialized cluster at all and why that would be correct.

                        if (WARN_ON_ONCE(*cpu_cmsk == cmsk))
                        	return;
                        BUG_ON(*cpu_cmsk);

if at all. But of course that falls over with the way how this code is
invoked below.

> +			*cpu_cmsk = cmsk;
> +		}
>  
> -static int alloc_clustermask(unsigned int cpu, int node)
> +static int alloc_clustermask(unsigned int cpu, u32 cluster, int node)
>  {
> +	struct cpumask *cmsk = NULL;
> +	unsigned int cpu_i;
> +	u32 apicid;
> +
>  	if (per_cpu(cluster_masks, cpu))
>  		return 0;
> -	/*
> -	 * If a hotplug spare mask exists, check whether it's on the right
> -	 * node. If not, free it and allocate a new one.
> -	 */
> -	if (cluster_hotplug_mask) {
> -		if (cluster_hotplug_mask->node == node)
> -			return 0;
> -		kfree(cluster_hotplug_mask);
> +
> +	/* For the hotplug case, don't always allocate a new one */

-ENOPARSE

> +	if (system_state >= SYSTEM_RUNNING) {
> +		for_each_present_cpu(cpu_i) {
> +			apicid = apic->cpu_present_to_apicid(cpu_i);
> +			if (apicid != BAD_APICID && apic_cluster(apicid) == cluster) {
> +				cmsk = per_cpu(cluster_masks, cpu_i);
> +				if (cmsk)
> +					break;
> +			}
> +		}
> +	}
> +	if (!cmsk) {
> +		cmsk = kzalloc_node(sizeof(*cmsk), GFP_KERNEL, node);
> +		if (!cmsk)
> +			return -ENOMEM;
>  	}

...
  
> +	per_cpu(cluster_masks, cpu) = cmsk;
> +
> +	if (system_state < SYSTEM_RUNNING)
> +		prefill_clustermask(cmsk, cluster);

TBH. The logic of this code is anything but obvious. Something like the
uncompiled below perhaps?

Thanks,

        tglx
---

@@ -116,44 +109,90 @@
+
+/*
+ * As an optimisation during boot, set the cluster_mask for all present
+ * CPUs at once, to prevent each of them having to iterate over the others
+ * to find the existing cluster_mask.
+ */
+static void prefill_clustermask(struct cpumask *cmsk, unsigned int cpu, u32 cluster)
+{
+	int cpu_i;
 
-	cluster = apicid >> 16;
-	for_each_online_cpu(cpu) {
-		cmsk = per_cpu(cluster_masks, cpu);
-		/* Matching cluster found. Link and update it. */
-		if (cmsk && cmsk->clusterid == cluster)
-			goto update;
+	for_each_present_cpu(cpu_i) {
+		struct cpumask **cpu_cmsk = &per_cpu(cluster_masks, cpu_i);
+		u32 apicid = apic->cpu_present_to_apicid(cpu_i);
+
+		if (apicid == BAD_APICID || cpu_i == cpu || apic_cluster(apicid) != cluster)
+			continue;
+
+		if (WARN_ON_ONCE(*cpu_cmsk == cmsk))
+			continue;
+
+		BUG_ON(*cpu_cmsk);
+		*cpu_cmsk = cmsk;
 	}
-	cmsk = cluster_hotplug_mask;
-	cmsk->clusterid = cluster;
-	cluster_hotplug_mask = NULL;
-update:
-	this_cpu_write(cluster_masks, cmsk);
-	cpumask_set_cpu(smp_processor_id(), &cmsk->mask);
 }
 
-static int alloc_clustermask(unsigned int cpu, int node)
+static int alloc_clustermask(unsigned int cpu, u32 cluster, int node)
 {
+	struct cpumask *cmsk;
+	unsigned int cpu_i;
+
 	if (per_cpu(cluster_masks, cpu))
 		return 0;
+
 	/*
-	 * If a hotplug spare mask exists, check whether it's on the right
-	 * node. If not, free it and allocate a new one.
+	 * At boot time CPU present mask is stable. If the cluster is not
+	 * yet initialized, allocate the mask and propagate it to all
+	 * siblings in this cluster.
 	 */
-	if (cluster_hotplug_mask) {
-		if (cluster_hotplug_mask->node == node)
-			return 0;
-		kfree(cluster_hotplug_mask);
-	}
+	if (system_state < SYSTEM_RUNNING)
+		goto alloc;
+
+	/*
+	 * On post boot hotplug iterate over the present CPUs to handle the
+	 * case of partial clusters as they might be presented by
+	 * virtualization.
+	 */
+	for_each_present_cpu(cpu_i) {
+		u32 apicid = apic->cpu_present_to_apicid(cpu_i);
+
+		if (apicid != BAD_APICID && apic_cluster(apicid) == cluster) {
+			cmsk = per_cpu(cluster_masks, cpu_i);
 
-	cluster_hotplug_mask = kzalloc_node(sizeof(*cluster_hotplug_mask),
-					    GFP_KERNEL, node);
-	if (!cluster_hotplug_mask)
+			/*
+			 * If the cluster is already initialized, just
+			 * store the mask and return. No point in trying to
+			 * propagate.
+			 */
+			if (cmsk) {
+				per_cpu(cluster_masks, cpu) = cmsk;
+				return 0;
+			}
+		}
+	}
+	/*
+	 * The cluster is not initialized yet. Fall through to the boot
+	 * time code which might initialize the whole cluster if it is
+	 * in the CPU present mask.
+	 */
+alloc:
+	cmsk = kzalloc_node(sizeof(*cmsk), GFP_KERNEL, node);
+	if (!cmsk)
 		return -ENOMEM;
-	cluster_hotplug_mask->node = node;
+	per_cpu(cluster_masks, cpu) = cmsk;
+	prefill_clustermask(cmsk, cluster);
+
 	return 0;
 }
David Woodhouse Feb. 7, 2023, 10:57 a.m. UTC | #2
On Tue, 2023-02-07 at 00:20 +0100, Thomas Gleixner wrote:
> 
> 
> TBH. The logic of this code is anything but obvious. Something like the
> uncompiled below perhaps?

Looks sane to me. I'll tweak the comments a bit and give it a spin;
thanks.

...

> +        * At boot time CPU present mask is stable. If the cluster is not
> +        * yet initialized, allocate the mask and propagate it to all
> +        * siblings in this cluster.
>          */
> -       if (cluster_hotplug_mask) {
> -               if (cluster_hotplug_mask->node == node)
> -                       return 0;
> -               kfree(cluster_hotplug_mask);
> -       }
> +       if (system_state < SYSTEM_RUNNING)
> +               goto alloc;
> +
> +       /*
> +        * On post boot hotplug iterate over the present CPUs to handle the
> +        * case of partial clusters as they might be presented by
> +        * virtualization.
> +        */
> +       for_each_present_cpu(cpu_i) {


So... if this CPU was *present* at boot time (and if any other CPU in
this cluster was present), it will already have a cluster_mask.

Which means we get here in two cases: 

 • This CPU wasn't actually present (was just 'possible') at boot time.
   (Is that actually a thing that happens?)

 • This CPU was present but no other CPU in this cluster was actually
   brought up at boot time so the cluster_mask wasn't allocated.

The code looks right, I don't grok the comment about partial clusters
and virtualization, and would have worded it something along the above
lines?
David Woodhouse Feb. 7, 2023, 11:27 a.m. UTC | #3
On Tue, 2023-02-07 at 10:57 +0000, David Woodhouse wrote:
> 
> > +       /*
> > +        * On post boot hotplug iterate over the present CPUs to handle the
> > +        * case of partial clusters as they might be presented by
> > +        * virtualization.
> > +        */
> > +       for_each_present_cpu(cpu_i) {
> 
> 
> So... if this CPU was *present* at boot time (and if any other CPU in
> this cluster was present), it will already have a cluster_mask.
> 
> Which means we get here in two cases: 
> 
>  • This CPU wasn't actually present (was just 'possible') at boot time.
>    (Is that actually a thing that happens?)
> 
>  • This CPU was present but no other CPU in this cluster was actually
>    brought up at boot time so the cluster_mask wasn't allocated.
> 
> The code looks right, I don't grok the comment about partial clusters
> and virtualization, and would have worded it something along the above
> lines?

As I get my head around that, I think the code needs to change too.
What if we *unplug* the only CPU in a cluster (present→possible), then
add a new one in the same cluster? The new one would get a new
cluster_mask. Which is kind of OK for now but then if we re-add the
original CPU it'd continue to use its old cluster_mask.

Now, that's kind of weird if it's physical CPUs because that cluster is
within a given chip, isn't it? But with virtualization maybe that's
something that could happen, and it doesn't hurt to be completely safe
by using for_each_possible_cpu() instead?

Now looks like this:


	/*
	 * On post boot hotplug for a CPU which was not present at boot time,
	 * iterate over all possible CPUs (even those which are not present
	 * any more) to find any existing cluster mask.
	 */
	for_each_possible_cpu(cpu_i) {
Thomas Gleixner Feb. 7, 2023, 2:22 p.m. UTC | #4
David!

On Tue, Feb 07 2023 at 10:57, David Woodhouse wrote:
> On Tue, 2023-02-07 at 00:20 +0100, Thomas Gleixner wrote:
>> +       /*
>> +        * On post boot hotplug iterate over the present CPUs to handle the
>> +        * case of partial clusters as they might be presented by
>> +        * virtualization.
>> +        */
>> +       for_each_present_cpu(cpu_i) {
>
>
> So... if this CPU was *present* at boot time (and if any other CPU in
> this cluster was present), it will already have a cluster_mask.
>
> Which means we get here in two cases: 
>
>  • This CPU wasn't actually present (was just 'possible') at boot time.
>    (Is that actually a thing that happens?)

It happens on systems which support physical hotplug and AFAIK also
virtualization has support for "physical" hotplug.

The same is true the other way round on physical unplug. Then the CPU
is removed from present but is still set in possible.

>  • This CPU was present but no other CPU in this cluster was actually
>    brought up at boot time so the cluster_mask wasn't allocated.

Correct.

> The code looks right, I don't grok the comment about partial clusters
> and virtualization, and would have worded it something along the above
> lines?

My worry was that virtualization might be able to physically hotplug
partial clusters. Whether that's possible I don't know, but in context
of virtualization I always assume the worst case.

Thanks,

        tglx
Thomas Gleixner Feb. 7, 2023, 2:24 p.m. UTC | #5
On Tue, Feb 07 2023 at 11:27, David Woodhouse wrote:
> On Tue, 2023-02-07 at 10:57 +0000, David Woodhouse wrote:
>>  • This CPU was present but no other CPU in this cluster was actually
>>    brought up at boot time so the cluster_mask wasn't allocated.
>> 
>> The code looks right, I don't grok the comment about partial clusters
>> and virtualization, and would have worded it something along the above
>> lines?
>
> As I get my head around that, I think the code needs to change too.
> What if we *unplug* the only CPU in a cluster (present→possible), then
> add a new one in the same cluster? The new one would get a new
> cluster_mask. Which is kind of OK for now but then if we re-add the
> original CPU it'd continue to use its old cluster_mask.

Indeed.

> Now, that's kind of weird if it's physical CPUs because that cluster is
> within a given chip, isn't it? But with virtualization maybe that's
> something that could happen, and it doesn't hurt to be completely safe
> by using for_each_possible_cpu() instead?

Yes. Virtualization does awful things....

> Now looks like this:
> 	/*
> 	 * On post boot hotplug for a CPU which was not present at boot time,
> 	 * iterate over all possible CPUs (even those which are not present
> 	 * any more) to find any existing cluster mask.
> 	 */
> 	for_each_possible_cpu(cpu_i) {

Looks good!

      tglx
David Woodhouse Feb. 7, 2023, 7:53 p.m. UTC | #6
On Tue, 2023-02-07 at 15:24 +0100, Thomas Gleixner wrote:
> On Tue, Feb 07 2023 at 11:27, David Woodhouse wrote:
> > On Tue, 2023-02-07 at 10:57 +0000, David Woodhouse wrote:
> > >  • This CPU was present but no other CPU in this cluster was actually
> > >    brought up at boot time so the cluster_mask wasn't allocated.
> > > 
> > > The code looks right, I don't grok the comment about partial clusters
> > > and virtualization, and would have worded it something along the above
> > > lines?
> > 
> > As I get my head around that, I think the code needs to change too.
> > What if we *unplug* the only CPU in a cluster (present→possible), then
> > add a new one in the same cluster? The new one would get a new
> > cluster_mask. Which is kind of OK for now but then if we re-add the
> > original CPU it'd continue to use its old cluster_mask.
> 
> Indeed.
> 
> > Now, that's kind of weird if it's physical CPUs because that cluster is
> > within a given chip, isn't it? But with virtualization maybe that's
> > something that could happen, and it doesn't hurt to be completely safe
> > by using for_each_possible_cpu() instead?
> 
> Yes. Virtualization does aweful things....
> 
> > Now looks like this:
> >         /*
> >          * On post boot hotplug for a CPU which was not present at boot time,
> >          * iterate over all possible CPUs (even those which are not present
> >          * any more) to find any existing cluster mask.
> >          */
> >         for_each_possible_cpu(cpu_i) {
> 
> Looks good!


Thanks. I've reworked and I think I've caught everything. Didn't want
to elide the credit where Usama had done some of the forward-porting
work, so I've left those notes and the SoB intact on those patches, on
the assumption that they will be reposting the series after proper
testing on hardware again anyway (I'm only spawning it in qemu right
now).

https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/parallel-6.2-rc7

The only real code change other than what we've discussed here is to
implement what we talked about for CPUID 0xb vs. 0x1 etc:


	/*
	 * We can do 64-bit AP bringup in parallel if the CPU reports
	 * its APIC ID in CPUID (either leaf 0x0B if we need the full
	 * APIC ID in X2APIC mode, or leaf 0x01 if 8 bits are
	 * sufficient). Otherwise it's too hard. And not for SEV-ES
	 * guests because they can't use CPUID that early.
	 */
	if (IS_ENABLED(CONFIG_X86_32) || boot_cpu_data.cpuid_level < 1 ||
	    (x2apic_mode && boot_cpu_data.cpuid_level < 0xb) ||
	    cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
		do_parallel_bringup = false;

	if (do_parallel_bringup && x2apic_mode) {
		unsigned int eax, ebx, ecx, edx;

		/*
		 * To support parallel bringup in x2apic mode, the AP will need
		 * to obtain its APIC ID from CPUID 0x0B, since CPUID 0x01 has
		 * only 8 bits. Check that it is present and seems correct.
		 */
		cpuid_count(0xb, 0, &eax, &ebx, &ecx, &edx);

		/*
		 * AMD says that if executed with an unimplemented level in
		 * ECX, then it will return all zeroes in EAX. Intel says it
		 * will return zeroes in both EAX and EBX. Checking only EAX
		 * should be sufficient.
		 */
		if (eax) {
			smpboot_control = STARTUP_SECONDARY | STARTUP_APICID_CPUID_0B;
		} else {
			pr_info("Disabling parallel bringup because CPUID 0xb looks untrustworthy\n");
			do_parallel_bringup = false;
		}
	} else if (do_parallel_bringup) {
		/* Without X2APIC, what's in CPUID 0x01 should suffice. */
		smpboot_control = STARTUP_SECONDARY | STARTUP_APICID_CPUID_01;
	}
Thomas Gleixner Feb. 7, 2023, 8:58 p.m. UTC | #7
On Tue, Feb 07 2023 at 19:53, David Woodhouse wrote:
> On Tue, 2023-02-07 at 15:24 +0100, Thomas Gleixner wrote:
> Thanks. I've reworked and I think I've caught everything. Didn't want
> to elide the credit where Usama had done some of the forward-porting
> work, so I've left those notes and the SoB intact on those patches, on
> the assumption that they will be reposting the series after proper
> testing on hardware again anyway (I'm only spawning it in qemu right
> now).
>
> https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/parallel-6.2-rc7
>
> The only real code change other than what we've discussed here is to
> implement what we talked about for CPUID 0xb vs. 0x1 etc:
>
> 	/*
> 	 * We can do 64-bit AP bringup in parallel if the CPU reports
> 	 * its APIC ID in CPUID (either leaf 0x0B if we need the full
> 	 * APIC ID in X2APIC mode, or leaf 0x01 if 8 bits are
> 	 * sufficient). Otherwise it's too hard. And not for SEV-ES
> 	 * guests because they can't use CPUID that early.
> 	 */
> 	if (IS_ENABLED(CONFIG_X86_32) || boot_cpu_data.cpuid_level < 1 ||
> 	    (x2apic_mode && boot_cpu_data.cpuid_level < 0xb) ||
> 	    cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
> 		do_parallel_bringup = false;
>
> 	if (do_parallel_bringup && x2apic_mode) {
> 		unsigned int eax, ebx, ecx, edx;
>
> 		/*
> 		 * To support parallel bringup in x2apic mode, the AP will need
> 		 * to obtain its APIC ID from CPUID 0x0B, since CPUID 0x01 has
> 		 * only 8 bits. Check that it is present and seems correct.
> 		 */
> 		cpuid_count(0xb, 0, &eax, &ebx, &ecx, &edx);
>
> 		/*
> 		 * AMD says that if executed with an unimplemented level in
> 		 * ECX, then it will return all zeroes in EAX. Intel says it
> 		 * will return zeroes in both EAX and EBX. Checking only EAX
> 		 * should be sufficient.
> 		 */
> 		if (eax) {
> 			smpboot_control = STARTUP_SECONDARY | STARTUP_APICID_CPUID_0B;
> 		} else {
> 			pr_info("Disabling parallel bringup because CPUID 0xb looks untrustworthy\n");
> 			do_parallel_bringup = false;
> 		}
> 	} else if (do_parallel_bringup) {
> 		/* Without X2APIC, what's in CPUID 0x01 should suffice. */
> 		smpboot_control = STARTUP_SECONDARY | STARTUP_APICID_CPUID_01;
> 	}

Looks good to me!

Thanks,

        tglx
diff mbox series

Patch

diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index e696e22d0531..e116dfaf5922 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -9,11 +9,7 @@ 
 
 #include "local.h"
 
-struct cluster_mask {
-	unsigned int	clusterid;
-	int		node;
-	struct cpumask	mask;
-};
+#define apic_cluster(apicid) ((apicid) >> 4)
 
 /*
  * __x2apic_send_IPI_mask() possibly needs to read
@@ -23,8 +19,7 @@  struct cluster_mask {
 static u32 *x86_cpu_to_logical_apicid __read_mostly;
 
 static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
-static DEFINE_PER_CPU_READ_MOSTLY(struct cluster_mask *, cluster_masks);
-static struct cluster_mask *cluster_hotplug_mask;
+static DEFINE_PER_CPU_READ_MOSTLY(struct cpumask *, cluster_masks);
 
 static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
@@ -60,10 +55,10 @@  __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
 
 	/* Collapse cpus in a cluster so a single IPI per cluster is sent */
 	for_each_cpu(cpu, tmpmsk) {
-		struct cluster_mask *cmsk = per_cpu(cluster_masks, cpu);
+		struct cpumask *cmsk = per_cpu(cluster_masks, cpu);
 
 		dest = 0;
-		for_each_cpu_and(clustercpu, tmpmsk, &cmsk->mask)
+		for_each_cpu_and(clustercpu, tmpmsk, cmsk)
 			dest |= x86_cpu_to_logical_apicid[clustercpu];
 
 		if (!dest)
@@ -71,7 +66,7 @@  __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
 
 		__x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL);
 		/* Remove cluster CPUs from tmpmask */
-		cpumask_andnot(tmpmsk, tmpmsk, &cmsk->mask);
+		cpumask_andnot(tmpmsk, tmpmsk, cmsk);
 	}
 
 	local_irq_restore(flags);
@@ -105,55 +100,76 @@  static u32 x2apic_calc_apicid(unsigned int cpu)
 
 static void init_x2apic_ldr(void)
 {
-	struct cluster_mask *cmsk = this_cpu_read(cluster_masks);
-	u32 cluster, apicid = apic_read(APIC_LDR);
-	unsigned int cpu;
+	struct cpumask *cmsk = this_cpu_read(cluster_masks);
 
-	x86_cpu_to_logical_apicid[smp_processor_id()] = apicid;
+	BUG_ON(!cmsk);
 
-	if (cmsk)
-		goto update;
-
-	cluster = apicid >> 16;
-	for_each_online_cpu(cpu) {
-		cmsk = per_cpu(cluster_masks, cpu);
-		/* Matching cluster found. Link and update it. */
-		if (cmsk && cmsk->clusterid == cluster)
-			goto update;
+	cpumask_set_cpu(smp_processor_id(), cmsk);
+}
+
+/*
+ * As an optimisation during boot, set the cluster_mask for *all*
+ * present CPUs at once, to prevent *each* of them having to iterate
+ * over the others to find the existing cluster_mask.
+ */
+static void prefill_clustermask(struct cpumask *cmsk, u32 cluster)
+{
+	int cpu;
+
+	for_each_present_cpu(cpu) {
+		u32 apicid = apic->cpu_present_to_apicid(cpu);
+		if (apicid != BAD_APICID && apic_cluster(apicid) == cluster) {
+			struct cpumask **cpu_cmsk = &per_cpu(cluster_masks, cpu);
+
+			BUG_ON(*cpu_cmsk && *cpu_cmsk != cmsk);
+			*cpu_cmsk = cmsk;
+		}
 	}
-	cmsk = cluster_hotplug_mask;
-	cmsk->clusterid = cluster;
-	cluster_hotplug_mask = NULL;
-update:
-	this_cpu_write(cluster_masks, cmsk);
-	cpumask_set_cpu(smp_processor_id(), &cmsk->mask);
 }
 
-static int alloc_clustermask(unsigned int cpu, int node)
+static int alloc_clustermask(unsigned int cpu, u32 cluster, int node)
 {
+	struct cpumask *cmsk = NULL;
+	unsigned int cpu_i;
+	u32 apicid;
+
 	if (per_cpu(cluster_masks, cpu))
 		return 0;
-	/*
-	 * If a hotplug spare mask exists, check whether it's on the right
-	 * node. If not, free it and allocate a new one.
-	 */
-	if (cluster_hotplug_mask) {
-		if (cluster_hotplug_mask->node == node)
-			return 0;
-		kfree(cluster_hotplug_mask);
+
+	/* For the hotplug case, don't always allocate a new one */
+	if (system_state >= SYSTEM_RUNNING) {
+		for_each_present_cpu(cpu_i) {
+			apicid = apic->cpu_present_to_apicid(cpu_i);
+			if (apicid != BAD_APICID && apic_cluster(apicid) == cluster) {
+				cmsk = per_cpu(cluster_masks, cpu_i);
+				if (cmsk)
+					break;
+			}
+		}
+	}
+	if (!cmsk) {
+		cmsk = kzalloc_node(sizeof(*cmsk), GFP_KERNEL, node);
+		if (!cmsk)
+			return -ENOMEM;
 	}
 
-	cluster_hotplug_mask = kzalloc_node(sizeof(*cluster_hotplug_mask),
-					    GFP_KERNEL, node);
-	if (!cluster_hotplug_mask)
-		return -ENOMEM;
-	cluster_hotplug_mask->node = node;
+	per_cpu(cluster_masks, cpu) = cmsk;
+
+	if (system_state < SYSTEM_RUNNING)
+		prefill_clustermask(cmsk, cluster);
+
 	return 0;
 }
 
 static int x2apic_prepare_cpu(unsigned int cpu)
 {
-	if (alloc_clustermask(cpu, cpu_to_node(cpu)) < 0)
+	u32 phys_apicid = apic->cpu_present_to_apicid(cpu);
+	u32 cluster = apic_cluster(phys_apicid);
+	u32 logical_apicid = (cluster << 16) | (1 << (phys_apicid & 0xf));
+
+	x86_cpu_to_logical_apicid[cpu] = logical_apicid;
+
+	if (alloc_clustermask(cpu, cluster, cpu_to_node(cpu)) < 0)
 		return -ENOMEM;
 	if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL))
 		return -ENOMEM;
@@ -162,10 +178,10 @@  static int x2apic_prepare_cpu(unsigned int cpu)
 
 static int x2apic_dead_cpu(unsigned int dead_cpu)
 {
-	struct cluster_mask *cmsk = per_cpu(cluster_masks, dead_cpu);
+	struct cpumask *cmsk = per_cpu(cluster_masks, dead_cpu);
 
 	if (cmsk)
-		cpumask_clear_cpu(dead_cpu, &cmsk->mask);
+		cpumask_clear_cpu(dead_cpu, cmsk);
 	free_cpumask_var(per_cpu(ipi_mask, dead_cpu));
 	return 0;
 }