Message ID | 20230202215625.3248306-2-usama.arif@bytedance.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | Parallel CPU bringup for x86_64 | expand |
Usama! On Thu, Feb 02 2023 at 21:56, Usama Arif wrote: > For each CPU being brought up, the alloc_clustermask() function > allocates a new struct cluster_mask just in case it's needed. Then the > target CPU actually runs, and in init_x2apic_ldr() it either uses a > cluster_mask from a previous CPU in the same cluster, or consumes the > "spare" one and sets the global pointer to NULL. > > That isn't going to parallelise stunningly well. > > Ditch the global variable, let alloc_clustermask() install the struct > *directly* in the per_cpu data for the CPU being brought up. As an > optimisation, actually make it do so for *all* present CPUs in the same > cluster, which means only one iteration over for_each_present_cpu() > instead of doing so repeatedly, once for each CPU. > > Now, in fact, there's no point in the 'node' or 'clusterid' members of > the struct cluster_mask, so just kill it and use struct cpumask instead. > > This was a harmless "bug" while CPU bringup wasn't actually happening in > parallel. It's about to become less harmless... Just to be clear. There is no bug in todays code and therefore this: > Fixes: 023a611748fd5 ("x86/apic/x2apic: Simplify cluster management") tag is unjustified. It'll just cause the stable robots to backport it for no reason. > Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> > Signed-off-by: Paul E. McKenney <paulmck@kernel.org> How is this SOB chain correct? It's unclear to me how Paul got involved here, but let's assume he handed the patch over to you, then this still lacks a SOB from you. > +/* > + * As an optimisation during boot, set the cluster_mask for *all* > + * present CPUs at once, to prevent *each* of them having to iterate > + * over the others to find the existing cluster_mask. > + */ > +static void prefill_clustermask(struct cpumask *cmsk, u32 cluster) > +{ > + int cpu; > + > + for_each_present_cpu(cpu) { > + u32 apicid = apic->cpu_present_to_apicid(cpu); Lacks a newline between declaration and code. 
> + if (apicid != BAD_APICID && apic_cluster(apicid) == cluster) { > + struct cpumask **cpu_cmsk = &per_cpu(cluster_masks, cpu); > + > + BUG_ON(*cpu_cmsk && *cpu_cmsk != cmsk); While I agree that changing an in use mask pointer would be fatal, I really have to ask why this code would be invoked on a partially initialized cluster at all and why that would be correct. if (WARN_ON_ONCE(*cpu_cmsk == cmsk)) return; BUG_ON(*cpu_mask); if at all. But of course that falls over with the way how this code is invoked below. > + *cpu_cmsk = cmsk; > + } > > -static int alloc_clustermask(unsigned int cpu, int node) > +static int alloc_clustermask(unsigned int cpu, u32 cluster, int node) > { > + struct cpumask *cmsk = NULL; > + unsigned int cpu_i; > + u32 apicid; > + > if (per_cpu(cluster_masks, cpu)) > return 0; > - /* > - * If a hotplug spare mask exists, check whether it's on the right > - * node. If not, free it and allocate a new one. > - */ > - if (cluster_hotplug_mask) { > - if (cluster_hotplug_mask->node == node) > - return 0; > - kfree(cluster_hotplug_mask); > + > + /* For the hotplug case, don't always allocate a new one */ -ENOPARSE > + if (system_state >= SYSTEM_RUNNING) { > + for_each_present_cpu(cpu_i) { > + apicid = apic->cpu_present_to_apicid(cpu_i); > + if (apicid != BAD_APICID && apic_cluster(apicid) == cluster) { > + cmsk = per_cpu(cluster_masks, cpu_i); > + if (cmsk) > + break; > + } > + } > + } > + if (!cmsk) { > + cmsk = kzalloc_node(sizeof(*cmsk), GFP_KERNEL, node); > + if (!cmsk) > + return -ENOMEM; > } ... > + per_cpu(cluster_masks, cpu) = cmsk; > + > + if (system_state < SYSTEM_RUNNING) > + prefill_clustermask(cmsk, cluster); TBH. The logic of this code is anything but obvious. Something like the uncompiled below perhaps? Thanks, tglx --- @@ -116,44 +109,90 @@ + +/* + * As an optimisation during boot, set the cluster_mask for all present + * CPUs at once, to prevent each of them having to iterate over the others + * to find the existing cluster_mask. 
+ */ +static void prefill_clustermask(struct cpumask *cmsk, unsigned int cpu, u32 cluster) +{ + int cpu_i; - cluster = apicid >> 16; - for_each_online_cpu(cpu) { - cmsk = per_cpu(cluster_masks, cpu); - /* Matching cluster found. Link and update it. */ - if (cmsk && cmsk->clusterid == cluster) - goto update; + for_each_present_cpu(cpu_i) { + struct cpumask **cpu_cmsk = &per_cpu(cluster_masks, cpu); + u32 apicid = apic->cpu_present_to_apicid(cpu_i); + + if (apicid == BAD_APICID || cpu_i == cpu || apic_cluster(apicid) != cluster) + continue; + + if (WARN_ON_ONCE(*cpu_mask == cmsk)) + continue; + + BUG_ON(*cpu_cmsk); + *cpu_cmsk = cmsk; } - cmsk = cluster_hotplug_mask; - cmsk->clusterid = cluster; - cluster_hotplug_mask = NULL; -update: - this_cpu_write(cluster_masks, cmsk); - cpumask_set_cpu(smp_processor_id(), &cmsk->mask); } -static int alloc_clustermask(unsigned int cpu, int node) +static int alloc_clustermask(unsigned int cpu, u32 cluster, int node) { + struct cpumask *cmsk; + unsigned int cpu_i; + if (per_cpu(cluster_masks, cpu)) return 0; + /* - * If a hotplug spare mask exists, check whether it's on the right - * node. If not, free it and allocate a new one. + * At boot time CPU present mask is stable. If the cluster is not + * yet initialized, allocate the mask and propagate it to all + * siblings in this cluster. */ - if (cluster_hotplug_mask) { - if (cluster_hotplug_mask->node == node) - return 0; - kfree(cluster_hotplug_mask); - } + if (system_state < SYSTEM_RUNNING) + goto alloc; + + /* + * On post boot hotplug iterate over the present CPUs to handle the + * case of partial clusters as they might be presented by + * virtualization. 
+ */ + for_each_present_cpu(cpu_i) { + u32 apicid = apic->cpu_present_to_apicid(cpu_i); + + if (apicid != BAD_APICID && apic_cluster(apicid) == cluster) { + cmsk = per_cpu(cluster_masks, cpu_i); - cluster_hotplug_mask = kzalloc_node(sizeof(*cluster_hotplug_mask), - GFP_KERNEL, node); - if (!cluster_hotplug_mask) + /* + * If the cluster is already initialized, just + * store the mask and return. No point in trying to + * propagate. + */ + if (cmsk) { + per_cpu(cluster_masks, cpu) = cmsk; + return 0; + } + } + } + /* + * The cluster is not initialized yet. Fall through to the boot + * time code which might initialize the whole cluster if it is + * in the CPU present mask. + */ +alloc: + cmsk = kzalloc_node(sizeof(*cmsk), GFP_KERNEL, node); + if (!cmsk) return -ENOMEM; - cluster_hotplug_mask->node = node; + per_cpu(cluster_masks, cpu) = cmsk; + prefill_clustermask(cmsk, cluster); + return 0; }
On Tue, 2023-02-07 at 00:20 +0100, Thomas Gleixner wrote: > > > TBH. The logic of this code is anything but obvious. Something like the > uncompiled below perhaps? Looks sane to me. I'll tweak the comments a bit and give it a spin; thanks. ... > + * At boot time CPU present mask is stable. If the cluster is not > + * yet initialized, allocate the mask and propagate it to all > + * siblings in this cluster. > */ > - if (cluster_hotplug_mask) { > - if (cluster_hotplug_mask->node == node) > - return 0; > - kfree(cluster_hotplug_mask); > - } > + if (system_state < SYSTEM_RUNNING) > + goto alloc; > + > + /* > + * On post boot hotplug iterate over the present CPUs to handle the > + * case of partial clusters as they might be presented by > + * virtualization. > + */ > + for_each_present_cpu(cpu_i) { So... if this CPU was *present* at boot time (and if any other CPU in this cluster was present), it will already have a cluster_mask. Which means we get here in two cases: • This CPU wasn't actually present (was just 'possible') at boot time. (Is that actually a thing that happens?) • This CPU was present but no other CPU in this cluster was actually brought up at boot time so the cluster_mask wasn't allocated. The code looks right, I don't grok the comment about partial clusters and virtualization, and would have worded it something along the above lines?
On Tue, 2023-02-07 at 10:57 +0000, David Woodhouse wrote: > > > + /* > > + * On post boot hotplug iterate over the present CPUs to handle the > > + * case of partial clusters as they might be presented by > > + * virtualization. > > + */ > > + for_each_present_cpu(cpu_i) { > > > So... if this CPU was *present* at boot time (and if any other CPU in > this cluster was present), it will already have a cluster_mask. > > Which means we get here in two cases: > > • This CPU wasn't actually present (was just 'possible') at boot time. > (Is that actually a thing that happens?) > > • This CPU was present but no other CPU in this cluster was actually > brought up at boot time so the cluster_mask wasn't allocated. > > The code looks right, I don't grok the comment about partial clusters > and virtualization, and would have worded it something along the above > lines? As I get my head around that, I think the code needs to change too. What if we *unplug* the only CPU in a cluster (present→possible), then add a new one in the same cluster? The new one would get a new cluster_mask. Which is kind of OK for now but then if we re-add the original CPU it'd continue to use its old cluster_mask. Now, that's kind of weird if it's physical CPUs because that cluster is within a given chip, isn't it? But with virtualization maybe that's something that could happen, and it doesn't hurt to be completely safe by using for_each_possible_cpu() instead? Now looks like this: /* * On post boot hotplug for a CPU which was not present at boot time, * iterate over all possible CPUs (even those which are not present * any more) to find any existing cluster mask. */ for_each_possible_cpu(cpu_i) {
David! On Tue, Feb 07 2023 at 10:57, David Woodhouse wrote: > On Tue, 2023-02-07 at 00:20 +0100, Thomas Gleixner wrote: >> + /* >> + * On post boot hotplug iterate over the present CPUs to handle the >> + * case of partial clusters as they might be presented by >> + * virtualization. >> + */ >> + for_each_present_cpu(cpu_i) { > > > So... if this CPU was *present* at boot time (and if any other CPU in > this cluster was present), it will already have a cluster_mask. > > Which means we get here in two cases: > > • This CPU wasn't actually present (was just 'possible') at boot time. > (Is that actually a thing that happens?) It happens on systems which support physical hotplug and AFAIK also virtualization has support for "physical" hotplug. The same is true the other way round on physical unplug. Then the CPU is removed from present but is still set in possible. > • This CPU was present but no other CPU in this cluster was actually > brought up at boot time so the cluster_mask wasn't allocated. Correct. > The code looks right, I don't grok the comment about partial clusters > and virtualization, and would have worded it something along the above > lines? My worry was that virtualization might be able to physically hotplug partial clusters. Whether that's possible I don't know, but in context of virtualization I always assume the worst case. Thanks, tglx
On Tue, Feb 07 2023 at 11:27, David Woodhouse wrote: > On Tue, 2023-02-07 at 10:57 +0000, David Woodhouse wrote: >> • This CPU was present but no other CPU in this cluster was actually >> brought up at boot time so the cluster_mask wasn't allocated. >> >> The code looks right, I don't grok the comment about partial clusters >> and virtualization, and would have worded it something along the above >> lines? > > As I get my head around that, I think the code needs to change too. > What if we *unplug* the only CPU in a cluster (present→possible), then > add a new one in the same cluster? The new one would get a new > cluster_mask. Which is kind of OK for now but then if we re-add the > original CPU it'd continue to use its old cluster_mask. Indeed. > Now, that's kind of weird if it's physical CPUs because that cluster is > within a given chip, isn't it? But with virtualization maybe that's > something that could happen, and it doesn't hurt to be completely safe > by using for_each_possible_cpu() instead? Yes. Virtualization does awful things.... > Now looks like this: > /* > * On post boot hotplug for a CPU which was not present at boot time, > * iterate over all possible CPUs (even those which are not present > * any more) to find any existing cluster mask. > */ > for_each_possible_cpu(cpu_i) { Looks good! tglx
On Tue, 2023-02-07 at 15:24 +0100, Thomas Gleixner wrote: > On Tue, Feb 07 2023 at 11:27, David Woodhouse wrote: > > On Tue, 2023-02-07 at 10:57 +0000, David Woodhouse wrote: > > > • This CPU was present but no other CPU in this cluster was actually > > > brought up at boot time so the cluster_mask wasn't allocated. > > > > > > The code looks right, I don't grok the comment about partial clusters > > > and virtualization, and would have worded it something along the above > > > lines? > > > > As I get my head around that, I think the code needs to change too. > > What if we *unplug* the only CPU in a cluster (present→possible), then > > add a new one in the same cluster? The new one would get a new > > cluster_mask. Which is kind of OK for now but then if we re-add the > > original CPU it'd continue to use its old cluster_mask. > > Indeed. > > > Now, that's kind of weird if it's physical CPUs because that cluster is > > within a given chip, isn't it? But with virtualization maybe that's > > something that could happen, and it doesn't hurt to be completely safe > > by using for_each_possible_cpu() instead? > > Yes. Virtualization does aweful things.... > > > Now looks like this: > > /* > > * On post boot hotplug for a CPU which was not present at boot time, > > * iterate over all possible CPUs (even those which are not present > > * any more) to find any existing cluster mask. > > */ > > for_each_possible_cpu(cpu_i) { > > Looks good! Thanks. I've reworked and I think I've caught everything. Didn't want to elide the credit where Usama had done some of the forward-porting work, so I've left those notes and the SoB intact on those patches, on the assumption that they will be reposting the series after proper testing on hardware again anyway (I'm only spawning it in qemu right now). 
https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/parallel-6.2-rc7 The only real code change other than what we've discussed here is to implement what we talked about for CPUID 0xb vs. 0x1 etc: /* * We can do 64-bit AP bringup in parallel if the CPU reports * its APIC ID in CPUID (either leaf 0x0B if we need the full * APIC ID in X2APIC mode, or leaf 0x01 if 8 bits are * sufficient). Otherwise it's too hard. And not for SEV-ES * guests because they can't use CPUID that early. */ if (IS_ENABLED(CONFIG_X86_32) || boot_cpu_data.cpuid_level < 1 || (x2apic_mode && boot_cpu_data.cpuid_level < 0xb) || cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) do_parallel_bringup = false; if (do_parallel_bringup && x2apic_mode) { unsigned int eax, ebx, ecx, edx; /* * To support parallel bringup in x2apic mode, the AP will need * to obtain its APIC ID from CPUID 0x0B, since CPUID 0x01 has * only 8 bits. Check that it is present and seems correct. */ cpuid_count(0xb, 0, &eax, &ebx, &ecx, &edx); /* * AMD says that if executed with an unimplemented level in * ECX, then it will return all zeroes in EAX. Intel says it * will return zeroes in both EAX and EBX. Checking only EAX * should be sufficient. */ if (eax) { smpboot_control = STARTUP_SECONDARY | STARTUP_APICID_CPUID_0B; } else { pr_info("Disabling parallel bringup because CPUID 0xb looks untrustworthy\n"); do_parallel_bringup = false; } } else if (do_parallel_bringup) { /* Without X2APIC, what's in CPUID 0x01 should suffice. */ smpboot_control = STARTUP_SECONDARY | STARTUP_APICID_CPUID_01; }
On Tue, Feb 07 2023 at 19:53, David Woodhouse wrote: > On Tue, 2023-02-07 at 15:24 +0100, Thomas Gleixner wrote: > Thanks. I've reworked and I think I've caught everything. Didn't want > to elide the credit where Usama had done some of the forward-porting > work, so I've left those notes and the SoB intact on those patches, on > the assumption that they will be reposting the series after proper > testing on hardware again anyway (I'm only spawning it in qemu right > now). > > https://git.infradead.org/users/dwmw2/linux.git/shortlog/refs/heads/parallel-6.2-rc7 > > The only real code change other than what we've discussed here is to > implement what we talked about for CPUID 0xb vs. 0x1 etc: > > /* > * We can do 64-bit AP bringup in parallel if the CPU reports > * its APIC ID in CPUID (either leaf 0x0B if we need the full > * APIC ID in X2APIC mode, or leaf 0x01 if 8 bits are > * sufficient). Otherwise it's too hard. And not for SEV-ES > * guests because they can't use CPUID that early. > */ > if (IS_ENABLED(CONFIG_X86_32) || boot_cpu_data.cpuid_level < 1 || > (x2apic_mode && boot_cpu_data.cpuid_level < 0xb) || > cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) > do_parallel_bringup = false; > > if (do_parallel_bringup && x2apic_mode) { > unsigned int eax, ebx, ecx, edx; > > /* > * To support parallel bringup in x2apic mode, the AP will need > * to obtain its APIC ID from CPUID 0x0B, since CPUID 0x01 has > * only 8 bits. Check that it is present and seems correct. > */ > cpuid_count(0xb, 0, &eax, &ebx, &ecx, &edx); > > /* > * AMD says that if executed with an umimplemented level in > * ECX, then it will return all zeroes in EAX. Intel says it > * will return zeroes in both EAX and EBX. Checking only EAX > * should be sufficient. 
> */ > if (eax) { > smpboot_control = STARTUP_SECONDARY | STARTUP_APICID_CPUID_0B; > } else { > pr_info("Disabling parallel bringup because CPUID 0xb looks untrustworthy\n"); > do_parallel_bringup = false; > } > } else if (do_parallel_bringup) { > /* Without X2APIC, what's in CPUID 0x01 should suffice. */ > smpboot_control = STARTUP_SECONDARY | STARTUP_APICID_CPUID_01; > } Looks good to me! Thanks, tglx
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index e696e22d0531..e116dfaf5922 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -9,11 +9,7 @@ #include "local.h" -struct cluster_mask { - unsigned int clusterid; - int node; - struct cpumask mask; -}; +#define apic_cluster(apicid) ((apicid) >> 4) /* * __x2apic_send_IPI_mask() possibly needs to read @@ -23,8 +19,7 @@ struct cluster_mask { static u32 *x86_cpu_to_logical_apicid __read_mostly; static DEFINE_PER_CPU(cpumask_var_t, ipi_mask); -static DEFINE_PER_CPU_READ_MOSTLY(struct cluster_mask *, cluster_masks); -static struct cluster_mask *cluster_hotplug_mask; +static DEFINE_PER_CPU_READ_MOSTLY(struct cpumask *, cluster_masks); static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { @@ -60,10 +55,10 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) /* Collapse cpus in a cluster so a single IPI per cluster is sent */ for_each_cpu(cpu, tmpmsk) { - struct cluster_mask *cmsk = per_cpu(cluster_masks, cpu); + struct cpumask *cmsk = per_cpu(cluster_masks, cpu); dest = 0; - for_each_cpu_and(clustercpu, tmpmsk, &cmsk->mask) + for_each_cpu_and(clustercpu, tmpmsk, cmsk) dest |= x86_cpu_to_logical_apicid[clustercpu]; if (!dest) @@ -71,7 +66,7 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) __x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL); /* Remove cluster CPUs from tmpmask */ - cpumask_andnot(tmpmsk, tmpmsk, &cmsk->mask); + cpumask_andnot(tmpmsk, tmpmsk, cmsk); } local_irq_restore(flags); @@ -105,55 +100,76 @@ static u32 x2apic_calc_apicid(unsigned int cpu) static void init_x2apic_ldr(void) { - struct cluster_mask *cmsk = this_cpu_read(cluster_masks); - u32 cluster, apicid = apic_read(APIC_LDR); - unsigned int cpu; + struct cpumask *cmsk = this_cpu_read(cluster_masks); - x86_cpu_to_logical_apicid[smp_processor_id()] = apicid; + BUG_ON(!cmsk); - if 
(cmsk) - goto update; - - cluster = apicid >> 16; - for_each_online_cpu(cpu) { - cmsk = per_cpu(cluster_masks, cpu); - /* Matching cluster found. Link and update it. */ - if (cmsk && cmsk->clusterid == cluster) - goto update; + cpumask_set_cpu(smp_processor_id(), cmsk); +} + +/* + * As an optimisation during boot, set the cluster_mask for *all* + * present CPUs at once, to prevent *each* of them having to iterate + * over the others to find the existing cluster_mask. + */ +static void prefill_clustermask(struct cpumask *cmsk, u32 cluster) +{ + int cpu; + + for_each_present_cpu(cpu) { + u32 apicid = apic->cpu_present_to_apicid(cpu); + if (apicid != BAD_APICID && apic_cluster(apicid) == cluster) { + struct cpumask **cpu_cmsk = &per_cpu(cluster_masks, cpu); + + BUG_ON(*cpu_cmsk && *cpu_cmsk != cmsk); + *cpu_cmsk = cmsk; + } } - cmsk = cluster_hotplug_mask; - cmsk->clusterid = cluster; - cluster_hotplug_mask = NULL; -update: - this_cpu_write(cluster_masks, cmsk); - cpumask_set_cpu(smp_processor_id(), &cmsk->mask); } -static int alloc_clustermask(unsigned int cpu, int node) +static int alloc_clustermask(unsigned int cpu, u32 cluster, int node) { + struct cpumask *cmsk = NULL; + unsigned int cpu_i; + u32 apicid; + if (per_cpu(cluster_masks, cpu)) return 0; - /* - * If a hotplug spare mask exists, check whether it's on the right - * node. If not, free it and allocate a new one. 
- */ - if (cluster_hotplug_mask) { - if (cluster_hotplug_mask->node == node) - return 0; - kfree(cluster_hotplug_mask); + + /* For the hotplug case, don't always allocate a new one */ + if (system_state >= SYSTEM_RUNNING) { + for_each_present_cpu(cpu_i) { + apicid = apic->cpu_present_to_apicid(cpu_i); + if (apicid != BAD_APICID && apic_cluster(apicid) == cluster) { + cmsk = per_cpu(cluster_masks, cpu_i); + if (cmsk) + break; + } + } + } + if (!cmsk) { + cmsk = kzalloc_node(sizeof(*cmsk), GFP_KERNEL, node); + if (!cmsk) + return -ENOMEM; } - cluster_hotplug_mask = kzalloc_node(sizeof(*cluster_hotplug_mask), - GFP_KERNEL, node); - if (!cluster_hotplug_mask) - return -ENOMEM; - cluster_hotplug_mask->node = node; + per_cpu(cluster_masks, cpu) = cmsk; + + if (system_state < SYSTEM_RUNNING) + prefill_clustermask(cmsk, cluster); + return 0; } static int x2apic_prepare_cpu(unsigned int cpu) { - if (alloc_clustermask(cpu, cpu_to_node(cpu)) < 0) + u32 phys_apicid = apic->cpu_present_to_apicid(cpu); + u32 cluster = apic_cluster(phys_apicid); + u32 logical_apicid = (cluster << 16) | (1 << (phys_apicid & 0xf)); + + x86_cpu_to_logical_apicid[cpu] = logical_apicid; + + if (alloc_clustermask(cpu, cluster, cpu_to_node(cpu)) < 0) return -ENOMEM; if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL)) return -ENOMEM; @@ -162,10 +178,10 @@ static int x2apic_prepare_cpu(unsigned int cpu) static int x2apic_dead_cpu(unsigned int dead_cpu) { - struct cluster_mask *cmsk = per_cpu(cluster_masks, dead_cpu); + struct cpumask *cmsk = per_cpu(cluster_masks, dead_cpu); if (cmsk) - cpumask_clear_cpu(dead_cpu, &cmsk->mask); + cpumask_clear_cpu(dead_cpu, cmsk); free_cpumask_var(per_cpu(ipi_mask, dead_cpu)); return 0; }