[RFC] smp: Add cpu unstopped mask for smp_send_stop/stop_other_cpus

Message ID 20190820100843.3028-1-hsinyi@chromium.org (mailing list archive)
State New, archived
Series [RFC] smp: Add cpu unstopped mask for smp_send_stop/stop_other_cpus

Commit Message

Hsin-Yi Wang Aug. 20, 2019, 10:08 a.m. UTC
In arm/arm64/x86, the reboot IPI function uses the CPU online mask to
let the primary CPU know how many secondary CPUs it has to wait for in
smp_send_stop()/native_stop_other_cpus().

However, this can trigger spurious warnings: interrupts and tasks may
still land on a CPU that has already executed the reboot IPI function.
That is harmless, since the CPU is already sitting in its spin loop,
but a warning fires because the CPU is marked offline. These warnings
are meant to catch failures during normal hotplug offlining, and a
reboot is not a regular hotplug operation. So instead of reusing the
online mask, use a new mask in the reboot IPI functions.
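
A condensed sketch of the idea, using the helpers this patch
introduces (the *_sketch function names are illustrative only):

  /*
   * Sketch: the stop IPI handler clears a dedicated "unstopped" mask
   * and leaves the online mask alone, so sanity checks against
   * cpu_online_mask no longer fire for CPUs parked by the reboot IPI.
   */
  static void ipi_cpu_stop_sketch(unsigned int cpu)
  {
  	local_irq_disable();
  	set_cpu_unstopped(cpu, false);	/* online mask untouched */
  	while (1)
  		cpu_relax();
  }

  static void smp_send_stop_sketch(void)
  {
  	unsigned long timeout = USEC_PER_SEC;

  	/* ... send the stop IPI to all other CPUs here ... */

  	/* Wait up to one second for the others to park themselves. */
  	while (num_unstopped_cpus() > 1 && timeout--)
  		udelay(1);
  }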

Take tick broadcast as an example. When a broadcast and
smp_send_stop() race, the CPU that receives the broadcast first is
usually already spinning and does nothing. But if the first broadcast
arrives at a CPU that has not yet executed the reboot IPI, it tries to
IPI another CPU; that target is already marked offline, and the
warning appears:

[   22.481523] reboot: Restarting system
[   22.481608] WARNING: CPU: 4 PID: 0 at ...
.....
[   22.481980] Call trace:
[   22.481991]  tick_handle_oneshot_broadcast+0x1f8/0x214
[   22.482003]  mtk_syst_handler+0x34/0x44
[   22.482016]  __handle_irq_event_percpu+0x16c/0x28c
[   22.482026]  handle_irq_event_percpu+0x34/0x8c
[   22.482035]  handle_irq_event+0x48/0x78
[   22.482044]  handle_fasteoi_irq+0xd0/0x1a0
[   22.482054]  __handle_domain_irq+0x84/0xc4
[   22.482065]  gic_handle_irq+0x154/0x1a4
[   22.482073]  el1_irq+0xb0/0x128
[   22.482081]  __do_softirq+0x88/0x2fc
[   22.482091]  irq_exit+0xa0/0xa4
[   22.482101]  handle_IPI+0x1ac/0x2cc
[   22.482109]  gic_handle_irq+0x124/0x1a4
[   22.482117]  el1_irq+0xb0/0x128
[   22.482127]  cpuidle_enter_state+0x298/0x328
[   22.482135]  cpuidle_enter+0x30/0x40
[   22.482146]  do_idle+0x154/0x270
[   22.482154]  cpu_startup_entry+0x24/0x28
[   22.482164]  secondary_start_kernel+0x15c/0x168
[   22.482171] ---[ end trace 25f699b7e87857ff ]---

From kernel/time/tick-broadcast.c:
  /*
   * Sanity check. Catch the case where we try to broadcast to
   * offline cpus.
   */
  if (WARN_ON_ONCE(!cpumask_subset(tmpmask, cpu_online_mask)))
          cpumask_and(tmpmask, tmpmask, cpu_online_mask);
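
The race, reconstructed from the description above (timeline is
illustrative):

  /*
   *  CPU0 (reboot)         CPU1 (stopped)        CPU2 (still running)
   *  smp_send_stop()
   *    IPI -> CPU1, CPU2
   *                        ipi_cpu_stop():
   *                          set_cpu_online(1, false)
   *                          spin with irqs off
   *                                              broadcast tick fires
   *                                              wants to IPI CPU1
   *                                              CPU1 not in online mask
   *                                              -> WARN_ON_ONCE()
   *                        (harmless: CPU1 is already parked)
   */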

Signed-off-by: Hsin-Yi Wang <hsinyi@chromium.org>
---
Note
- The warning comes from arm64 device
- Previous related patches
 - https://lkml.org/lkml/2012/8/22/3
 - https://patchwork.kernel.org/patch/10535409/
---
 arch/arm/kernel/smp.c     |  9 +++++----
 arch/arm64/kernel/smp.c   | 12 +++++++-----
 arch/x86/kernel/process.c |  2 +-
 arch/x86/kernel/smp.c     |  6 +++---
 arch/x86/kernel/smpboot.c |  2 ++
 include/linux/cpumask.h   | 18 ++++++++++++++++++
 kernel/cpu.c              |  4 ++++
 7 files changed, 40 insertions(+), 13 deletions(-)

Comments

Thomas Gleixner Aug. 20, 2019, 11:24 a.m. UTC | #1
On Tue, 20 Aug 2019, Hsin-Yi Wang wrote:

> In arm/arm64/x86, the reboot IPI function uses the CPU online mask to
> let the primary CPU know how many secondary CPUs it has to wait for in
> smp_send_stop()/native_stop_other_cpus().
> 
> However, this can trigger spurious warnings: interrupts and tasks may
> still land on a CPU that has already executed the reboot IPI function.
> That is harmless, since the CPU is already sitting in its spin loop,
> but a warning fires because the CPU is marked offline. These warnings
> are meant to catch failures during normal hotplug offlining, and a
> reboot is not a regular hotplug operation. So instead of reusing the
> online mask, use a new mask in the reboot IPI functions.
> 
> Take tick broadcast as an example. When a broadcast and
> smp_send_stop() race, the CPU that receives the broadcast first is
> usually already spinning and does nothing. But if the first broadcast
> arrives at a CPU that has not yet executed the reboot IPI, it tries to
> IPI another CPU; that target is already marked offline, and the
> warning appears:
> 
> [   22.481523] reboot: Restarting system
> [   22.481608] WARNING: CPU: 4 PID: 0 at ...

That is really the completely wrong approach. There is no valid reason for a
regular reboot to use a shortcut, homebrew variant of stopping CPUs.

The proper solution is to restrict this mechanism to emergency reboots and
let the normal reboot go through the regular CPU hotplug mechanism. That
avoids all this duct tape, which is just bound to break again tomorrow.

In case of an emergency reboot, we really do not care about any extra stuff
triggered. The machine is hosed already.

Thanks

	tglx
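
A rough sketch of the direction suggested in this reply (hypothetical;
the function name, the emergency flag, and the call shape are
illustrative, not from this thread):

  /*
   * Hypothetical sketch only -- not part of the posted patch. Offline
   * secondaries through the regular hotplug machinery on an orderly
   * reboot, and reserve the hard stop IPI for emergency reboots where
   * the machine is hosed anyway.
   */
  void shutdown_secondary_cpus_sketch(bool emergency)
  {
  	unsigned int cpu;

  	if (emergency) {
  		/* Machine state is untrusted: stop CPUs the hard way. */
  		smp_send_stop();
  		return;
  	}

  	/* Orderly reboot: full hotplug teardown for each secondary. */
  	for_each_online_cpu(cpu) {
  		if (cpu == smp_processor_id())
  			continue;
  		cpu_down(cpu);	/* regular hotplug offline path */
  	}
  }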

Patch

diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 4b0bab2607e4..18f90cea05b2 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -245,6 +245,7 @@  int __cpu_disable(void)
 	 * and we must not schedule until we're ready to give up the cpu.
 	 */
 	set_cpu_online(cpu, false);
+	set_cpu_unstopped(cpu, false);
 
 	/*
 	 * OK - migrate IRQs away from this CPU
@@ -430,6 +431,7 @@  asmlinkage void secondary_start_kernel(void)
 	 * before we continue - which happens after __cpu_up returns.
 	 */
 	set_cpu_online(cpu, true);
+	set_cpu_unstopped(cpu, true);
 
 	check_other_bugs();
 
@@ -593,11 +595,10 @@  static void ipi_cpu_stop(unsigned int cpu)
 		raw_spin_unlock(&stop_lock);
 	}
 
-	set_cpu_online(cpu, false);
-
 	local_fiq_disable();
 	local_irq_disable();
 
+	set_cpu_unstopped(cpu, false);
 	while (1) {
 		cpu_relax();
 		wfe();
@@ -713,10 +714,10 @@  void smp_send_stop(void)
 
 	/* Wait up to one second for other CPUs to stop */
 	timeout = USEC_PER_SEC;
-	while (num_online_cpus() > 1 && timeout--)
+	while (num_unstopped_cpus() > 1 && timeout--)
 		udelay(1);
 
-	if (num_online_cpus() > 1)
+	if (num_unstopped_cpus() > 1)
 		pr_warn("SMP: failed to stop secondary CPUs\n");
 }
 
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 018a33e01b0e..ff0d9fcf97ed 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -249,6 +249,7 @@  asmlinkage notrace void secondary_start_kernel(void)
 					 read_cpuid_id());
 	update_cpu_boot_status(CPU_BOOT_SUCCESS);
 	set_cpu_online(cpu, true);
+	set_cpu_unstopped(cpu, true);
 	complete(&cpu_running);
 
 	local_daif_restore(DAIF_PROCCTX);
@@ -299,6 +300,7 @@  int __cpu_disable(void)
 	 * and we must not schedule until we're ready to give up the cpu.
 	 */
 	set_cpu_online(cpu, false);
+	set_cpu_unstopped(cpu, false);
 
 	/*
 	 * OK - migrate IRQs away from this CPU
@@ -827,7 +829,7 @@  void arch_irq_work_raise(void)
 
 static void local_cpu_stop(void)
 {
-	set_cpu_online(smp_processor_id(), false);
+	set_cpu_unstopped(smp_processor_id(), false);
 
 	local_daif_mask();
 	sdei_mask_local_cpu();
@@ -957,7 +959,7 @@  void smp_send_stop(void)
 {
 	unsigned long timeout;
 
-	if (num_online_cpus() > 1) {
+	if (num_unstopped_cpus() > 1) {
 		cpumask_t mask;
 
 		cpumask_copy(&mask, cpu_online_mask);
@@ -970,12 +972,12 @@  void smp_send_stop(void)
 
 	/* Wait up to one second for other CPUs to stop */
 	timeout = USEC_PER_SEC;
-	while (num_online_cpus() > 1 && timeout--)
+	while (num_unstopped_cpus() > 1 && timeout--)
 		udelay(1);
 
-	if (num_online_cpus() > 1)
+	if (num_unstopped_cpus() > 1)
 		pr_warning("SMP: failed to stop secondary CPUs %*pbl\n",
-			   cpumask_pr_args(cpu_online_mask));
+			   cpumask_pr_args(cpu_unstopped_mask));
 
 	sdei_mask_local_cpu();
 }
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 5e94c4354d4e..fb286f189082 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -601,7 +601,6 @@  void stop_this_cpu(void *dummy)
 	/*
 	 * Remove this CPU:
 	 */
-	set_cpu_online(smp_processor_id(), false);
 	disable_local_APIC();
 	mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
 
@@ -616,6 +615,7 @@  void stop_this_cpu(void *dummy)
 	 */
 	if (boot_cpu_has(X86_FEATURE_SME))
 		native_wbinvd();
+	set_cpu_unstopped(smp_processor_id(), false);
 	for (;;) {
 		/*
 		 * Use native_halt() so that memory contents don't change
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index b8d4e9c3c070..99daba583a9a 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -167,7 +167,7 @@  static void native_stop_other_cpus(int wait)
 	 * code.  By syncing, we give the cpus up to one second to
 	 * finish their work before we force them off with the NMI.
 	 */
-	if (num_online_cpus() > 1) {
+	if (num_unstopped_cpus() > 1) {
 		/* did someone beat us here? */
 		if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1)
 			return;
@@ -184,12 +184,12 @@  static void native_stop_other_cpus(int wait)
 		 * CPUs reach shutdown state.
 		 */
 		timeout = USEC_PER_SEC;
-		while (num_online_cpus() > 1 && timeout--)
+		while (num_unstopped_cpus() > 1 && timeout--)
 			udelay(1);
 	}
 
 	/* if the REBOOT_VECTOR didn't work, try with the NMI */
-	if (num_online_cpus() > 1) {
+	if (num_unstopped_cpus() > 1) {
 		/*
 		 * If NMI IPI is enabled, try to register the stop handler
 		 * and send the IPI. In any case try to wait for the other
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 69881b2d446c..2fa96cc9e7d2 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -247,6 +247,7 @@  static void notrace start_secondary(void *unused)
 	 */
 	lock_vector_lock();
 	set_cpu_online(smp_processor_id(), true);
+	set_cpu_unstopped(smp_processor_id(), true);
 	lapic_online();
 	unlock_vector_lock();
 	cpu_set_state_online(smp_processor_id());
@@ -1562,6 +1563,7 @@  static void remove_siblinginfo(int cpu)
 static void remove_cpu_from_maps(int cpu)
 {
 	set_cpu_online(cpu, false);
+	set_cpu_unstopped(cpu, false);
 	cpumask_clear_cpu(cpu, cpu_callout_mask);
 	cpumask_clear_cpu(cpu, cpu_callin_mask);
 	/* was set by cpu_init() */
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 78a73eba64dd..3cd929d4ebc8 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -89,10 +89,12 @@  extern unsigned int nr_cpu_ids;
 
 extern struct cpumask __cpu_possible_mask;
 extern struct cpumask __cpu_online_mask;
+extern struct cpumask __cpu_unstopped_mask;
 extern struct cpumask __cpu_present_mask;
 extern struct cpumask __cpu_active_mask;
 #define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask)
 #define cpu_online_mask   ((const struct cpumask *)&__cpu_online_mask)
+#define cpu_unstopped_mask   ((const struct cpumask *)&__cpu_unstopped_mask)
 #define cpu_present_mask  ((const struct cpumask *)&__cpu_present_mask)
 #define cpu_active_mask   ((const struct cpumask *)&__cpu_active_mask)
 
@@ -111,6 +113,12 @@  static inline unsigned int num_online_cpus(void)
 {
 	return atomic_read(&__num_online_cpus);
 }
+
+static inline unsigned int num_unstopped_cpus(void)
+{
+	return cpumask_weight(cpu_unstopped_mask);
+}
+
 #define num_possible_cpus()	cpumask_weight(cpu_possible_mask)
 #define num_present_cpus()	cpumask_weight(cpu_present_mask)
 #define num_active_cpus()	cpumask_weight(cpu_active_mask)
@@ -120,6 +128,7 @@  static inline unsigned int num_online_cpus(void)
 #define cpu_active(cpu)		cpumask_test_cpu((cpu), cpu_active_mask)
 #else
 #define num_online_cpus()	1U
+#define num_unstopped_cpus()	1U
 #define num_possible_cpus()	1U
 #define num_present_cpus()	1U
 #define num_active_cpus()	1U
@@ -837,6 +846,15 @@  set_cpu_present(unsigned int cpu, bool present)
 
 void set_cpu_online(unsigned int cpu, bool online);
 
+static inline void
+set_cpu_unstopped(unsigned int cpu, bool unstopped)
+{
+	if (unstopped)
+		cpumask_set_cpu(cpu, &__cpu_unstopped_mask);
+	else
+		cpumask_clear_cpu(cpu, &__cpu_unstopped_mask);
+}
+
 static inline void
 set_cpu_active(unsigned int cpu, bool active)
 {
diff --git a/kernel/cpu.c b/kernel/cpu.c
index e1967e9eddc2..8b95c06e674f 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -2292,6 +2292,9 @@  EXPORT_SYMBOL(__cpu_possible_mask);
 struct cpumask __cpu_online_mask __read_mostly;
 EXPORT_SYMBOL(__cpu_online_mask);
 
+struct cpumask __cpu_unstopped_mask __read_mostly;
+EXPORT_SYMBOL(__cpu_unstopped_mask);
+
 struct cpumask __cpu_present_mask __read_mostly;
 EXPORT_SYMBOL(__cpu_present_mask);
 
@@ -2346,6 +2349,7 @@  void __init boot_cpu_init(void)
 
 	/* Mark the boot cpu "present", "online" etc for SMP and UP case */
 	set_cpu_online(cpu, true);
+	set_cpu_unstopped(cpu, true);
 	set_cpu_active(cpu, true);
 	set_cpu_present(cpu, true);
 	set_cpu_possible(cpu, true);