
[RFC,6/6] CPU hotplug: Invoke CPU offline notifiers in reverse order

Message ID 20120725115502.3900.10942.stgit@srivatsabhat.in.ibm.com (mailing list archive)
State Not Applicable, archived

Commit Message

Srivatsa S. Bhat July 25, 2012, 11:55 a.m. UTC
During CPU hotplug, we want to create the following effect:
 * During CPU online, the CPU incrementally grows the number of services
   it offers.
 * During CPU offline, the services are incrementally retired, in the
   reverse order of their growth during CPU online.

To achieve this, invoke the registered CPU hotplug notifiers during CPU
offline in the reverse of the order in which they were invoked during CPU
online.

This makes the hotplug path a lot more logically coherent: services are
started in a well-known order, perhaps with dependencies between them, while
bringing up a CPU. While offlining the CPU, we honor many of those
dependencies automatically simply by backtracking the way we came, that is,
by invoking the notifiers in reverse order. This model of reverse notifier
invocation thus avoids the need to excessively juggle notifier priorities
just to get the ordering right.
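
As an illustration of the intended model (a minimal userspace sketch, not
the kernel implementation; the notifier names and priorities below are made
up), the chain is kept sorted by descending priority and walked front-to-back
for CPU online, and back-to-front, via recursion, for CPU offline:

#include <stdio.h>

struct notifier {
        const char *name;
        int priority;
        struct notifier *next;          /* sorted: highest priority first */
};

/* CPU online: walk the chain in priority order. */
static void call_chain(struct notifier *n, const char *event)
{
        for (; n; n = n->next)
                printf("%s: %s\n", event, n->name);
}

/* CPU offline: walk the same chain in the reverse order. */
static void call_chain_reverse(struct notifier *n, const char *event)
{
        if (!n)
                return;
        call_chain_reverse(n->next, event);     /* visit the tail first */
        printf("%s: %s\n", event, n->name);
}

int main(void)
{
        struct notifier workqueue = { "workqueue",     5, NULL };
        struct notifier migration = { "migration",    10, &workqueue };
        struct notifier sched     = { "sched/cpuset", 100, &migration };

        call_chain(&sched, "CPU_ONLINE");               /* sched, migration, workqueue */
        call_chain_reverse(&sched, "CPU_DOWN_PREPARE"); /* workqueue, migration, sched */
        return 0;
}

A notifier that runs early on the way up thus automatically runs late on the
way down, which is exactly the "backtrack the way we came" behaviour described
above.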

Notes:
1. The workqueue code becomes simpler, since it now needs just a single CPU
hotplug callback.
2. The scheduler's sched_cpu_[in]active() callbacks can also be merged into
a single callback.
3. Along similar lines, the cpusets' cpuset_cpu_[in]active() callbacks can also
be merged.

Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
---

 include/linux/cpu.h |   37 ++++++++++++++++++++++++-------------
 kernel/cpu.c        |   34 ++++++++++++++++++++++++++--------
 kernel/sched/core.c |   48 ++++++++++++++----------------------------------
 kernel/workqueue.c  |   30 +++++++++++-------------------
 4 files changed, 75 insertions(+), 74 deletions(-)


Comments

Tejun Heo July 25, 2012, 4:43 p.m. UTC | #1
Hello,

On Wed, Jul 25, 2012 at 05:25:13PM +0530, Srivatsa S. Bhat wrote:
> During CPU hotplug, we want to create the following effect:
>  * During CPU online, the CPU incrementally grows the number of services
>    it offers.
>  * During CPU offline, the services are incrementally retired, in the
>    reverse order of their growth during CPU online.
> 
> To achieve this, invoke the registered CPU hotplug notifiers during CPU
> offline in the reverse of the order in which they were invoked during CPU
> online.
> 
> This makes the hotplug path a lot more logically coherent: services are
> started in a well-known order, perhaps with dependencies between them, while
> bringing up a CPU. While offlining the CPU, we honor many of those
> dependencies automatically simply by backtracking the way we came, that is,
> by invoking the notifiers in reverse order. This model of reverse notifier
> invocation thus avoids the need to excessively juggle notifier priorities
> just to get the ordering right.
> 
> Notes:
> 1. The workqueue code becomes simpler, since it now needs just a single CPU
> hotplug callback.
> 2. The scheduler's sched_cpu_[in]active() callbacks can also be merged into
> a single callback.
> 3. Along similar lines, the cpusets' cpuset_cpu_[in]active() callbacks can also
> be merged.
> 
> Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>

For the workqueue part,

 Acked-by: Tejun Heo <tj@kernel.org>

Now that we can reverse-walk the notifier chain, it is probably a good
idea to make it a bit smarter and call CANCELs in the reverse order of
PREPAREs too.

Thanks.
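
A minimal sketch of the rollback Tejun suggests (illustrative only, not
kernel code; the callback names and structure here are made up): if the i-th
PREPARE callback fails, the CANCEL callbacks of the notifiers that have
already run are invoked in the reverse order of their PREPAREs.

#include <stdio.h>

struct hotplug_cb {
        const char *name;
        int (*prepare)(void);           /* returns non-zero on failure */
        void (*cancel)(void);
};

static int ok_prepare(void)    { return 0; }
static int fail_prepare(void)  { return -1; }
static void cancel_a(void)     { printf("cancel A\n"); }
static void cancel_b(void)     { printf("cancel B\n"); }

static int run_prepare(struct hotplug_cb *cbs, int n)
{
        int i;

        for (i = 0; i < n; i++) {
                printf("prepare %s\n", cbs[i].name);
                if (cbs[i].prepare()) {
                        /* Undo cbs[i-1] .. cbs[0], i.e. in reverse order. */
                        while (--i >= 0)
                                cbs[i].cancel();
                        return -1;
                }
        }
        return 0;
}

int main(void)
{
        struct hotplug_cb cbs[] = {
                { "A", ok_prepare,   cancel_a },
                { "B", ok_prepare,   cancel_b },
                { "C", fail_prepare, NULL },    /* fails: B is cancelled, then A */
        };

        return run_prepare(cbs, 3) ? 1 : 0;
}

Rolling back in reverse keeps the same "backtrack the way we came" property
for the failure path that the reverse-order offline notifiers give the
successful path.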

Patch

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 88de47d..b27a046 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -55,26 +55,37 @@  extern ssize_t arch_print_cpu_modalias(struct device *dev,
  */
 enum {
 	/*
-	 * SCHED_ACTIVE marks a cpu which is coming up active during CPU_ONLINE
-	 * and CPU_DOWN_FAILED and must be the first notifier.  It then passes
-	 * control to the cpuset_cpu_active() notifier which adjusts cpusets
-	 * according to cpu_active mask. During CPU_DOWN_PREPARE, SCHED_INACTIVE
-	 * marks the cpu as inactive and passes control to the
-	 * cpuset_cpu_inactive() notifier in a similar way.
+	 * sched_cpu_notify() marks a cpu which is coming up active during
+	 * CPU_ONLINE and CPU_DOWN_FAILED and must be the first notifier.  It
+	 * then passes control to the cpuset_cpu_active() notifier which adjusts
+	 * cpusets according to cpu_active mask. During CPU_DOWN_PREPARE,
+	 * sched_cpu_notify() marks the cpu as inactive and passes control to
+	 * the cpuset_cpu_inactive() notifier in a similar way.
 	 *
 	 * This ordering guarantees consistent cpu_active mask and
 	 * migration behavior to all cpu notifiers.
 	 */
-	CPU_PRI_SCHED_ACTIVE	= INT_MAX,
-	CPU_PRI_SCHED_INACTIVE	= INT_MIN,
+	CPU_PRI_SCHED		= INT_MAX,
 
-	/* migration should happen before other stuff but after perf */
+	/*
+	 * Migration should happen before other stuff but after perf,
+	 * in the CPU online path.
+	 */
 	CPU_PRI_PERF		= 20,
 	CPU_PRI_MIGRATION_UP	= 10,
-	CPU_PRI_MIGRATION_DOWN	= 9,
-	/* bring up workqueues before normal notifiers and down after */
-	CPU_PRI_WORKQUEUE_UP	= 5,
-	CPU_PRI_WORKQUEUE_DOWN	= -5,
+
+	/*
+	 * Bring up workqueues before normal notifiers in the CPU online
+	 * path. Do the opposite in the CPU offline path.
+	 */
+	CPU_PRI_WORKQUEUE	= 5,
+
+	/*
+	 * In the CPU offline path, the notifiers are invoked in the reverse
+	 * order. So use a numerically low value for the priority, to ensure
+	 * that we get invoked early.
+	 */
+	CPU_PRI_MIGRATION_DOWN	= -10,
 };
 
 #define CPU_ONLINE		0x0002 /* CPU (unsigned)v is up */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8ab33ae..30444c2 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -155,16 +155,32 @@  static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
 	return notifier_to_errno(ret);
 }
 
+static int __cpu_notify_reverse(unsigned long val, void *v, int nr_to_call,
+			int *nr_calls)
+{
+	int ret;
+
+	ret = __raw_notifier_call_chain_reverse(&cpu_chain, val, v,
+							nr_to_call, nr_calls);
+
+	return notifier_to_errno(ret);
+}
+
 static int cpu_notify(unsigned long val, void *v)
 {
 	return __cpu_notify(val, v, -1, NULL);
 }
 
+static int cpu_notify_reverse(unsigned long val, void *v)
+{
+	return __cpu_notify_reverse(val, v, -1, NULL);
+}
+
 #ifdef CONFIG_HOTPLUG_CPU
 
-static void cpu_notify_nofail(unsigned long val, void *v)
+static void cpu_notify_reverse_nofail(unsigned long val, void *v)
 {
-	BUG_ON(cpu_notify(val, v));
+	BUG_ON(cpu_notify_reverse(val, v));
 }
 EXPORT_SYMBOL(register_cpu_notifier);
 
@@ -249,7 +265,7 @@  static int __ref take_cpu_down(void *_param)
 	if (err < 0)
 		return err;
 
-	cpu_notify(CPU_DYING | param->mod, param->hcpu);
+	cpu_notify_reverse(CPU_DYING | param->mod, param->hcpu);
 	return 0;
 }
 
@@ -272,10 +288,12 @@  static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 
 	cpu_hotplug_begin();
 
-	err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
+	err = __cpu_notify_reverse(CPU_DOWN_PREPARE | mod, hcpu, -1,
+								&nr_calls);
 	if (err) {
 		nr_calls--;
-		__cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
+		__cpu_notify_reverse(CPU_DOWN_FAILED | mod, hcpu, nr_calls,
+									NULL);
 		printk("%s: attempt to take down CPU %u failed\n",
 				__func__, cpu);
 		goto out_release;
@@ -286,7 +304,7 @@  static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	if (err) {
 		/* CPU didn't die: tell everyone.  Can't complain. */
 		smpboot_unpark_threads(cpu);
-		cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
+		cpu_notify_reverse_nofail(CPU_DOWN_FAILED | mod, hcpu);
 		goto out_release;
 	}
 	BUG_ON(cpu_online(cpu));
@@ -305,14 +323,14 @@  static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	__cpu_die(cpu);
 
 	/* CPU is completely dead: tell everyone.  Too late to complain. */
-	cpu_notify_nofail(CPU_DEAD | mod, hcpu);
+	cpu_notify_reverse_nofail(CPU_DEAD | mod, hcpu);
 
 	check_for_tasks(cpu);
 
 out_release:
 	cpu_hotplug_done();
 	if (!err)
-		cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
+		cpu_notify_reverse_nofail(CPU_POST_DEAD | mod, hcpu);
 	return err;
 }
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6a511f2..9dbcb11 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5477,6 +5477,9 @@  cpu_up_migration_call(struct notifier_block *nfb, unsigned long action,
 }
 
 /*
+ * CPU online path: Notifiers are invoked in normal order; that is, notifiers
+ * with numerically higher priorities are invoked first.
+ *
  * Register at high priority so that task migration (migrate_tasks)
  * happens before everything else.  This has to be lower priority than
  * the notifier in the perf_event subsystem, though.
@@ -5525,9 +5528,11 @@  cpu_down_migration_call(struct notifier_block *nfb, unsigned long action,
 }
 
 /*
- * Register at high priority so that task migration (migrate_tasks)
- * happens before everything else.  This has to be lower priority than
- * the notifier in the perf_event subsystem, though.
+ * CPU offline path: Notifiers are invoked in reverse order; that is,
+ * notifiers with numerically lower priorities are invoked first.
+ *
+ * Register at low priority so that task migration (migrate_tasks)
+ * happens before everything else.
  */
 static struct notifier_block __cpuinitdata cpu_down_migration_notifier = {
 	.notifier_call = cpu_down_migration_call,
@@ -5539,32 +5544,22 @@  static struct notifier_block __cpuinitdata cpu_down_migration_notifier = {
  * disabled, cpuset_update_active_cpus() becomes a simple wrapper
  * around partition_sched_domains().
  */
-static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
+static int cpuset_cpu_notify(struct notifier_block *nfb, unsigned long action,
 			     void *hcpu)
 {
 	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_ONLINE:
 	case CPU_DOWN_FAILED:
-		cpuset_update_active_cpus();
-		return NOTIFY_OK;
-	default:
-		return NOTIFY_DONE;
-	}
-}
-
-static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
-			       void *hcpu)
-{
-	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_DOWN_PREPARE:
 		cpuset_update_active_cpus();
 		return NOTIFY_OK;
+
 	default:
 		return NOTIFY_DONE;
 	}
 }
 
-static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
+static int __cpuinit sched_cpu_notify(struct notifier_block *nfb,
 				      unsigned long action, void *hcpu)
 {
 	int ret;
@@ -5575,32 +5570,18 @@  static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
 		set_cpu_active((long)hcpu, true);
 		ret = NOTIFY_OK;
 		break;
-	default:
-		ret = NOTIFY_DONE;
-	}
-
-	if (likely(sched_smp_init_complete))
-		return cpuset_cpu_active(nfb, action, hcpu);
-
-	return ret;
-}
 
-static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
-					unsigned long action, void *hcpu)
-{
-	int ret;
-
-	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_DOWN_PREPARE:
 		set_cpu_active((long)hcpu, false);
 		ret = NOTIFY_OK;
 		break;
+
 	default:
 		ret = NOTIFY_DONE;
 	}
 
 	if (likely(sched_smp_init_complete))
-		return cpuset_cpu_inactive(nfb, action, hcpu);
+		return cpuset_cpu_notify(nfb, action, hcpu);
 
 	return ret;
 }
@@ -5620,8 +5601,7 @@  static int __init migration_init(void)
 	register_cpu_notifier(&cpu_down_migration_notifier);
 
 	/* Register cpu active notifiers */
-	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
-	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
+	cpu_notifier(sched_cpu_notify, CPU_PRI_SCHED);
 
 	return 0;
 }
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 471996a..5fba1da 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3429,16 +3429,24 @@  static void gcwq_unbind_fn(struct work_struct *work)
 }
 
 /*
+ * CPU online:
  * Workqueues should be brought up before normal priority CPU notifiers.
- * This will be registered high priority CPU notifier.
+ *
+ * CPU offline:
+ * Workqueues should be brought down after normal priority CPU notifiers.
+ *
+ * This will be registered as high priority CPU notifier. Then, the automatic
+ * reverse invocation of notifiers in the CPU offline path will satisfy both
+ * the above requirements.
  */
-static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
+static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 					       unsigned long action,
 					       void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
 	struct global_cwq *gcwq = get_gcwq(cpu);
 	struct worker_pool *pool;
+	struct work_struct unbind_work;
 
 	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_UP_PREPARE:
@@ -3465,22 +3473,7 @@  static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb,
 		rebind_workers(gcwq);
 		gcwq_release_management_and_unlock(gcwq);
 		break;
-	}
-	return NOTIFY_OK;
-}
 
-/*
- * Workqueues should be brought down after normal priority CPU notifiers.
- * This will be registered as low priority CPU notifier.
- */
-static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
-						 unsigned long action,
-						 void *hcpu)
-{
-	unsigned int cpu = (unsigned long)hcpu;
-	struct work_struct unbind_work;
-
-	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_DOWN_PREPARE:
 		/* unbinding should happen on the local CPU */
 		INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn);
@@ -3686,8 +3679,7 @@  static int __init init_workqueues(void)
 	unsigned int cpu;
 	int i;
 
-	cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
-	cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
+	cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE);
 
 	/* initialize gcwqs */
 	for_each_gcwq_cpu(cpu) {