@@ -55,20 +55,18 @@ extern ssize_t arch_print_cpu_modalias(struct device *dev,
*/
enum {
/*
- * SCHED_ACTIVE marks a cpu which is coming up active during
- * CPU_ONLINE and CPU_DOWN_FAILED and must be the first
- * notifier. CPUSET_ACTIVE adjusts cpuset according to
- * cpu_active mask right after SCHED_ACTIVE. During
- * CPU_DOWN_PREPARE, SCHED_INACTIVE and CPUSET_INACTIVE are
- * ordered in the similar way.
+ * SCHED_ACTIVE marks a cpu which is coming up active during CPU_ONLINE
+ * and CPU_DOWN_FAILED and must be the first notifier. It then passes
+ * control to the cpuset_cpu_active() notifier which adjusts cpusets
+ * according to cpu_active mask. During CPU_DOWN_PREPARE, SCHED_INACTIVE
+ * marks the cpu as inactive and passes control to the
+ * cpuset_cpu_inactive() notifier in a similar way.
*
* This ordering guarantees consistent cpu_active mask and
* migration behavior to all cpu notifiers.
*/
CPU_PRI_SCHED_ACTIVE = INT_MAX,
- CPU_PRI_CPUSET_ACTIVE = INT_MAX - 1,
- CPU_PRI_SCHED_INACTIVE = INT_MIN + 1,
- CPU_PRI_CPUSET_INACTIVE = INT_MIN,
+ CPU_PRI_SCHED_INACTIVE = INT_MIN,
/* migration should happen before other stuff but after perf */
CPU_PRI_PERF = 20,
@@ -280,6 +280,7 @@ const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
unsigned int sysctl_sched_rt_period = 1000000;
__read_mostly int scheduler_running;
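+/*
+ * Set at the end of sched_init_smp(). Until then, CPU hotplug events
+ * are not forwarded to the cpuset callbacks (see sched_cpu_[in]active()).
+ */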
+static bool __read_mostly sched_smp_init_complete;
/*
* part of the period that we allow rt tasks to run in us.
@@ -5505,29 +5506,75 @@ static struct notifier_block __cpuinitdata migration_notifier = {
.priority = CPU_PRI_MIGRATION,
};
+/*
+ * Update cpusets according to cpu_active mask. If cpusets are
+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper
+ * around partition_sched_domains().
+ */
+static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
+ void *hcpu)
+{
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_ONLINE:
+ case CPU_DOWN_FAILED:
+ cpuset_update_active_cpus();
+ return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
+static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
+ void *hcpu)
+{
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DOWN_PREPARE:
+ cpuset_update_active_cpus();
+ return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
+ int ret;
+
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_STARTING:
case CPU_DOWN_FAILED:
set_cpu_active((long)hcpu, true);
- return NOTIFY_OK;
+ ret = NOTIFY_OK;
+ break;
default:
- return NOTIFY_DONE;
+ ret = NOTIFY_DONE;
}
+
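+	/*
+	 * Once SMP init is complete, hand the event on to cpuset as
+	 * well, so that the "sched first, then cpuset" ordering holds
+	 * during both CPU online and CPU offline.
+	 */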
+ if (likely(sched_smp_init_complete))
+ return cpuset_cpu_active(nfb, action, hcpu);
+
+ return ret;
}
static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
+ int ret;
+
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_DOWN_PREPARE:
set_cpu_active((long)hcpu, false);
- return NOTIFY_OK;
+ ret = NOTIFY_OK;
+ break;
default:
- return NOTIFY_DONE;
+ ret = NOTIFY_DONE;
}
+
+ if (likely(sched_smp_init_complete))
+ return cpuset_cpu_inactive(nfb, action, hcpu);
+
+ return ret;
}
static int __init migration_init(void)
@@ -6967,36 +7014,6 @@ match2:
mutex_unlock(&sched_domains_mutex);
}
-/*
- * Update cpusets according to cpu_active mask. If cpusets are
- * disabled, cpuset_update_active_cpus() becomes a simple wrapper
- * around partition_sched_domains().
- */
-static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
- void *hcpu)
-{
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_ONLINE:
- case CPU_DOWN_FAILED:
- cpuset_update_active_cpus();
- return NOTIFY_OK;
- default:
- return NOTIFY_DONE;
- }
-}
-
-static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
- void *hcpu)
-{
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_DOWN_PREPARE:
- cpuset_update_active_cpus();
- return NOTIFY_OK;
- default:
- return NOTIFY_DONE;
- }
-}
-
void __init sched_init_smp(void)
{
cpumask_var_t non_isolated_cpus;
@@ -7015,9 +7032,6 @@ void __init sched_init_smp(void)
mutex_unlock(&sched_domains_mutex);
put_online_cpus();
- hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
- hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
-
/* RT runtime code needs to handle some hotplug events */
hotcpu_notifier(update_runtime, 0);
@@ -7030,6 +7044,7 @@ void __init sched_init_smp(void)
free_cpumask_var(non_isolated_cpus);
init_sched_rt_class();
+ sched_smp_init_complete = true;
}
#else
void __init sched_init_smp(void)

Some of the CPU hotplug callbacks of the scheduler and cpuset
infrastructure are intertwined in an interesting way. The scheduler's
sched_cpu_[in]active() callbacks and cpuset's cpuset_cpu_[in]active()
callbacks have the following documented dependency: the
sched_cpu_active() callback must be the first callback to run, and
should be immediately followed by cpuset_cpu_active() to update the
cpusets and the sched domains. This ordering (sched followed by
cpuset) needs to be honored in both the CPU online *and* the CPU
offline paths. Hence it's not straightforward to convert these
callbacks to the reverse invocation model, because a plain conversion
would result in the problem explained below.

In general, if 2 notifiers A and B expect to *always* be called in the
order A followed by B, i.e., during both CPU online and CPU offline,
then we can't ensure that easily, because with reverse invocation we
get the following call path:

   Event        |  Invocation order
   -------------|--------------------------------------
   CPU online:  |  A (high priority), B (low priority)
   CPU offline: |  B (low priority), A (high priority)

So this breaks the requirement for A and B. We see exactly this
ordering requirement in the case of the scheduler and cpusets.

So, to solve this, club the 2 callbacks together as a unit, so that
they are always invoked as a unit; that way, forward or reverse, the
requirement is satisfied. In this case, since the 2 callbacks are
quite closely related, clubbing them together doesn't hurt the
semantics or the readability, which is a good thing!

There is one more aspect that we need to take care of while clubbing
the two callbacks. During boot, the scheduler is initialized in two
phases: sched_init(), which happens before SMP initialization (and
hence *before* the non-boot CPUs are booted up), and sched_init_smp(),
which happens after SMP initialization (and hence *after* the non-boot
CPUs are booted). In the original code, the cpuset callbacks are
registered during sched_init_smp(), which means that while starting
the non-boot CPUs, only the scheduler callbacks are invoked, not the
cpuset ones. So, in order to keep this behavior intact even after
clubbing the 2 callbacks, we need a way to find out whether we are
running post-SMP-init code or pre-SMP early boot code, to decide
whether or not to pass control on to the cpuset callback.

So introduce a flag 'sched_smp_init_complete' that gets set after the
scheduler is initialized for SMP. This helps us make that decision.

Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
---

 include/linux/cpu.h |   16 ++++-----
 kernel/sched/core.c |   89 ++++++++++++++++++++++++++++++---------------------
 2 files changed, 59 insertions(+), 46 deletions(-)
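
For illustration, here is a minimal, self-contained sketch of the
"clubbed callback" pattern that this patch applies. The names foo_cb(),
bar_cb() and foo_ready are hypothetical stand-ins, not part of the
patch; in the patch itself they correspond to sched_cpu_[in]active(),
cpuset_cpu_[in]active() and sched_smp_init_complete, respectively.

#include <linux/cache.h>
#include <linux/cpu.h>
#include <linux/notifier.h>

/* Set once late (post-SMP) initialization is done. */
static bool __read_mostly foo_ready;

/* B: the lower-priority work that must always run right after A. */
static int bar_cb(struct notifier_block *nfb, unsigned long action,
		  void *hcpu)
{
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:
	case CPU_DOWN_FAILED:
		/* ... B's work ... */
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}

/*
 * A: the only callback actually registered on the notifier chain.
 * It does its own work and then hands the same event to B, so the
 * "A followed by B" order holds no matter whether the chain is
 * walked in forward or in reverse order.
 */
static int foo_cb(struct notifier_block *nfb, unsigned long action,
		  void *hcpu)
{
	int ret;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:
	case CPU_DOWN_FAILED:
		/* ... A's work ... */
		ret = NOTIFY_OK;
		break;
	default:
		ret = NOTIFY_DONE;
	}

	/* Before late init is complete, B is deliberately left out. */
	if (likely(foo_ready))
		return bar_cb(nfb, action, hcpu);

	return ret;
}

Registering only foo_cb() (at A's priority) is what makes the pair a
single unit from the notifier chain's point of view.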