@@ -617,15 +617,6 @@ csched_init_pdata(const struct scheduler *ops, void *pdata, int cpu)
{
unsigned long flags;
struct csched_private *prv = CSCHED_PRIV(ops);
- struct schedule_data *sd = &per_cpu(schedule_data, cpu);
-
- /*
- * This is called either during during boot, resume or hotplug, in
- * case Credit1 is the scheduler chosen at boot. In such cases, the
- * scheduler lock for cpu is already pointing to the default per-cpu
- * spinlock, as Credit1 needs it, so there is no remapping to be done.
- */
- ASSERT(sd->schedule_lock == &sd->_lock && !spin_is_locked(&sd->_lock));
spin_lock_irqsave(&prv->lock, flags);
init_pdata(prv, pdata, cpu);
@@ -167,17 +167,10 @@ static void init_pdata(struct null_private *prv, unsigned int cpu)
static void null_init_pdata(const struct scheduler *ops, void *pdata, int cpu)
{
struct null_private *prv = null_priv(ops);
- struct schedule_data *sd = &per_cpu(schedule_data, cpu);
/* alloc_pdata is not implemented, so we want this to be NULL. */
ASSERT(!pdata);
- /*
- * The scheduler lock points already to the default per-cpu spinlock,
- * so there is no remapping to be done.
- */
- ASSERT(sd->schedule_lock == &sd->_lock && !spin_is_locked(&sd->_lock));
-
init_pdata(prv, cpu);
}
@@ -54,6 +54,10 @@ boolean_param("sched_smt_power_savings", sched_smt_power_savings);
* */
int sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US;
integer_param("sched_ratelimit_us", sched_ratelimit_us);
+
+/* Common lock for free cpus. */
+static DEFINE_SPINLOCK(sched_free_cpu_lock);
+
/* Various timer handlers. */
static void s_timer_fn(void *unused);
static void vcpu_periodic_timer_fn(void *data);
@@ -73,6 +77,58 @@ extern const struct scheduler *__start_schedulers_array[], *__end_schedulers_arr
static struct scheduler __read_mostly ops;
+static spinlock_t *
+sched_idle_switch_sched(struct scheduler *new_ops, unsigned int cpu,
+ void *pdata, void *vdata)
+{
+ idle_vcpu[cpu]->sched_priv = NULL;
+
+ return &sched_free_cpu_lock;
+}
+
+static int
+sched_idle_cpu_pick(const struct scheduler *ops, struct vcpu *v)
+{
+ return v->processor;
+}
+
+static void *
+sched_idle_alloc_vdata(const struct scheduler *ops, struct vcpu *v,
+ void *dd)
+{
+ /* Any non-NULL pointer is fine here. */
+ return (void *)1UL;
+}
+
+static void
+sched_idle_free_vdata(const struct scheduler *ops, void *priv)
+{
+}
+
+static struct task_slice sched_idle_schedule(
+ const struct scheduler *ops, s_time_t now,
+ bool tasklet_work_scheduled)
+{
+ const unsigned int cpu = smp_processor_id();
+ struct task_slice ret = { .time = -1 };
+
+ ret.task = idle_vcpu[cpu];
+ return ret;
+}
+
+static struct scheduler sched_idle_ops = {
+ .name = "Idle Scheduler",
+ .opt_name = "idle",
+ .sched_data = NULL,
+
+ .pick_cpu = sched_idle_cpu_pick,
+ .do_schedule = sched_idle_schedule,
+
+ .alloc_vdata = sched_idle_alloc_vdata,
+ .free_vdata = sched_idle_free_vdata,
+ .switch_sched = sched_idle_switch_sched,
+};
+
static inline struct scheduler *dom_scheduler(const struct domain *d)
{
if ( likely(d->cpupool != NULL) )
@@ -1587,12 +1643,10 @@ static void poll_timer_fn(void *data)
static int cpu_schedule_up(unsigned int cpu)
{
struct schedule_data *sd = &per_cpu(schedule_data, cpu);
- void *sched_priv;
- per_cpu(scheduler, cpu) = &ops;
+ per_cpu(scheduler, cpu) = &sched_idle_ops;
spin_lock_init(&sd->_lock);
- sd->schedule_lock = &sd->_lock;
- sd->curr = idle_vcpu[cpu];
+ sd->schedule_lock = &sched_free_cpu_lock;
init_timer(&sd->s_timer, s_timer_fn, NULL, cpu);
atomic_set(&sd->urgent_count, 0);
@@ -1602,40 +1656,19 @@ static int cpu_schedule_up(unsigned int cpu)
if ( idle_vcpu[cpu] == NULL )
vcpu_create(idle_vcpu[0]->domain, cpu, cpu);
- else
- {
- struct vcpu *idle = idle_vcpu[cpu];
-
- /*
- * During (ACPI?) suspend the idle vCPU for this pCPU is not freed,
- * while its scheduler specific data (what is pointed by sched_priv)
- * is. Also, at this stage of the resume path, we attach the pCPU
- * to the default scheduler, no matter in what cpupool it was before
- * suspend. To avoid inconsistency, let's allocate default scheduler
- * data for the idle vCPU here. If the pCPU was in a different pool
- * with a different scheduler, it is schedule_cpu_switch(), invoked
- * later, that will set things up as appropriate.
- */
- ASSERT(idle->sched_priv == NULL);
- idle->sched_priv = sched_alloc_vdata(&ops, idle,
- idle->domain->sched_priv);
- if ( idle->sched_priv == NULL )
- return -ENOMEM;
- }
if ( idle_vcpu[cpu] == NULL )
return -ENOMEM;
/*
- * We don't want to risk calling xfree() on an sd->sched_priv
- * (e.g., inside free_pdata, from cpu_schedule_down() called
- * during CPU_UP_CANCELLED) that contains an IS_ERR value.
+ * No need to allocate any scheduler data, as cpus coming online are
+ * free initially and the idle scheduler doesn't need any data areas
+ * allocated.
*/
- sched_priv = sched_alloc_pdata(&ops, cpu);
- if ( IS_ERR(sched_priv) )
- return PTR_ERR(sched_priv);
- sd->sched_priv = sched_priv;
+ sd->curr = idle_vcpu[cpu];
+
+ sd->sched_priv = NULL;
return 0;
}
@@ -1643,13 +1676,6 @@ static int cpu_schedule_up(unsigned int cpu)
static void cpu_schedule_down(unsigned int cpu)
{
struct schedule_data *sd = &per_cpu(schedule_data, cpu);
- struct scheduler *sched = per_cpu(scheduler, cpu);
-
- sched_free_pdata(sched, sd->sched_priv, cpu);
- sched_free_vdata(sched, idle_vcpu[cpu]->sched_priv);
-
- idle_vcpu[cpu]->sched_priv = NULL;
- sd->sched_priv = NULL;
kill_timer(&sd->s_timer);
}
@@ -1657,14 +1683,11 @@ static void cpu_schedule_down(unsigned int cpu)
void sched_rm_cpu(unsigned int cpu)
{
int rc;
- struct schedule_data *sd = &per_cpu(schedule_data, cpu);
- struct scheduler *sched = per_cpu(scheduler, cpu);
rcu_read_lock(&domlist_read_lock);
rc = cpu_disable_scheduler(cpu);
BUG_ON(rc);
rcu_read_unlock(&domlist_read_lock);
- sched_deinit_pdata(sched, sd->sched_priv, cpu);
cpu_schedule_down(cpu);
}
@@ -1672,8 +1695,6 @@ static int cpu_schedule_callback(
struct notifier_block *nfb, unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
- struct scheduler *sched = per_cpu(scheduler, cpu);
- struct schedule_data *sd = &per_cpu(schedule_data, cpu);
int rc = 0;
/*
@@ -1681,39 +1702,25 @@ static int cpu_schedule_callback(
* allocating and initializing the per-pCPU scheduler specific data,
* as well as "registering" this pCPU to the scheduler (which may
* involve modifying some scheduler wide data structures).
- * This happens by calling the alloc_pdata and init_pdata hooks, in
- * this order. A scheduler that does not need to allocate any per-pCPU
- * data can avoid implementing alloc_pdata. init_pdata may, however, be
- * necessary/useful in this case too (e.g., it can contain the "register
- * the pCPU to the scheduler" part). alloc_pdata (if present) is called
- * during CPU_UP_PREPARE. init_pdata (if present) is called during
- * CPU_STARTING.
+ * As new pCPUs always start as "free" cpus with the minimal idle
+ * scheduler being in charge, we don't need any of that.
*
* On the other hand, at teardown, we need to reverse what has been done
- * during initialization, and then free the per-pCPU specific data. This
- * happens by calling the deinit_pdata and free_pdata hooks, in this
+ * during initialization, and then free the per-pCPU specific data. A
+ * pCPU being brought down is not forced through the "free" cpus state, so
+ * here we still need to use the appropriate hooks.
+ *
+ * This happens by calling the deinit_pdata and free_pdata hooks, in this
* order. If no per-pCPU memory was allocated, there is no need to
* provide an implementation of free_pdata. deinit_pdata may, however,
* be necessary/useful in this case too (e.g., it can undo something done
* on scheduler wide data structure during init_pdata). Both deinit_pdata
* and free_pdata are called during CPU_DEAD.
*
- * If someting goes wrong during bringup, we go to CPU_UP_CANCELLED
- * *before* having called init_pdata. In this case, as there is no
- * initialization needing undoing, only free_pdata should be called.
- * This means it is possible to call free_pdata just after alloc_pdata,
- * without a init_pdata/deinit_pdata "cycle" in between the two.
- *
- * So, in summary, the usage pattern should look either
- * - alloc_pdata-->init_pdata-->deinit_pdata-->free_pdata, or
- * - alloc_pdata-->free_pdata.
+ * If something goes wrong during bringup, we go to CPU_UP_CANCELLED.
*/
switch ( action )
{
- case CPU_STARTING:
- if ( system_state != SYS_STATE_resume )
- sched_init_pdata(sched, sd->sched_priv, cpu);
- break;
case CPU_UP_PREPARE:
if ( system_state != SYS_STATE_resume )
rc = cpu_schedule_up(cpu);
@@ -1824,9 +1831,7 @@ void __init scheduler_init(void)
idle_domain->max_vcpus = nr_cpu_ids;
if ( vcpu_create(idle_domain, 0, 0) == NULL )
BUG();
- this_cpu(schedule_data).sched_priv = sched_alloc_pdata(&ops, 0);
- BUG_ON(IS_ERR(this_cpu(schedule_data).sched_priv));
- sched_init_pdata(&ops, this_cpu(schedule_data).sched_priv, 0);
+ this_cpu(schedule_data).curr = idle_vcpu[0];
}
/*
@@ -1834,18 +1839,14 @@ void __init scheduler_init(void)
* cpupool, or subject it to the scheduler of a new cpupool.
*
* For the pCPUs that are removed from their cpupool, their scheduler becomes
- * &ops (the default scheduler, selected at boot, which also services the
- * default cpupool). However, as these pCPUs are not really part of any pool,
- * there won't be any scheduling event on them, not even from the default
- * scheduler. Basically, they will just sit idle until they are explicitly
- * added back to a cpupool.
+ * &sched_idle_ops (the idle scheduler).
*/
int schedule_cpu_switch(unsigned int cpu, struct cpupool *c)
{
struct vcpu *idle;
void *ppriv, *ppriv_old, *vpriv, *vpriv_old;
struct scheduler *old_ops = per_cpu(scheduler, cpu);
- struct scheduler *new_ops = (c == NULL) ? &ops : c->sched;
+ struct scheduler *new_ops = (c == NULL) ? &sched_idle_ops : c->sched;
struct cpupool *old_pool = per_cpu(cpupool, cpu);
struct schedule_data *sd = &per_cpu(schedule_data, cpu);
spinlock_t *old_lock, *new_lock;
@@ -1865,9 +1866,6 @@ int schedule_cpu_switch(unsigned int cpu, struct cpupool *c)
ASSERT((c == NULL && !cpumask_test_cpu(cpu, old_pool->cpu_valid)) ||
(c != NULL && !cpumask_test_cpu(cpu, c->cpu_valid)));
- if ( old_ops == new_ops )
- goto out;
-
/*
* To setup the cpu for the new scheduler we need:
* - a valid instance of per-CPU scheduler specific data, as it is
@@ -1931,7 +1929,6 @@ int schedule_cpu_switch(unsigned int cpu, struct cpupool *c)
sched_free_vdata(old_ops, vpriv_old);
sched_free_pdata(old_ops, ppriv_old, cpu);
- out:
per_cpu(cpupool, cpu) = c;
/* When a cpu is added to a pool, trigger it to go pick up some work */
if ( c != NULL )
Instead of having a full blown scheduler running for the free cpus add a very minimalistic scheduler for that purpose only ever scheduling the related idle vcpu. This has the big advantage of not needing any per-cpu, per-domain or per-scheduling unit data for free cpus and in turn simplifying moving cpus to and from cpupools a lot. This new scheduler will just use a common lock for all free cpus. As this new scheduler is not user selectable don't register it as an official scheduler, but just include it in schedule.c. Signed-off-by: Juergen Gross <jgross@suse.com> --- xen/common/sched_credit.c | 9 --- xen/common/sched_null.c | 7 --- xen/common/schedule.c | 153 +++++++++++++++++++++++----------------------- 3 files changed, 75 insertions(+), 94 deletions(-)