
patch for virtual machine oriented scheduling (3)

Message ID 820ac2e90904220755u697ed443t60fb6dd36116bad3@mail.gmail.com (mailing list archive)
State New, archived

Commit Message

alex April 22, 2009, 2:55 p.m. UTC
the code for credit scheduler
-----------------------------------------------------------------------------------------------------------------------
+#endif /* !COMPAT */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */

Comments

Gleb Natapov April 23, 2009, 6:15 a.m. UTC | #1
On Wed, Apr 22, 2009 at 10:55:24PM +0800, alex wrote:
> the code for credit scheduler
> -----------------------------------------------------------------------------------------------------------------------
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 4d76bb6..9e88ff0 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -284,7 +284,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu,
> struct kvm_lapic *source,
>                   "dest_mode 0x%x, short_hand 0x%x\n",
>                   target, source, dest, dest_mode, short_hand);
> 
> -       ASSERT(!target);
> +       ASSERT(target);
Did you mean it?

--
			Gleb.
alex April 23, 2009, 11:34 a.m. UTC | #2
On Thu, Apr 23, 2009 at 2:15 PM, Gleb Natapov <gleb@redhat.com> wrote:
> On Wed, Apr 22, 2009 at 10:55:24PM +0800, alex wrote:
>> the code for credit scheduler
>> -----------------------------------------------------------------------------------------------------------------------
>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
>> index 4d76bb6..9e88ff0 100644
>> --- a/arch/x86/kvm/lapic.c
>> +++ b/arch/x86/kvm/lapic.c
>> @@ -284,7 +284,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu,
>> struct kvm_lapic *source,
>>                   "dest_mode 0x%x, short_hand 0x%x\n",
>>                   target, source, dest, dest_mode, short_hand);
>>
>> -       ASSERT(!target);
>> +       ASSERT(target);
> Did you mean it?

Yes.
If target is not NULL, !target evaluates to 0, so ASSERT(!target) fails.

From the context (and from the runtime output), it is easy to see that
target should not be NULL here.

Previously this did not cause any problem because DEBUG is not defined.
If you define DEBUG at the head of lapic.c and run KVM, you will hit
this bug.
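
A minimal user-space sketch of the behaviour described above (assuming the
usual DEBUG-only ASSERT definition; simplified, not the kernel code):

    #include <stdio.h>

    /* simplified stand-in for the DEBUG-only ASSERT in lapic.c */
    #define ASSERT(x)                                              \
    do {                                                           \
            if (!(x))                                              \
                    printf("assertion failed: %s\n", #x);          \
    } while (0)

    int main(void)
    {
            int apic_regs = 0;
            void *target = &apic_regs;  /* a valid (non-NULL) target */

            ASSERT(!target);  /* old check: fails for every valid target */
            ASSERT(target);   /* fixed check: fails only if target is NULL */
            return 0;
    }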
alex April 23, 2009, 11:39 a.m. UTC | #3
I am reorganizing my code so that it becomes a standalone module, requiring
only a few lines of changes to the standard KVM. That way I can concentrate
on features like co-scheduling.

Please wait.

Regards,
alex.

On Thu, Apr 23, 2009 at 2:15 PM, Gleb Natapov <gleb@redhat.com> wrote:
> On Wed, Apr 22, 2009 at 10:55:24PM +0800, alex wrote:
>> the code for credit scheduler
>> -----------------------------------------------------------------------------------------------------------------------
>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
>> index 4d76bb6..9e88ff0 100644
>> --- a/arch/x86/kvm/lapic.c
>> +++ b/arch/x86/kvm/lapic.c
>> @@ -284,7 +284,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu,
>> struct kvm_lapic *source,
>>                   "dest_mode 0x%x, short_hand 0x%x\n",
>>                   target, source, dest, dest_mode, short_hand);
>>
>> -       ASSERT(!target);
>> +       ASSERT(target);
> Did you mean it?
>
> --
>                        Gleb.
>
Gleb Natapov April 23, 2009, 11:39 a.m. UTC | #4
On Thu, Apr 23, 2009 at 07:34:46PM +0800, alex wrote:
> On Thu, Apr 23, 2009 at 2:15 PM, Gleb Natapov <gleb@redhat.com> wrote:
> > On Wed, Apr 22, 2009 at 10:55:24PM +0800, alex wrote:
> >> the code for credit scheduler
> >> -----------------------------------------------------------------------------------------------------------------------
> >> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> >> index 4d76bb6..9e88ff0 100644
> >> --- a/arch/x86/kvm/lapic.c
> >> +++ b/arch/x86/kvm/lapic.c
> >> @@ -284,7 +284,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu,
> >> struct kvm_lapic *source,
> >>                   "dest_mode 0x%x, short_hand 0x%x\n",
> >>                   target, source, dest, dest_mode, short_hand);
> >>
> >> -       ASSERT(!target);
> >> +       ASSERT(target);
> > Did you mean it?
> 
> Yes.
Then send it as separate patch please.

> If target is not NULL, !target evaluates to 0, so ASSERT(!target) fails.
> 
> From the context (and from the runtime output), it is easy to see that
> target should not be NULL here.
> 
> Previously this did not cause any problem because DEBUG is not defined.
> If you define DEBUG at the head of lapic.c and run KVM, you will hit
> this bug.

--
			Gleb.

Patch

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 4d76bb6..9e88ff0 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -284,7 +284,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
                  "dest_mode 0x%x, short_hand 0x%x\n",
                  target, source, dest, dest_mode, short_hand);

-       ASSERT(!target);
+       ASSERT(target);
       switch (short_hand) {
       case APIC_DEST_NOSHORT:
               if (dest_mode == 0)
diff --git a/arch/x86/kvm/sched_credit.c b/arch/x86/kvm/sched_credit.c
new file mode 100644
index 0000000..77c726a
--- /dev/null
+++ b/arch/x86/kvm/sched_credit.c
@@ -0,0 +1,1604 @@ 
+/****************************************************************************
+ * (C) 2005-2006 - Emmanuel Ackaouy - XenSource Inc.
+ * (C) 2009     - Xiaojian Liu
+ ****************************************************************************
+ *
+ *        File: common/csched_credit.c
+ *      Author: Emmanuel Ackaouy
+ *     Updated by Xiaojian Liu to make it runnable under KVM
+ *
+ * Description: Credit-based SMP CPU scheduler
+ */
+#include <linux/cpumask.h>
+#include <linux/kvm_host.h>
+#include <linux/trace.h>
+#include <linux/sched-if.h>
+#include <linux/ipi.h>
+#include <linux/hardirq.h>
+
+#ifdef DEBUG
+#define ASSERT(x)                                                      \
+do {                                                                   \
+       if (!(x)) {                                                     \
+               printk(KERN_EMERG "assertion failed %s: %d: %s\n",      \
+                      __FILE__, __LINE__, #x);                         \
+               BUG();                                                  \
+       }                                                               \
+} while (0)
+#else
+#define ASSERT(x) do { } while (0)
+#endif
+/*
+ * CSCHED_STATS
+ *
+ * Manage very basic counters and stats.
+ *
+ * Useful for debugging live systems. The stats are displayed
+ * with runq dumps ('r' on the Xen console).
+ */
+#define CSCHED_STATS
+
+#define MILLISECS(_ms)  ((s_time_t)((_ms) * 1000000ULL))
+
+/*
+ * Basic constants
+ */
+#define CSCHED_DEFAULT_WEIGHT       256
+#define CSCHED_TICKS_PER_TSLICE     3
+#define CSCHED_TICKS_PER_ACCT       3
+#define CSCHED_MSECS_PER_TICK       10
+#define CSCHED_MSECS_PER_TSLICE     \
+    (CSCHED_MSECS_PER_TICK * CSCHED_TICKS_PER_TSLICE)
+#define CSCHED_CREDITS_PER_TICK     100
+#define CSCHED_CREDITS_PER_TSLICE   \
+    (CSCHED_CREDITS_PER_TICK * CSCHED_TICKS_PER_TSLICE)
+#define CSCHED_CREDITS_PER_ACCT     \
+    (CSCHED_CREDITS_PER_TICK * CSCHED_TICKS_PER_ACCT)
+
+
+/*
+ * Priorities
+ */
+#define CSCHED_PRI_TS_BOOST      0      /* time-share waking up */
+#define CSCHED_PRI_TS_UNDER     -1      /* time-share w/ credits */
+#define CSCHED_PRI_TS_OVER      -2      /* time-share w/o credits */
+#define CSCHED_PRI_IDLE         -64     /* idle */
+
+
+/*
+ * Flags
+ */
+#define CSCHED_FLAG_VCPU_PARKED 0x0001  /* VCPU over capped credits */
+
+
+/*
+ * Useful macros
+ */
+#define CSCHED_PCPU(_c)     \
+    ((struct csched_pcpu *)per_cpu(schedule_data, _c).sched_priv)
+#define CSCHED_VCPU(_vcpu)  ((struct csched_vcpu *) (_vcpu)->sched_priv)
+#define CSCHED_VM(_dom)    ((struct csched_vm *) (_dom)->sched_priv)
+#define RUNQ(_cpu)          (&(CSCHED_PCPU(_cpu)->runq))
+
+
+/*
+ * Stats
+ */
+#ifdef CSCHED_STATS
+
+#define CSCHED_STAT(_X)         (csched_priv.stats._X)
+#define CSCHED_STAT_DEFINE(_X)  uint32_t _X;
+#define CSCHED_STAT_PRINTK(_X)                                  \
+    do                                                          \
+    {                                                           \
+        printk("\t%-30s = %u\n", #_X, CSCHED_STAT(_X));  \
+    } while ( 0 );
+
+/*
+ * Try and keep often cranked stats on top so they'll fit on one
+ * cache line.
+ */
+#define CSCHED_STATS_EXPAND_SCHED(_MACRO)   \
+    _MACRO(schedule)                        \
+    _MACRO(acct_run)                        \
+    _MACRO(acct_no_work)                    \
+    _MACRO(acct_balance)                    \
+    _MACRO(acct_reorder)                    \
+    _MACRO(acct_min_credit)                 \
+    _MACRO(acct_vcpu_active)                \
+    _MACRO(acct_vcpu_idle)                  \
+    _MACRO(vcpu_sleep)                      \
+    _MACRO(vcpu_wake_running)               \
+    _MACRO(vcpu_wake_onrunq)                \
+    _MACRO(vcpu_wake_runnable)              \
+    _MACRO(vcpu_wake_not_runnable)          \
+    _MACRO(vcpu_park)                       \
+    _MACRO(vcpu_unpark)                     \
+    _MACRO(tickle_local_idler)              \
+    _MACRO(tickle_local_over)               \
+    _MACRO(tickle_local_under)              \
+    _MACRO(tickle_local_other)              \
+    _MACRO(tickle_idlers_none)              \
+    _MACRO(tickle_idlers_some)              \
+    _MACRO(load_balance_idle)               \
+    _MACRO(load_balance_over)               \
+    _MACRO(load_balance_other)              \
+    _MACRO(steal_trylock_failed)            \
+    _MACRO(steal_peer_idle)                 \
+    _MACRO(migrate_queued)                  \
+    _MACRO(migrate_running)                 \
+    _MACRO(dom_init)                        \
+    _MACRO(dom_destroy)                     \
+    _MACRO(vcpu_init)                       \
+    _MACRO(vcpu_destroy)
+
+#ifndef NDEBUG
+#define CSCHED_STATS_EXPAND_CHECKS(_MACRO)  \
+    _MACRO(vcpu_check)
+#else
+#define CSCHED_STATS_EXPAND_CHECKS(_MACRO)
+#endif
+
+#define CSCHED_STATS_EXPAND(_MACRO)         \
+    CSCHED_STATS_EXPAND_CHECKS(_MACRO)      \
+    CSCHED_STATS_EXPAND_SCHED(_MACRO)
+
+#define CSCHED_STATS_RESET()                                        \
+    do                                                              \
+    {                                                               \
+        memset(&csched_priv.stats, 0, sizeof(csched_priv.stats));   \
+    } while ( 0 )
+
+#define CSCHED_STATS_DEFINE()                   \
+    struct                                      \
+    {                                           \
+        CSCHED_STATS_EXPAND(CSCHED_STAT_DEFINE) \
+    } stats;
+
+#define CSCHED_STATS_PRINTK()                   \
+    do                                          \
+    {                                           \
+        printk("stats:\n");                     \
+        CSCHED_STATS_EXPAND(CSCHED_STAT_PRINTK) \
+    } while ( 0 )
+
+#define CSCHED_STAT_CRANK(_X)               (CSCHED_STAT(_X)++)
+
+#define CSCHED_VCPU_STATS_RESET(_V)                     \
+    do                                                  \
+    {                                                   \
+        memset(&(_V)->stats, 0, sizeof((_V)->stats));   \
+    } while ( 0 )
+
+#define CSCHED_VCPU_STAT_CRANK(_V, _X)      (((_V)->stats._X)++)
+
+#define CSCHED_VCPU_STAT_SET(_V, _X, _Y)    (((_V)->stats._X) = (_Y))
+
+#else /* CSCHED_STATS */
+
+#define CSCHED_STATS_RESET()                do {} while ( 0 )
+#define CSCHED_STATS_DEFINE()
+#define CSCHED_STATS_PRINTK()               do {} while ( 0 )
+#define CSCHED_STAT_CRANK(_X)               do {} while ( 0 )
+#define CSCHED_VCPU_STATS_RESET(_V)         do {} while ( 0 )
+#define CSCHED_VCPU_STAT_CRANK(_V, _X)      do {} while ( 0 )
+#define CSCHED_VCPU_STAT_SET(_V, _X, _Y)    do {} while ( 0 )
+
+#endif /* CSCHED_STATS */
+
+
+/*
+ * Physical CPU
+ */
+struct csched_pcpu {
+    struct list_head runq;
+    uint32_t runq_sort_last;
+    struct hrtimer ticker;
+    unsigned int tick;
+    int cpu;
+    volatile bool in_use;
+};
+
+/*
+ * Virtual CPU
+ */
+struct csched_vcpu {
+    struct list_head runq_elem;
+    struct list_head active_vcpu_elem;
+    struct csched_vm *sdom;
+    struct kvm_vcpu *vcpu;
+    atomic_t credit;
+    uint16_t flags;
+    int16_t pri;
+#ifdef CSCHED_STATS
+    struct {
+        int credit_last;
+        uint32_t credit_incr;
+        uint32_t state_active;
+        uint32_t state_idle;
+        uint32_t migrate_q;
+        uint32_t migrate_r;
+    } stats;
+#endif
+};
+
+/*
+ * Domain
+ */
+struct csched_vm{
+    struct list_head active_vcpu;
+    struct list_head active_sdom_elem;
+    struct kvm *vm;
+    uint16_t active_vcpu_count;
+    uint16_t weight;
+    uint16_t cap;
+};
+
+/*
+ * System-wide private data
+ */
+struct csched_private {
+    spinlock_t lock;
+    struct list_head active_sdom;
+    uint32_t ncpus;
+    unsigned int master;
+    cpumask_t idlers;
+    uint32_t weight;
+    uint32_t credit;
+    int credit_balance;
+    uint32_t runq_sort;
+    CSCHED_STATS_DEFINE()
+};
+
+/*
+ * Global variables
+ */
+static struct csched_private csched_priv;
+
+static enum hrtimer_restart  csched_tick_stub(struct hrtimer *data);
+DECLARE_PER_CPU(cpumask_t, cpu_core_map);
+DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
+
+extern void vcpu_pause(struct kvm_vcpu *v);
+extern void vcpu_unpause(struct kvm_vcpu *v);
+extern void vcpu_pause_nosync(struct kvm_vcpu *v);
+
+static void csched_disable(int cpu)
+{
+    struct csched_pcpu *spc = CSCHED_PCPU(cpu);
+    tasklet_kill(&per_cpu(schedule_data, cpu).tick_tasklet);
+}
+void kvm_try_tasklet_schedule(void *_unused)
+{
+    int cpu = get_cpu();
+    TRACE_2D(TRC_TASKLET_SCHEDULE, cpu, __LINE__);
+    if (!spin_is_locked(&per_cpu(schedule_data, cpu).schedule_lock)){
+       tasklet_schedule(&per_cpu(schedule_data, cpu).sched_tasklet);
+       TRACE_2D(TRC_TASKLET_SCHEDULE, cpu, __LINE__);
+    }
+    put_cpu();
+}
+static inline int
+__cycle_cpu(int cpu, const cpumask_t *mask)
+{
+    int nxt = next_cpu(cpu, *mask);
+    if (nxt == NR_CPUS)
+        nxt = first_cpu(*mask);
+    return nxt;
+}
+
+static inline int
+__vcpu_on_runq(struct csched_vcpu *svc)
+{
+    return !list_empty(&svc->runq_elem);
+}
+
+static inline struct csched_vcpu *
+__runq_elem(struct list_head *elem)
+{
+    return list_entry(elem, struct csched_vcpu, runq_elem);
+}
+
+static inline void
+__runq_insert(unsigned int cpu, struct csched_vcpu *svc)
+{
+    const struct list_head * const runq = RUNQ(cpu);
+    struct list_head *iter;
+
+    BUG_ON( __vcpu_on_runq(svc) );
+    BUG_ON( cpu != svc->vcpu->processor);
+
+    list_for_each( iter, runq )
+    {
+        const struct csched_vcpu * const iter_svc = __runq_elem(iter);
+        if ( svc->pri > iter_svc->pri )
+            break;
+    }
+
+    list_add_tail(&svc->runq_elem, iter);
+}
+
+static inline void
+__runq_remove(struct csched_vcpu *svc)
+{
+    BUG_ON( !__vcpu_on_runq(svc) );
+    list_del_init(&svc->runq_elem);
+}
+
+static inline void
+__runq_tickle(unsigned int cpu, struct csched_vcpu *new)
+{
+    struct csched_vcpu * const cur =
+        CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
+    cpumask_var_t cpus;
+
+    int my_cpu = get_cpu();
+
+    ASSERT(cur);
+    if(alloc_cpumask_var(&cpus, GFP_ATOMIC))
+       cpumask_clear(cpus);
+
+    /* If strictly higher priority than current VCPU, signal the CPU */
+    if ( new->pri > cur->pri )
+    {
+        if ( cur->pri == CSCHED_PRI_IDLE )
+            CSCHED_STAT_CRANK(tickle_local_idler);
+        else if ( cur->pri == CSCHED_PRI_TS_OVER )
+            CSCHED_STAT_CRANK(tickle_local_over);
+        else if ( cur->pri == CSCHED_PRI_TS_UNDER )
+            CSCHED_STAT_CRANK(tickle_local_under);
+        else
+            CSCHED_STAT_CRANK(tickle_local_other);
+
+       cpumask_set_cpu(cpu, cpus);
+    }
+
+    /*
+     * If this CPU has at least two runnable VCPUs, we tickle any idlers to
+     * let them know there is runnable work in the system...
+     */
+    if ( cur->pri > CSCHED_PRI_IDLE )
+    {
+        if ( cpumask_empty(&csched_priv.idlers) )
+        {
+            CSCHED_STAT_CRANK(tickle_idlers_none);
+        }
+        else
+        {
+            CSCHED_STAT_CRANK(tickle_idlers_some);
+            cpumask_or(cpus, cpus, &csched_priv.idlers);
+            cpumask_and(cpus, cpus, &new->vcpu->cpu_affinity);
+        }
+    }
+
+    /* Send scheduler interrupts to designated CPUs */
+    if (cpu_isset(my_cpu, *cpus)){
+       kvm_try_tasklet_schedule(NULL);
+       cpu_clear(my_cpu, cpus);
+    }
+
+    if ( !cpumask_empty(cpus) ){
+       int r;
+       r = insert_pending_ipi(my_cpu, *cpus, kvm_try_tasklet_schedule, NULL, 1);
+       if (r) {
+           dump_traces(NULL);
+           BUG_ON(1);
+       }
+       wake_up(&per_cpu(schedule_data, my_cpu).ipi_wq);
+    }
+    free_cpumask_var(cpus);
+    put_cpu();
+}
+
+static int
+csched_pcpu_init(int cpu)
+{
+       struct csched_pcpu *spc;
+       unsigned long flags;
+       printk("cpu %d exec func %s\n", cpu, __FUNCTION__);
+
+       /* Allocate per-PCPU info */
+       spc = kmalloc(sizeof(struct csched_pcpu), GFP_KERNEL);
+       if ( spc == NULL )
+           return -1;
+
+       spin_lock_irqsave(&csched_priv.lock, flags);
+
+       /* Initialize/update system-wide config */
+       csched_priv.credit += CSCHED_CREDITS_PER_ACCT;
+       if ( csched_priv.ncpus <= cpu )
+           csched_priv.ncpus = cpu + 1;
+       if ( csched_priv.master >= csched_priv.ncpus )
+           csched_priv.master = cpu;
+
+       spc->cpu = cpu;
+       hrtimer_init(&spc->ticker, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+       spc->ticker.function = csched_tick_stub;
+       // hrtimer_data_pointer(&spc->ticker);
+
+       INIT_LIST_HEAD(&spc->runq);
+       spc->runq_sort_last = csched_priv.runq_sort;
+       per_cpu(schedule_data, cpu).sched_priv = spc;
+
+       spc->in_use = false;
+
+       /* Start off idling... */
+       BUG_ON(!is_idle_vcpu(per_cpu(schedule_data, cpu).curr));
+       cpumask_set_cpu(cpu, &csched_priv.idlers);
+
+       spin_unlock_irqrestore(&csched_priv.lock, flags);
+       return 0;
+}
+
+#ifndef NDEBUG
+static inline void
+__csched_vcpu_check(struct kvm_vcpu *vc)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+    struct csched_vm * const sdom = svc->sdom;
+
+    BUG_ON( svc->vcpu != vc );
+    BUG_ON( sdom != CSCHED_VM(vc->kvm) );
+    if ( sdom )
+    {
+        BUG_ON( is_idle_vcpu(vc) );
+        BUG_ON( sdom->vm != vc->kvm);
+    }
+    else
+    {
+        BUG_ON( !is_idle_vcpu(vc) );
+    }
+
+    CSCHED_STAT_CRANK(vcpu_check);
+}
+#define CSCHED_VCPU_CHECK(_vc)  (__csched_vcpu_check(_vc))
+#else
+#define CSCHED_VCPU_CHECK(_vc)
+#endif
+
+static inline int
+__csched_vcpu_is_migrateable(struct kvm_vcpu *vc, int dest_cpu)
+{
+    /*
+     * Don't pick up work that's in the peer's scheduling tail. Also only pick
+     * up work that's allowed to run on our CPU.
+     */
+
+    if(!per_cpu(schedule_data, raw_smp_processor_id()).can_migrate) return 0;
+    return !vc->is_running && cpu_isset(dest_cpu, vc->cpu_affinity);
+}
+
+static int
+csched_cpu_pick(struct kvm_vcpu *vc)
+{
+    cpumask_t cpus;
+    cpumask_t idlers;
+    int cpu;
+
+    /*
+     * Pick from online CPUs in VCPU's affinity mask, giving a
+     * preference to its current processor if it's in there.
+     */
+    cpus_and(cpus, cpu_online_map, vc->cpu_affinity);
+    cpu = cpu_isset(vc->processor, cpus)
+            ? vc->processor
+            : __cycle_cpu(vc->processor, &cpus);
+    ASSERT( !cpus_empty(cpus) && cpu_isset(cpu, cpus) );
+
+    /*
+     * Try to find an idle processor within the above constraints.
+     *
+     * In multi-core and multi-threaded CPUs, not all idle execution
+     * vehicles are equal!
+     *
+     * We give preference to the idle execution vehicle with the most
+     * idling neighbours in its grouping. This distributes work across
+     * distinct cores first and guarantees we don't do something stupid
+     * like run two VCPUs on co-hyperthreads while there are idle cores
+     * or sockets.
+     */
+    idlers = csched_priv.idlers;
+    cpu_set(cpu, idlers);
+    cpus_and(cpus, cpus, idlers);
+    cpu_clear(cpu, cpus);
+
+    while ( !cpus_empty(cpus) )
+    {
+        cpumask_t cpu_idlers;
+        cpumask_t nxt_idlers;
+        int nxt;
+
+        nxt = __cycle_cpu(cpu, &cpus);
+
+        if ( cpu_isset(cpu, per_cpu(cpu_core_map,nxt)) )
+        {
+            ASSERT( cpu_isset(nxt, per_cpu(cpu_core_map,cpu)) );
+            cpus_and(cpu_idlers, idlers, per_cpu(cpu_sibling_map,cpu));
+            cpus_and(nxt_idlers, idlers, per_cpu(cpu_sibling_map,nxt));
+        }
+        else
+        {
+            ASSERT( !cpu_isset(nxt, per_cpu(cpu_core_map,cpu)) );
+            cpus_and(cpu_idlers, idlers, per_cpu(cpu_core_map,cpu));
+            cpus_and(nxt_idlers, idlers, per_cpu(cpu_core_map,nxt));
+        }
+
+        if ( cpus_weight(cpu_idlers) < cpus_weight(nxt_idlers) )
+        {
+            cpu = nxt;
+            cpu_clear(cpu, cpus);
+        }
+        else
+        {
+            cpus_andnot(cpus, cpus, nxt_idlers);
+        }
+    }
+
+    return cpu;
+}
+
+static inline void
+__csched_vcpu_acct_start(struct csched_vcpu *svc)
+{
+    struct csched_vm * const sdom = svc->sdom;
+    unsigned long flags;
+
+    spin_lock_irqsave(&csched_priv.lock, flags);
+
+    if ( list_empty(&svc->active_vcpu_elem) )
+    {
+        CSCHED_VCPU_STAT_CRANK(svc, state_active);
+        CSCHED_STAT_CRANK(acct_vcpu_active);
+
+        sdom->active_vcpu_count++;
+        list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
+        if ( list_empty(&sdom->active_sdom_elem) )
+        {
+            list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
+            csched_priv.weight += sdom->weight;
+        }
+    }
+
+    spin_unlock_irqrestore(&csched_priv.lock, flags);
+}
+
+static inline void
+__csched_vcpu_acct_stop_locked(struct csched_vcpu *svc)
+{
+    struct csched_vm * const sdom = svc->sdom;
+
+    BUG_ON( list_empty(&svc->active_vcpu_elem) );
+
+    CSCHED_VCPU_STAT_CRANK(svc, state_idle);
+    CSCHED_STAT_CRANK(acct_vcpu_idle);
+
+    sdom->active_vcpu_count--;
+    list_del_init(&svc->active_vcpu_elem);
+    if ( list_empty(&sdom->active_vcpu) )
+    {
+        BUG_ON( csched_priv.weight < sdom->weight );
+        list_del_init(&sdom->active_sdom_elem);
+        csched_priv.weight -= sdom->weight;
+    }
+}
+
+static void
+csched_vcpu_acct(unsigned int cpu)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(current_vcpu);
+
+    if ( current_vcpu->processor != cpu ) {
+       if(is_host_vcpu(current_vcpu)) printk("host_vcpu running ");
+       if(is_idle_vcpu(current_vcpu)) printk("idle_vcpu running ");
+       printk(" on cpu %d me is cpu %d\n", current_vcpu->processor, cpu);
+       BUG_ON(1);
+    }
+    ASSERT( svc->sdom != NULL );
+
+    /*
+     * If this VCPU's priority was boosted when it last awoke, reset it.
+     * If the VCPU is found here, then it's consuming a non-negligible
+     * amount of CPU resources and should no longer be boosted.
+     */
+    if ( svc->pri == CSCHED_PRI_TS_BOOST )
+        svc->pri = CSCHED_PRI_TS_UNDER;
+
+    /*
+     * Update credits
+     */
+    atomic_sub(CSCHED_CREDITS_PER_TICK, &svc->credit);
+
+    /*
+     * Put this VCPU and domain back on the active list if it was
+     * idling.
+     *
+     * If it's been active a while, check if we'd be better off
+     * migrating it to run elsewhere (see multi-core and multi-thread
+     * support in csched_cpu_pick()).
+     */
+    if ( list_empty(&svc->active_vcpu_elem) )
+    {
+        __csched_vcpu_acct_start(svc);
+    }
+    else if ( csched_cpu_pick(current_vcpu) != cpu )
+    {
+        CSCHED_VCPU_STAT_CRANK(svc, migrate_r);
+        CSCHED_STAT_CRANK(migrate_running);
+        set_bit(_VPF_migrating, &current_vcpu->pause_flags);
+       kvm_try_tasklet_schedule(NULL);
+    }
+}
+
+static int
+csched_vcpu_init(struct kvm_vcpu *vc)
+{
+       struct kvm* const kvm= vc->kvm;
+       struct csched_vm *sdom = CSCHED_VM(kvm);
+       struct csched_vcpu *svc;
+
+
+       CSCHED_STAT_CRANK(vcpu_init);
+
+       /* Allocate per-VCPU info */
+       svc = kmalloc(sizeof(struct csched_vcpu), GFP_KERNEL);
+       if ( svc == NULL )
+           return -1;
+
+       INIT_LIST_HEAD(&svc->runq_elem);
+       INIT_LIST_HEAD(&svc->active_vcpu_elem);
+       svc->sdom = sdom;
+       svc->vcpu = vc;
+       atomic_set(&svc->credit, 0);
+       svc->flags = 0U;
+       svc->pri = is_idle_vm(kvm) ? CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER;
+       CSCHED_VCPU_STATS_RESET(svc);
+       vc->sched_priv = svc;
+
+       /* Allocate per-PCPU info */
+       if ( unlikely(!CSCHED_PCPU(vc->processor)) )
+       {
+           if ( csched_pcpu_init(vc->processor) != 0 )
+               return -1;
+       }
+
+       CSCHED_VCPU_CHECK(vc);
+       return 0;
+}
+
+static void
+csched_vcpu_destroy(struct kvm_vcpu *vc)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+    struct csched_vm * const sdom = svc->sdom;
+    unsigned long flags;
+
+    CSCHED_STAT_CRANK(vcpu_destroy);
+
+    if(is_idle_vcpu(vc) || is_host_vcpu(vc)) {
+       kfree(svc);
+       return;
+    }
+
+    BUG_ON( sdom == NULL );
+    BUG_ON( !list_empty(&svc->runq_elem) );
+
+    spin_lock_irqsave(&csched_priv.lock, flags);
+
+    if ( !list_empty(&svc->active_vcpu_elem) )
+        __csched_vcpu_acct_stop_locked(svc);
+
+    spin_unlock_irqrestore(&csched_priv.lock, flags);
+
+    kfree(svc);
+}
+
+static void
+csched_vcpu_sleep(struct kvm_vcpu *vc)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+    int my_cpu = get_cpu();
+
+    CSCHED_STAT_CRANK(vcpu_sleep);
+
+    BUG_ON( is_idle_vcpu(vc) );
+
+    if ( per_cpu(schedule_data, vc->processor).curr == vc ){
+       if(vc->processor== my_cpu)
+           kvm_try_tasklet_schedule(NULL);
+       else{
+           int r;
+           r = insert_pending_ipi(my_cpu, cpumask_of_cpu(vc->processor), kvm_try_tasklet_schedule, NULL, 1);
+           if(r) {
+               dump_traces(NULL);
+               BUG_ON(1);
+           }
+           wake_up(&per_cpu(schedule_data, my_cpu).ipi_wq);
+       }
+    }
+    else if ( __vcpu_on_runq(svc) )
+        __runq_remove(svc);
+    put_cpu();
+}
+
+static void
+csched_vcpu_wake(struct kvm_vcpu *vc)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+    const unsigned int cpu = vc->processor;
+    BUG_ON( is_idle_vcpu(vc) );
+    TRACE_2D(TRC_CSCHED_WAKE, __LINE__, (unsigned long)vc);
+
+    if ( unlikely(per_cpu(schedule_data, cpu).curr == vc) )
+    {
+        CSCHED_STAT_CRANK(vcpu_wake_running);
+        return;
+    }
+    if ( unlikely(__vcpu_on_runq(svc)) )
+    {
+        CSCHED_STAT_CRANK(vcpu_wake_onrunq);
+        return;
+    }
+
+    if ( likely(vcpu_runnable(vc)) )
+        CSCHED_STAT_CRANK(vcpu_wake_runnable);
+    else
+        CSCHED_STAT_CRANK(vcpu_wake_not_runnable);
+
+    /*
+     * We temporarily boost the priority of awaking VCPUs!
+     *
+     * If this VCPU consumes a non-negligible amount of CPU, it
+     * will eventually find itself in the credit accounting code
+     * path where its priority will be reset to normal.
+     *
+     * If on the other hand the VCPU consumes little CPU and is
+     * blocking and awoken a lot (doing I/O for example), its
+     * priority will remain boosted, optimizing its wake-to-run
+     * latencies.
+     *
+     * This allows wake-to-run latency sensitive VCPUs to preempt
+     * more CPU resource intensive VCPUs without impacting overall
+     * system fairness.
+     *
+     * The one exception is for VCPUs of capped domains unpausing
+     * after earning credits they had overspent. We don't boost
+     * those.
+     */
+    if ( svc->pri == CSCHED_PRI_TS_UNDER &&
+         !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
+    {
+        svc->pri = CSCHED_PRI_TS_BOOST;
+    }
+
+    /* Put the VCPU on the runq and tickle CPUs */
+    TRACE_2D(TRC_CSCHED_WAKE, __LINE__, vc);
+    __runq_insert(cpu, svc);
+    TRACE_2D(TRC_CSCHED_WAKE, __LINE__, vc);
+    __runq_tickle(cpu, svc);
+    TRACE_2D(TRC_CSCHED_WAKE, __LINE__, vc);
+}
+static void get_param(char* param, char **endp, int *weight, int *cap)
+{
+    char *opt_1;
+    char *opt_2;
+    /* skip heading spaces */
+    while((*param == ' ') || (*param == '\t') || (*param == '\n'))
+       param++;
+
+    opt_1 = strchr(param, '=');
+    if (opt_1 != NULL){
+       char *end = opt_1-1;
+       *opt_1++ = '\0';
+       while((*end == ' ') || (*end == '\t') || (*end == '\n'))
+           *end-- = '\0';
+    }
+    if(strcmp(param, "weight") == 0)
+       *weight = simple_strtol(opt_1, endp, 0);
+    else if(strcmp(param, "cap") == 0)
+       *cap = simple_strtol(opt_1, endp, 0);
+}
+static int csched_read_param(struct kvm *kvm, char *buf, int size)
+{
+    struct csched_vm * const sdom = CSCHED_VM(kvm);
+    unsigned long flags;
+    int r;
+
+    r = snprintf(buf, size, "Credit Schedule parameters for VM %d\n"
+           "Weight: %d     Cap: %d\n", kvm->vmid, sdom->weight, sdom->cap);
+    if (r < 0) printk("the buf for sched_read_param is too small!\n");
+    return (r>0);
+}
+static int csched_write_param(struct kvm *kvm, char *param)
+{
+    char *tmp;
+    int weight = 256;
+    int cap = 0;
+    struct csched_vm * const sdom = CSCHED_VM(kvm);
+    unsigned long flags;
+
+    get_param(param, &tmp, &weight, &cap);
+    get_param(tmp, &tmp, &weight, &cap);
+    spin_lock_irqsave(&csched_priv.lock, flags);
+
+    if ( !list_empty(&sdom->active_sdom_elem) )
+    {
+       csched_priv.weight -= sdom->weight;
+       csched_priv.weight += weight;
+    }
+    sdom->weight = weight;
+    sdom->cap = cap;
+    spin_unlock_irqrestore(&csched_priv.lock, flags);
+    printk("the weight is %d cap is %d\n", weight, cap);
+    return 1;
+}
+
+static int
+csched_vm_init(struct kvm *kvm)
+{
+       struct csched_vm *sdom;
+
+       CSCHED_STAT_CRANK(dom_init);
+       if ( is_idle_vm(kvm) )
+               return 0;
+
+       sdom = kmalloc(sizeof(struct csched_vm), GFP_KERNEL);
+       if ( sdom == NULL )
+               return -ENOMEM;
+
+       /* Initialize credit and weight */
+       INIT_LIST_HEAD(&sdom->active_vcpu);
+       sdom->active_vcpu_count = 0;
+       INIT_LIST_HEAD(&sdom->active_sdom_elem);
+       sdom->vm = kvm;
+       sdom->weight = CSCHED_DEFAULT_WEIGHT;
+       sdom->cap = 0U;
+       kvm->sched_priv = sdom;
+
+       return 0;
+}
+
+static void
+csched_vm_destroy(struct kvm *kvm)
+{
+    CSCHED_STAT_CRANK(dom_destroy);
+    kfree(CSCHED_VM(kvm));
+}
+
+/*
+ * This is an O(n) optimized sort of the runq.
+ *
+ * Time-share VCPUs can only be one of two priorities, UNDER or OVER. We walk
+ * through the runq and move up any UNDERs that are preceded by OVERS. We
+ * remember the last UNDER to make the move up operation O(1).
+ */
+static void
+csched_runq_sort(unsigned int cpu)
+{
+    struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
+    struct list_head *runq, *elem, *next, *last_under;
+    struct csched_vcpu *svc_elem;
+    unsigned long flags;
+    int sort_epoch;
+
+    sort_epoch = csched_priv.runq_sort;
+    if ( sort_epoch == spc->runq_sort_last )
+        return;
+
+    spc->runq_sort_last = sort_epoch;
+
+    spin_lock_irqsave(&per_cpu(schedule_data, cpu).schedule_lock, flags);
+
+    runq = &spc->runq;
+    elem = runq->next;
+    last_under = runq;
+
+    while ( elem != runq )
+    {
+        next = elem->next;
+        svc_elem = __runq_elem(elem);
+
+        if ( svc_elem->pri >= CSCHED_PRI_TS_UNDER )
+        {
+            /* does elem need to move up the runq? */
+            if ( elem->prev != last_under )
+            {
+                list_del(elem);
+                list_add(elem, last_under);
+            }
+            last_under = elem;
+        }
+
+        elem = next;
+    }
+    spin_unlock_irqrestore(&per_cpu(schedule_data, cpu).schedule_lock, flags);
+}
+
+static void
+csched_acct(void)
+{
+    unsigned long flags;
+    struct list_head *iter_vcpu, *next_vcpu;
+    struct list_head *iter_sdom, *next_sdom;
+    struct csched_vcpu *svc;
+    struct csched_vm *sdom;
+    uint32_t credit_total;
+    uint32_t weight_total;
+    uint32_t weight_left;
+    uint32_t credit_fair;
+    uint32_t credit_peak;
+    uint32_t credit_cap;
+    int credit_balance;
+    int credit_xtra;
+    int credit;
+
+
+    spin_lock_irqsave(&csched_priv.lock, flags);
+
+    weight_total = csched_priv.weight;
+    credit_total = csched_priv.credit;
+
+    /* Converge balance towards 0 when it drops negative */
+    if ( csched_priv.credit_balance < 0 )
+    {
+        credit_total -= csched_priv.credit_balance;
+        CSCHED_STAT_CRANK(acct_balance);
+    }
+
+    if ( unlikely(weight_total == 0) )
+    {
+        csched_priv.credit_balance = 0;
+        spin_unlock_irqrestore(&csched_priv.lock, flags);
+        CSCHED_STAT_CRANK(acct_no_work);
+        return;
+    }
+
+    CSCHED_STAT_CRANK(acct_run);
+
+    weight_left = weight_total;
+    credit_balance = 0;
+    credit_xtra = 0;
+    credit_cap = 0U;
+
+    list_for_each_safe( iter_sdom, next_sdom, &csched_priv.active_sdom )
+    {
+        sdom = list_entry(iter_sdom, struct csched_vm, active_sdom_elem);
+
+        BUG_ON( is_idle_vm(sdom->vm) );
+        BUG_ON( sdom->active_vcpu_count == 0 );
+        BUG_ON( sdom->weight == 0 );
+        BUG_ON( sdom->weight > weight_left );
+
+        weight_left -= sdom->weight;
+
+        /*
+         * A domain's fair share is computed using its weight in competition
+         * with that of all other active domains.
+         *
+         * At most, a domain can use credits to run all its active VCPUs
+         * for one full accounting period. We allow a domain to earn more
+         * only when the system-wide credit balance is negative.
+         */
+        credit_peak = sdom->active_vcpu_count * CSCHED_CREDITS_PER_ACCT;
+        if ( csched_priv.credit_balance < 0 )
+        {
+            credit_peak += ( ( -csched_priv.credit_balance * sdom->weight) +
+                             (weight_total - 1)
+                           ) / weight_total;
+        }
+
+        if ( sdom->cap != 0U )
+        {
+            credit_cap = ((sdom->cap * CSCHED_CREDITS_PER_ACCT) + 99) / 100;
+            if ( credit_cap < credit_peak )
+                credit_peak = credit_cap;
+
+            credit_cap = ( credit_cap + ( sdom->active_vcpu_count - 1 )
+                         ) / sdom->active_vcpu_count;
+        }
+
+        credit_fair = ( ( credit_total * sdom->weight) + (weight_total - 1)
+                      ) / weight_total;
+
+        if ( credit_fair < credit_peak )
+        {
+            credit_xtra = 1;
+        }
+        else
+        {
+            if ( weight_left != 0U )
+            {
+                /* Give other domains a chance at unused credits */
+                credit_total += ( ( ( credit_fair - credit_peak
+                                    ) * weight_total
+                                  ) + ( weight_left - 1 )
+                                ) / weight_left;
+            }
+
+            if ( credit_xtra )
+            {
+                /*
+                 * Lazily keep domains with extra credits at the head of
+                 * the queue to give others a chance at them in future
+                 * accounting periods.
+                 */
+                CSCHED_STAT_CRANK(acct_reorder);
+                list_del(&sdom->active_sdom_elem);
+                list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
+            }
+
+            credit_fair = credit_peak;
+        }
+
+        /* Compute fair share per VCPU */
+        credit_fair = ( credit_fair + ( sdom->active_vcpu_count - 1 )
+                      ) / sdom->active_vcpu_count;
+
+
+        list_for_each_safe( iter_vcpu, next_vcpu, &sdom->active_vcpu )
+        {
+            svc = list_entry(iter_vcpu, struct csched_vcpu, active_vcpu_elem);
+            BUG_ON( sdom != svc->sdom );
+
+            /* Increment credit */
+            atomic_add(credit_fair, &svc->credit);
+            credit = atomic_read(&svc->credit);
+
+            /*
+             * Recompute priority or, if VCPU is idling, remove it from
+             * the active list.
+             */
+            if ( credit < 0 )
+            {
+                svc->pri = CSCHED_PRI_TS_OVER;
+
+                /* Park running VCPUs of capped-out domains */
+                if ( sdom->cap != 0U &&
+                     credit < -credit_cap &&
+                     !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
+                {
+                    CSCHED_STAT_CRANK(vcpu_park);
+                    vcpu_pause_nosync(svc->vcpu);
+                    svc->flags |= CSCHED_FLAG_VCPU_PARKED;
+                }
+
+                /* Lower bound on credits */
+                if ( credit < -CSCHED_CREDITS_PER_TSLICE )
+                {
+                    CSCHED_STAT_CRANK(acct_min_credit);
+                    credit = -CSCHED_CREDITS_PER_TSLICE;
+                    atomic_set(&svc->credit, credit);
+                }
+            }
+            else
+            {
+                svc->pri = CSCHED_PRI_TS_UNDER;
+
+                /* Unpark any capped domains whose credits go positive */
+                if ( svc->flags & CSCHED_FLAG_VCPU_PARKED)
+                {
+                    /*
+                     * It's important to unset the flag AFTER the unpause()
+                     * call to make sure the VCPU's priority is not boosted
+                     * if it is woken up here.
+                     */
+                    CSCHED_STAT_CRANK(vcpu_unpark);
+                    vcpu_unpause(svc->vcpu);
+                    svc->flags &= ~CSCHED_FLAG_VCPU_PARKED;
+                }
+
+                /* Upper bound on credits means VCPU stops earning */
+                if ( credit > CSCHED_CREDITS_PER_TSLICE )
+                {
+                    __csched_vcpu_acct_stop_locked(svc);
+                    credit = 0;
+                    atomic_set(&svc->credit, credit);
+                }
+            }
+
+            CSCHED_VCPU_STAT_SET(svc, credit_last, credit);
+            CSCHED_VCPU_STAT_SET(svc, credit_incr, credit_fair);
+            credit_balance += credit;
+        }
+    }
+
+    csched_priv.credit_balance = credit_balance;
+
+    spin_unlock_irqrestore(&csched_priv.lock, flags);
+
+    /* Inform each CPU that its runq needs to be sorted */
+    csched_priv.runq_sort++;
+}
+
+/* the return value indicates whether this routine has enabled the pseudo_irq
+ * Return Value:
+ *     0
+ */
+static void _csched_tick(unsigned long _unused)
+{
+       int my_cpu = get_cpu();
+       struct csched_pcpu *spc = CSCHED_PCPU(my_cpu);
+       unsigned int cpu = spc->cpu;
+       ktime_t now;
+
+       /* if we fail to lock, it indicates the KVM modules are going to
+        * be unloaded
+        */
+       if(cmpxchg(&spc->in_use, false, true) != false) {
+           put_cpu();
+           return;
+       }
+
+       BUG_ON(thread_preemptible());
+       BUG_ON(cpu != my_cpu);
+
+       TRACE_2D(TRC_INTERNAL_CSCHED_TICK, my_cpu, __LINE__);
+       // tasklet_disable(&per_cpu(schedule_data, my_cpu).sched_tasklet);
+try_again:
+       while(per_cpu(schedule_data, my_cpu).sched_state == SCHEDULER_USER)
+           ;
+       if(per_cpu(schedule_data, my_cpu).sched_state == SCHEDULER_KERNEL) {
+           /* this indicates that someone else has entered the critical region now
+            * we should not continue
+            */
+           TRACE_2D(TRC_INTERNAL_CSCHED_TICK, my_cpu, __LINE__);
+           if(!shutting_down) {
+               hrtimer_cancel(&spc->ticker);
+               now = spc->ticker.base->get_time();
+               hrtimer_start(&spc->ticker,
+                       ktime_add_ns(now, 10000),
+                       HRTIMER_MODE_ABS);
+           }
+           // tasklet_enable(&per_cpu(schedule_data, my_cpu).sched_tasklet);
+           spc->in_use = false;
+           put_cpu();
+           return;
+       }
+       if (cmpxchg(&per_cpu(schedule_data, my_cpu).sched_state,
+                   SCHEDULER_FREE, SCHEDULER_KERNEL) != SCHEDULER_FREE)
+           goto try_again;
+
+       TRACE_2D(TRC_INTERNAL_CSCHED_TICK, my_cpu, __LINE__);
+
+       spc->tick++;
+
+       /*
+        * Accounting for running VCPU
+        */
+       if ( !is_idle_vcpu(current_vcpu) )
+           csched_vcpu_acct(cpu);
+
+       TRACE_2D(TRC_INTERNAL_CSCHED_TICK, my_cpu, __LINE__);
+       /*
+        * Host-wide accounting duty
+        *
+        * Note: Currently, this is always done by the master boot CPU. Eventually,
+        * we could distribute or at the very least cycle the duty.
+        */
+       if ( (csched_priv.master == cpu) &&
+            (spc->tick % CSCHED_TICKS_PER_ACCT) == 0 )
+       {
+           csched_acct();
+       }
+
+       TRACE_2D(TRC_INTERNAL_CSCHED_TICK, my_cpu, __LINE__);
+       /*
+        * Check if runq needs to be sorted
+        *
+        * Every physical CPU resorts the runq after the accounting master has
+        * modified priorities. This is a special O(n) sort and runs at most
+        * once per accounting period (currently 30 milliseconds).
+        */
+       csched_runq_sort(cpu);
+       TRACE_2D(TRC_INTERNAL_CSCHED_TICK, my_cpu, __LINE__);
+
+       if (!shutting_down) {
+           now = spc->ticker.base->get_time();
+           hrtimer_start(&spc->ticker,
+                   ktime_add_ns(now, MILLISECS(CSCHED_MSECS_PER_TICK)),
+                   HRTIMER_MODE_ABS);
+       }
+       per_cpu(schedule_data, my_cpu).sched_state = SCHEDULER_FREE;
+       // tasklet_enable(&per_cpu(schedule_data, my_cpu).sched_tasklet);
+       TRACE_2D(TRC_INTERNAL_CSCHED_TICK, my_cpu, __LINE__);
+       spc->in_use = false;
+       put_cpu();
+}
+
+/* NOTICE:
+ *   Linux cannot guarantee that a timer set on one cpu will be
+ *   triggered on that cpu. That is to say, one cpu might receive
+ *   another cpu's timer expiry message. In this case, the message should
+ *   be relayed to the corresponding cpu. This is done via an IPI-tasklet
+ *                                                       --Alex
+ */
+static enum hrtimer_restart csched_tick_stub(struct hrtimer *data)
+{
+       u64 delta;
+       int cur_cpu = get_cpu();
+       struct csched_pcpu *spc = container_of(data, struct csched_pcpu, ticker);
+       BUG_ON(!data);
+       BUG_ON(!spc);
+
+       /* if the ticker does not belong to this cpu, relay it */
+       if(spc->cpu != cur_cpu){
+           int r;
+           r = insert_pending_ipi(cur_cpu, cpumask_of_cpu(spc->cpu), csched_tick_stub, (void*)data, 1);
+           if (r) {
+               dump_traces(NULL);
+               BUG_ON(1);
+           }
+           TRACE_3D(TRC_CSCHED_TICK,cur_cpu, spc->cpu, __LINE__);
+           wake_up(&per_cpu(schedule_data, cur_cpu).ipi_wq);
+       }else {
+           TRACE_3D(TRC_CSCHED_TICK,cur_cpu, spc->cpu, __LINE__);
+           tasklet_schedule(&per_cpu(schedule_data, cur_cpu).tick_tasklet);
+       }
+
+       /* the timer will be restarted by the tick_tasklet */
+       /* to add hrtimer_cancel() here!*/
+       put_cpu();
+       TRACE_2D(TRC_CSCHED_TICK, cur_cpu, __LINE__);
+       return HRTIMER_NORESTART;
+}
+
+static struct csched_vcpu *
+csched_runq_steal(int peer_cpu, int cpu, int pri)
+{
+    const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu);
+    const struct kvm_vcpu * const peer_vcpu = per_cpu(schedule_data, peer_cpu).curr;
+    struct csched_vcpu *speer;
+    struct list_head *iter;
+    struct kvm_vcpu *vc;
+
+    /*
+     * Don't steal from an idle CPU's runq because it's about to
+     * pick up work from it itself.
+     */
+    if ( peer_pcpu != NULL && !is_idle_vcpu(peer_vcpu) )
+    {
+        list_for_each( iter, &peer_pcpu->runq )
+        {
+            speer = __runq_elem(iter);
+
+            /*
+             * If next available VCPU here is not of strictly higher
+             * priority than ours, this PCPU is useless to us.
+             */
+            if ( speer->pri <= pri )
+                break;
+
+            /* Is this VCPU runnable on our PCPU? */
+            vc = speer->vcpu;
+            BUG_ON( is_idle_vcpu(vc) );
+
+            if (__csched_vcpu_is_migrateable(vc, cpu))
+            {
+                /* We got a candidate. Grab it! */
+                CSCHED_VCPU_STAT_CRANK(speer, migrate_q);
+                CSCHED_STAT_CRANK(migrate_queued);
+                __runq_remove(speer);
+               printk("setting thread %d's cpuaffinity to %d!\n",
+                       vc->thread->pid, cpu);
+               kvm_sched_setaffinity(vc->thread->pid, cpumask_of_cpu(cpu));
+                vc->processor= cpu;
+                return speer;
+            }
+        }
+    }
+
+    CSCHED_STAT_CRANK(steal_peer_idle);
+    return NULL;
+}
+
+static struct csched_vcpu *
+csched_load_balance(int cpu, struct csched_vcpu *snext)
+{
+    struct csched_vcpu *speer;
+    cpumask_t workers;
+    int peer_cpu;
+
+    BUG_ON( cpu != snext->vcpu->processor);
+
+    if ( snext->pri == CSCHED_PRI_IDLE )
+        CSCHED_STAT_CRANK(load_balance_idle);
+    else if ( snext->pri == CSCHED_PRI_TS_OVER )
+        CSCHED_STAT_CRANK(load_balance_over);
+    else
+        CSCHED_STAT_CRANK(load_balance_other);
+
+    /*
+     * Peek at non-idling CPUs in the system, starting with our
+     * immediate neighbour.
+     */
+    cpus_andnot(workers, cpu_online_map, csched_priv.idlers);
+    cpu_clear(cpu, workers);
+    peer_cpu = cpu;
+
+    while ( !cpus_empty(workers) )
+    {
+        peer_cpu = __cycle_cpu(peer_cpu, &workers);
+        cpu_clear(peer_cpu, workers);
+
+        /*
+         * Get ahold of the scheduler lock for this peer CPU.
+         *
+         * Note: We don't spin on this lock but simply try it. Spinning could
+         * cause a deadlock if the peer CPU is also load balancing and trying
+         * to lock this CPU.
+         */
+        if ( !spin_trylock(&per_cpu(schedule_data, peer_cpu).schedule_lock) )
+        {
+            CSCHED_STAT_CRANK(steal_trylock_failed);
+            continue;
+        }
+       TRACE_3D(TRC_LOAD_BALANCE, peer_cpu, raw_smp_processor_id(), __LINE__);
+
+
+        /*
+         * Any work over there to steal?
+         */
+        speer = csched_runq_steal(peer_cpu, cpu, snext->pri);
+        spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
+       TRACE_3D(TRC_LOAD_BALANCE, cpu, peer_cpu, __LINE__);
+        if ( speer != NULL )
+            return speer;
+    }
+
+       TRACE_3D(TRC_LOAD_BALANCE, cpu, peer_cpu, __LINE__);
+    /* Failed to find more important work elsewhere... */
+    __runq_remove(snext);
+       TRACE_3D(TRC_LOAD_BALANCE, cpu, peer_cpu, __LINE__);
+    return snext;
+}
+
+/*
+ * This function is in the critical path. It is designed to be simple and
+ * fast for the common case.
+ */
+static struct task_slice
+csched_schedule(s_time_t now)
+{
+    const int cpu = smp_processor_id();
+    struct list_head * const runq = RUNQ(cpu);
+    struct csched_vcpu * const scurr = CSCHED_VCPU(current_vcpu);
+    struct csched_vcpu *snext;
+    struct task_slice ret;
+
+    if(shutting_down)  {
+       ret.time = MILLISECS(CSCHED_MSECS_PER_TSLICE);
+       ret.task = idle_vm_kvm->vcpus[cpu];
+       return ret;
+    }
+
+    CSCHED_STAT_CRANK(schedule);
+    CSCHED_VCPU_CHECK(current_vcpu);
+
+    /*
+     * Select next runnable local VCPU (ie top of local runq)
+     */
+    if ( vcpu_runnable(current_vcpu) )
+        __runq_insert(cpu, scurr);
+    else
+        BUG_ON( is_idle_vcpu(current_vcpu) || list_empty(runq) );
+
+    snext = __runq_elem(runq->next);
+
+    /*
+     * SMP Load balance:
+     *
+     * If the next highest priority local runnable VCPU has already eaten
+     * through its credits, look on other PCPUs to see if we have more
+     * urgent work... If not, csched_load_balance() will return snext, but
+     * already removed from the runq.
+     */
+    if ( snext->pri > CSCHED_PRI_TS_OVER )
+        __runq_remove(snext);
+    else
+        snext = csched_load_balance(cpu, snext);
+    TRACE_3D(TRC_CSCHED_SCHEDULE, cpu, snext, __LINE__);
+
+    /*
+     * Update idlers mask if necessary. When we're idling, other CPUs
+     * will tickle us when they get extra work.
+     */
+    if ( snext->pri == CSCHED_PRI_IDLE )
+    {
+        if ( !cpu_isset(cpu, csched_priv.idlers) )
+            cpu_set(cpu, csched_priv.idlers);
+    }
+    else if ( cpu_isset(cpu, csched_priv.idlers) )
+    {
+        cpu_clear(cpu, csched_priv.idlers);
+    }
+
+    /*
+     * Return task to run next...
+     */
+    ret.time = MILLISECS(CSCHED_MSECS_PER_TSLICE);
+    ret.task = snext->vcpu;
+
+    CSCHED_VCPU_CHECK(ret.task);
+    return ret;
+}
+
+static void
+csched_dump_vcpu(struct csched_vcpu *svc)
+{
+    printk("func %s unimplemented\n", __FUNCTION__);
+}
+
+static void
+csched_dump_pcpu(int cpu)
+{
+    struct list_head *runq, *iter;
+    struct csched_pcpu *spc;
+    struct csched_vcpu *svc;
+    int loop;
+
+    spc = CSCHED_PCPU(cpu);
+    runq = &spc->runq;
+
+    printk(" sort=%d, sibling=0x%lx, core=0x%lx\n",
+            spc->runq_sort_last,
+            per_cpu(cpu_sibling_map,cpu).bits[0],
+            per_cpu(cpu_core_map,cpu).bits[0]);
+
+    /* current VCPU */
+    svc = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
+    if ( svc )
+    {
+        printk("\trun: ");
+        csched_dump_vcpu(svc);
+    }
+
+    loop = 0;
+    list_for_each( iter, runq )
+    {
+        svc = __runq_elem(iter);
+        if ( svc )
+        {
+            printk("\t%3d: ", ++loop);
+            csched_dump_vcpu(svc);
+        }
+    }
+}
+
+static void
+csched_dump(void)
+{
+    struct list_head *iter_sdom, *iter_svc;
+    int loop;
+
+    printk("info:\n"
+           "\tncpus              = %u\n"
+           "\tmaster             = %u\n"
+           "\tcredit             = %u\n"
+           "\tcredit balance     = %d\n"
+           "\tweight             = %u\n"
+           "\trunq_sort          = %u\n"
+           "\tdefault-weight     = %d\n"
+           "\tmsecs per tick     = %dms\n"
+           "\tcredits per tick   = %d\n"
+           "\tticks per tslice   = %d\n"
+           "\tticks per acct     = %d\n",
+           csched_priv.ncpus,
+           csched_priv.master,
+           csched_priv.credit,
+           csched_priv.credit_balance,
+           csched_priv.weight,
+           csched_priv.runq_sort,
+           CSCHED_DEFAULT_WEIGHT,
+           CSCHED_MSECS_PER_TICK,
+           CSCHED_CREDITS_PER_TICK,
+           CSCHED_TICKS_PER_TSLICE,
+           CSCHED_TICKS_PER_ACCT);
+
+    printk("idlers: 0x%lx\n", csched_priv.idlers.bits[0]);
+
+    CSCHED_STATS_PRINTK();
+
+    printk("active vcpus:\n");
+    loop = 0;
+    list_for_each( iter_sdom, &csched_priv.active_sdom )
+    {
+        struct csched_vm *sdom;
+        sdom = list_entry(iter_sdom, struct csched_vm, active_sdom_elem);
+
+        list_for_each( iter_svc, &sdom->active_vcpu )
+        {
+            struct csched_vcpu *svc;
+            svc = list_entry(iter_svc, struct csched_vcpu, active_vcpu_elem);
+
+            printk("\t%3d: ", ++loop);
+            csched_dump_vcpu(svc);
+        }
+    }
+}
+static void csched_stop_schedule(int cpu)
+{
+    struct csched_pcpu *spc;
+    spc = CSCHED_PCPU(cpu);
+    if(!spc) {
+       printk("func %s line %d cpu %d\n",
+               __FUNCTION__, __LINE__, cpu);
+       BUG_ON(1);
+    }
+    hrtimer_cancel(&spc->ticker);
+}
+
+
+/* Tickers cannot be kicked until SMP subsystem is alive. */
+int csched_start(int cpu)
+{
+       struct csched_pcpu *spc;
+       ktime_t now;
+
+       /* Is the credit scheduler initialised? */
+       if ( csched_priv.ncpus == 0 ){
+               printk("(ERROR) You should not see func %s line %d\n",
+                       __FUNCTION__, __LINE__);
+               BUG_ON(1);
+               return 1;
+       }
+
+       tasklet_init(&per_cpu(schedule_data, cpu).tick_tasklet, _csched_tick, (unsigned long)NULL);
+       spc = CSCHED_PCPU(cpu);
+       if(!spc) {
+           printk("func %s line %d cpu %d\n",
+                   __FUNCTION__, __LINE__, cpu);
+           BUG_ON(1);
+           return 1;
+       }
+
+       now = spc->ticker.base->get_time();
+
+       hrtimer_start(&spc->ticker,
+               ktime_add_ns(now, MILLISECS(CSCHED_MSECS_PER_TICK)),
+               HRTIMER_MODE_ABS);
+
+       return 0;
+}
+static void
+csched_init(void)
+{
+    spin_lock_init(&csched_priv.lock);
+    INIT_LIST_HEAD(&csched_priv.active_sdom);
+    csched_priv.ncpus = 0;
+    csched_priv.master = UINT_MAX;
+    cpus_clear(csched_priv.idlers);
+    csched_priv.weight = 0U;
+    csched_priv.credit = 0U;
+    csched_priv.credit_balance = 0;
+    csched_priv.runq_sort = 0U;
+    CSCHED_STATS_RESET();
+}
+
+#define SCHEDULER_CREDIT    1
+struct scheduler sched_credit_def = {
+    .name           = "SMP Credit Scheduler",
+    .opt_name       = "credit",
+    .sched_id       = SCHEDULER_CREDIT,
+
+    .init_vm       = csched_vm_init,
+    .destroy_vm            = csched_vm_destroy,
+
+    .init_vcpu      = csched_vcpu_init,
+    .destroy_vcpu   = csched_vcpu_destroy,
+
+    .sleep          = csched_vcpu_sleep,
+    .wake           = csched_vcpu_wake,
+
+    .read_schedule_info = csched_read_param,
+    .write_schedule_info = csched_write_param,
+
+    .pick_cpu       = csched_cpu_pick,
+    .do_schedule    = csched_schedule,
+
+    .disable_scheduler = csched_disable,
+    .start_scheduler = csched_start,
+    .stop_schedule  = csched_stop_schedule,
+
+    .dump_cpu_state = csched_dump_pcpu,
+    .dump_settings  = csched_dump,
+    .init           = csched_init,
+};
diff --git a/arch/x86/kvm/schedule.c b/arch/x86/kvm/schedule.c
new file mode 100644
index 0000000..d6938d5
--- /dev/null
+++ b/arch/x86/kvm/schedule.c
@@ -0,0 +1,959 @@ 
+/****************************************************************************
+ * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
+ * (C) 2002-2003 University of Cambridge
+ * (C) 2004      - Mark Williamson - Intel Research Cambridge
+ * (C) 2009      - Xiaojian Liu
+ ****************************************************************************
+ *
+ *        File: common/schedule.c
+ *      Author: Rolf Neugebauer & Keir Fraser
+ *              Updated for generic API by Mark Williamson
+ *             Updated for KVM by Xiaojian Liu
+ *
+ * Description: Generic CPU scheduling code
+ *              implements support functionality for the Xen scheduler API.
+ *
+ */
+#include <linux/kvm_host.h>
+#include <linux/spinlock.h>
+#include <linux/trace.h>
+#include <linux/sched-if.h>
+#include <linux/ipi.h>
+#include <linux/proc_fs.h>
+#include <linux/uaccess.h>
+#include <linux/kthread.h>
+
+#ifdef DEBUG
+#define ASSERT(x)                                                  \
+do {                                                               \
+       if (!(x)) {                                                \
+           printk(KERN_EMERG "assertion failed %s: %d: %s\n",     \
+                  __FILE__, __LINE__, #x);                        \
+           BUG();                                                 \
+    }                                                              \
+} while (0)
+#else
+#define ASSERT(x) do { } while (0)
+#endif
+
+#if 0
+long (*sched_setaffinity_p)(pid_t pid, cpumask_t new_mask);
+#else
+extern long (*sched_setaffinity_p)(pid_t pid, cpumask_t* in_mask);
+#endif
+// EXPORT_SYMBOL_GPL(sched_setaffinity_p);
+
+DEFINE_PER_CPU(struct schedule_data, schedule_data);
+DEFINE_PER_CPU(rwlock_t, pseudo_cli);
+
+static struct list_head sched_vm_list;
+static spinlock_t  sched_vm_list_lock;
+
+#define KVM_SCHEDULER "kvm"
+
+struct proc_dir_entry *kvm_scheduler_root;
+
+#ifndef COMPAT
+
+/* Various timer handlers. */
+static enum hrtimer_restart s_timer_fn(struct hrtimer* timer);
+static void vcpu_singleshot_timer_fn(void *data);
+static void poll_timer_fn(void *data);
+
+/* This is global for now so that private implementations can reach it */
+
+extern struct scheduler sched_credit_def;
+static struct scheduler *schedulers[] = {
+    &sched_credit_def,
+    NULL
+};
+
+static struct scheduler ops;
+
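+/*
+ * Invoke a hook of the active scheduler if it is implemented; otherwise
+ * evaluate to a zero of the hook's return type.
+ */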
+#define SCHED_OP(fn, ...)                                 \
+         (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ )      \
+          : (typeof(ops.fn(__VA_ARGS__)))0 )
+
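+
+/*
+ * Pause a VCPU: raise its pause count and put it to sleep.  vcpu_pause()
+ * additionally waits until the VCPU has actually stopped running; the
+ * _nosync variant does not.
+ */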
+void vcpu_pause(struct kvm_vcpu *v)
+{
+       ASSERT(v != current_vcpu);
+
+       atomic_inc(&v->pause_count);
+       vcpu_sleep_sync(v);
+}
+void vcpu_pause_nosync(struct kvm_vcpu *v)
+{
+       atomic_inc(&v->pause_count);
+       vcpu_sleep_nosync(v);
+}
+void vcpu_unpause(struct kvm_vcpu *v)
+{
+       if(atomic_dec_and_test(&v->pause_count))
+               vcpu_wake(v);
+}
+
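+/*
+ * Pause every VCPU of a VM.  If the caller is one of this VM's VCPUs it
+ * is handled last: its pause count is raised, the scheduler tasklet is
+ * kicked, and we loop in schedule() until this CPU is no longer running
+ * one of the VM's VCPUs.
+ */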
+void vm_pause(struct kvm *kvm)
+{
+    int late_wait = -1;
+    int i;
+
+    for(i=0; i< KVM_MAX_VCPUS; i++) {
+       if(kvm->vcpus[i]) {
+           if(current_vcpu == kvm->vcpus[i]) {
+               atomic_inc(&kvm->vcpus[i]->pause_count);
+               late_wait = i;
+               continue;
+           }
+           vcpu_sleep_sync(kvm->vcpus[i]);
+       }
+    }
+    if (late_wait != -1) {
+       if(current_vcpu == kvm->vcpus[late_wait]) {
+           if(!is_host_vm(kvm) && !is_idle_vm(kvm)) {
+               tasklet_schedule(&per_cpu(schedule_data,
+                       raw_smp_processor_id()).sched_tasklet);
+               while(current_vcpu->kvm == kvm) {
+                   printk("waiting\n");
+                   schedule();
+               }
+           }
+       }
+    }
+}
+
+void vm_unpause(struct kvm *kvm)
+{
+       int i;
+       struct kvm_vcpu* v;
+       int cpu = get_cpu();
+
+       if ( atomic_dec_and_test(&kvm->pause_count) ){
+               for(i = 0; i < KVM_MAX_VCPUS; ++i) {
+                       if(unlikely(i == cpu)) continue;
+                       v = kvm->vcpus[i];
+                       if (v) {
+                               printk("me is %d waking vcpu %d\n", cpu, i);
+                               vcpu_wake(v);
+                       }
+               }
+               if(kvm->vcpus[cpu]){
+                   printk("waking myself %d now\n", cpu);
+                   vcpu_wake(kvm->vcpus[cpu]);
+               }
+       }
+       put_cpu();
+}
+void vm_pause_by_systemcontroller(struct kvm* kvm)
+{
+       vm_pause(kvm);
+       if(test_and_set_bool(kvm->is_paused_by_controller))
+               vm_unpause(kvm);
+}
+
+void vm_unpause_by_systemcontroller(struct kvm* kvm)
+{
+       if(test_and_clear_bool(kvm->is_paused_by_controller))
+               vm_unpause(kvm);
+
+}
+
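+/*
+ * Charge the time spent in the VCPU's current runstate and move it to
+ * new_state.  The caller must hold the per-CPU schedule_lock.
+ */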
+static inline void vcpu_runstate_change(
+    struct kvm_vcpu *v, int new_state, s_time_t new_entry_time)
+{
+    ASSERT(v->runstate.state != new_state);
+    ASSERT(spin_is_locked(&per_cpu(schedule_data,v->processor).schedule_lock));
+
+    v->runstate.time[v->runstate.state] +=
+        new_entry_time - v->runstate.state_entry_time;
+    v->runstate.state_entry_time = new_entry_time;
+    v->runstate.state = new_state;
+}
+
+void vcpu_runstate_get(struct kvm_vcpu *v, struct vcpu_runstate_info *runstate)
+{
+    if ( likely(v == current_vcpu) )
+    {
+        /* Fast lock-free path. */
+        memcpy(runstate, &v->runstate, sizeof(*runstate));
+        ASSERT(runstate->state == RUNSTATE_running);
+        runstate->time[RUNSTATE_running] += NOW() - runstate->state_entry_time;
+    }
+    else
+    {
+       unsigned long flags;
+       get_cpu();
+       local_irq_save(flags);
+        vcpu_schedule_lock_irq(v);
+        memcpy(runstate, &v->runstate, sizeof(*runstate));
+        runstate->time[runstate->state] += NOW() - runstate->state_entry_time;
+        vcpu_schedule_unlock_irq(v);
+       local_irq_restore(flags);
+       put_cpu();
+    }
+}
+
+int sched_init_vcpu(struct kvm_vcpu *v, unsigned int processor)
+{
+       int r;
+       struct kvm *kvm = v->kvm;
+
+       /*
+        * Initialize processor and affinity settings. The idle VM's and the
+        * host VM's VCPUs are pinned onto their respective physical CPUs.
+        */
+       v->processor = processor;
+       printk("sched_setaffinity_p is %p\n", sched_setaffinity_p);
+       if (!is_host_vcpu(v) && !is_idle_vcpu(v))
+           kvm_sched_setaffinity(v->thread->pid, cpumask_of_cpu(processor));
+
+       if ( is_idle_vm(kvm) || is_host_vm(kvm))
+               v->cpu_affinity = cpumask_of_cpu(processor);
+       else
+               cpus_setall(v->cpu_affinity);
+
+       /* Idle VCPUs are scheduled immediately. */
+       if ( is_idle_vm(kvm) )
+       {
+               per_cpu(schedule_data, v->processor).curr = v;
+               per_cpu(schedule_data, v->processor).idle = v;
+               v->is_running = 1;
+       }
+
+       TRACE_1D(TRC_SCHED_VM_ADD, v->vcpu_id);
+       r = SCHED_OP(init_vcpu, v);
+       return r;
+}
+
+void sched_destroy_vcpu(struct kvm_vcpu *v)
+{
+       int cpu = get_cpu();
+       SCHED_OP(destroy_vcpu, v);
+       put_cpu();
+}
+struct kvm* get_kvm_by_id(int id)
+{
+
+    struct kvm *kvm, *n;
+    spin_lock(&sched_vm_list_lock);
+    list_for_each_entry_safe(kvm, n, &sched_vm_list, vm_link) {
+       if (kvm->vmid == id)  {
+           spin_unlock(&sched_vm_list_lock);
+           return kvm;
+       }
+    }
+    spin_unlock(&sched_vm_list_lock);  /* not found: don't leak the lock */
+    return NULL;
+}
+static inline struct kvm* get_proc_kvm(struct file *file)
+{
+    struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+    vmid_t id;
+
+    if(strcmp(pde->name, "host") == 0)
+       return host_vm_kvm;
+
+    sscanf(pde->name, "%d", &id);
+    return get_kvm_by_id(id);
+}
+
+#define MAX_SCHEDULE_PARAMS 50
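+
+/*
+ * /proc/kvm/<vm> write handler: copy the user's parameter string, pause
+ * the VM's other VCPUs, and hand the string to the scheduler's
+ * write_schedule_info hook.
+ */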
+static int kvm_scheduler_write(struct file *file, const char __user *buffer,
+       unsigned long count, void *data)
+{
+    char scheduler_param[MAX_SCHEDULE_PARAMS] = { '\0'};
+    struct kvm* kvm = get_proc_kvm(file);
+    int i;
+    int r = count;
+    unsigned long flags;
+
+    get_cpu();
+
+    if(!kvm) {
+       r = -EINVAL;
+       goto quit;
+    }
+
+    if (count > MAX_SCHEDULE_PARAMS - 1 ) {
+       r = -EINVAL;
+       goto quit;
+    }
+    if(copy_from_user(scheduler_param, buffer, count)) {
+       r = -EFAULT;
+       goto quit;
+    }
+    scheduler_param[count] = '\0';
+
+    for(i=0; i < KVM_MAX_VCPUS; i++){
+       struct kvm_vcpu *v = kvm->vcpus[i];
+       if(v && v != current_vcpu)
+           vcpu_pause(v);
+    }
+
+    if ( kvm == current_vcpu->kvm) {
+       local_irq_save(flags);
+       if(!vcpu_schedule_try_lock(current_vcpu)) {
+           local_irq_restore(flags);
+           r = -EAGAIN;
+           goto quit_with_unpause;
+       }
+    }
+
+    if ( (SCHED_OP(write_schedule_info, kvm, scheduler_param)) == 0 )
+       printk("failed to write schedule info!\n");
+
+    if ( kvm == current_vcpu->kvm) {
+       vcpu_schedule_unlock(current_vcpu);
+       local_irq_restore(flags);
+    }
+
+quit_with_unpause:
+    for(i=0; i < KVM_MAX_VCPUS; i++){
+       struct kvm_vcpu *v = kvm->vcpus[i];
+       if(v && v != current_vcpu)
+           vcpu_unpause(v);
+    }
+quit:
+    put_cpu();
+    return r;
+}
+
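+/*
+ * /proc/kvm/<vm> read handler: pause the VM's other VCPUs and report the
+ * scheduler's current parameters via the read_schedule_info hook.
+ */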
+static ssize_t kvm_scheduler_read(struct file *file,
+       char __user * buffer, size_t count, loff_t *ppos)
+{
+    int res;
+    struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+    struct kvm *kvm;
+    int i;
+    char *msg;
+    unsigned long flags;
+
+    get_cpu();
+
+    msg = kmalloc(256, GFP_KERNEL);
+    if (!msg) {
+       res = -ENOMEM;
+       goto quit;
+    }
+
+    kvm = get_proc_kvm(file);
+    if(!kvm) {
+       sprintf(msg, "file /proc/%s/%s does not have a corresponding kvm!\n",
+               KVM_SCHEDULER, pde->name);
+       goto quit_with_msg;
+    }
+
+    for(i=0; i < KVM_MAX_VCPUS; i++){
+       struct kvm_vcpu *v = kvm->vcpus[i];
+       if(v && v != current_vcpu)
+           vcpu_pause(v);
+    }
+
+    if ( kvm == current_vcpu->kvm){
+       local_irq_save(flags);
+       if(!vcpu_schedule_try_lock(current_vcpu)) {
+           local_irq_restore(flags);
+           snprintf(msg, 256,
+                   "scheduler is locked by current domain. Try again!\n");
+           goto quit_with_unpause;
+       }
+    }
+
+    if ( (SCHED_OP(read_schedule_info, kvm, msg, 256)) == 0 )
+       printk("failed to read schedule info!\n");
+
+    if ( kvm == current_vcpu->kvm) {
+       vcpu_schedule_unlock(current_vcpu);
+       local_irq_restore(flags);
+    }
+
+quit_with_unpause:
+    for(i=0; i < KVM_MAX_VCPUS; i++){
+       struct kvm_vcpu *v = kvm->vcpus[i];
+       if(v && v != current_vcpu)
+           vcpu_unpause(v);
+    }
+
+quit_with_msg:
+    res = simple_read_from_buffer(buffer, count, ppos, msg, strlen(msg));
+    kfree(msg);
+quit:
+    put_cpu();
+    return res;
+
+}
+static const struct file_operations kvm_scheduler_ops = {
+    .read = kvm_scheduler_read,
+    .write = kvm_scheduler_write,
+};
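+
+/*
+ * Register a VM with the scheduler: pick a vmid, add it to the global VM
+ * list, create its /proc/kvm/<vmid> (or /proc/kvm/host) control file and
+ * call the scheduler's init_vm hook.
+ */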
+int sched_init_vm(struct kvm *kvm)
+{
+    struct proc_dir_entry *entry;
+    int error = 0;
+    char name[MAX_PROCESSID_LEN];
+    int r;
+    int cpu = get_cpu();
+
+    BUG_ON(kvm->vmid);
+
+    if(unlikely(is_idle_vm(kvm)))
+       kvm->vmid = IDLE_VM_ID;
+    else if(unlikely(is_host_vm(kvm)))
+       kvm->vmid = HOST_VM_ID;
+    else {
+       kvm->vmid = (unsigned long)current->pid;
+       if(kvm->vmid > MAX_PROCESSID) {
+           printk("process id %lx is too big; only the lower 32 bits are used!\n",
+                   kvm->vmid);
+           kvm->vmid &= 0x0ffffffff;
+       }
+    }
+    spin_lock(&sched_vm_list_lock);
+    list_add(&kvm->vm_link, &sched_vm_list);
+    spin_unlock(&sched_vm_list_lock);
+
+    if(!is_idle_vm(kvm) && (kvm_scheduler_root != NULL)) {
+       if(!is_host_vm(kvm))
+           sprintf(name, "%d", kvm->vmid);
+       else
+           sprintf(name, "host");
+       entry = create_proc_entry(name, S_IRUGO | S_IWUGO | S_IFREG,
+               kvm_scheduler_root);
+       if (entry)
+           entry->proc_fops = &kvm_scheduler_ops;
+       else {
+           printk("failed to create vm %x's corresponding procfs entry\n",
+                   kvm->vmid);
+           printk("the user will not be able to control its scheduling!\n");
+           kvm->vmid = ANONY_VM_ID;
+       }
+    }
+
+    r = SCHED_OP(init_vm, kvm);
+    put_cpu();
+    return r;
+}
+
+void sched_destroy_vm(struct kvm *d)
+{
+    char name[MAX_PROCESSID_LEN];
+
+    get_cpu();
+
+    spin_lock(&sched_vm_list_lock);
+    list_del(&d->vm_link);
+    spin_unlock(&sched_vm_list_lock);
+
+    if(d->vmid){
+       if(is_host_vm(d))
+           sprintf(name, "host");
+       else
+           sprintf(name, "%d", d->vmid);
+       remove_proc_entry(name, kvm_scheduler_root);
+    }
+
+    SCHED_OP(destroy_vm, d);
+    put_cpu();
+}
+
+void vcpu_sleep_nosync(struct kvm_vcpu *v)
+{
+    unsigned long flags;
+    unsigned long my_flags;
+
+    get_cpu();
+    local_irq_save(my_flags);
+    vcpu_schedule_lock_irqsave(v, flags);
+
+    if ( likely(!vcpu_runnable(v)) )
+    {
+        if ( v->runstate.state == RUNSTATE_runnable )
+            vcpu_runstate_change(v, RUNSTATE_offline, NOW());
+
+        SCHED_OP(sleep, v);
+    }
+
+    vcpu_schedule_unlock_irqrestore(v, flags);
+    local_irq_restore(my_flags);
+    put_cpu();
+
+    TRACE_1D(TRC_SCHED_SLEEP, v->vcpu_id);
+}
+void sync_vcpu_execstate(struct kvm_vcpu *v)
+{
+    printk("TODO: func %s called. \n", __FUNCTION__);
+}
+void vcpu_sleep_sync(struct kvm_vcpu *v)
+{
+    vcpu_sleep_nosync(v);
+
+    while ( !vcpu_runnable(v) && v->is_running )
+        cpu_relax();
+
+    sync_vcpu_execstate(v);
+}
+
+void vcpu_wake(struct kvm_vcpu *v)
+{
+    unsigned long flags;
+    unsigned long my_flags;
+    int tmp_cpu = get_cpu();
+    local_irq_save(my_flags);
+    vcpu_schedule_lock_irqsave(v, flags);
+    TRACE_2D(TRC_SCHED_WAKE, tmp_cpu, v);
+
+    if ( likely(vcpu_runnable(v)) )
+    {
+        if ( v->runstate.state >= RUNSTATE_blocked )
+            vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
+        SCHED_OP(wake, v);
+    }
+    else if ( !test_bit(_VPF_blocked, &v->pause_flags) )
+    {
+        if ( v->runstate.state == RUNSTATE_blocked )
+            vcpu_runstate_change(v, RUNSTATE_offline, NOW());
+    }
+
+    vcpu_schedule_unlock_irqrestore(v, flags);
+    local_irq_restore(my_flags);
+
+    TRACE_3D(TRC_SCHED_WAKE, tmp_cpu, v->vcpu_id, __LINE__);
+    put_cpu();
+}
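+
+/*
+ * Migrate a VCPU flagged with _VPF_migrating: let the scheduler's
+ * pick_cpu hook choose a new processor, then wake the VCPU there.
+ */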
+static void vcpu_migrate(struct kvm_vcpu *v)
+{
+    unsigned long flags;
+    unsigned long my_flags;
+    int old_cpu;
+    int tmp_cpu = get_cpu();
+
+    local_irq_save(my_flags);
+    vcpu_schedule_lock_irqsave(v, flags);
+
+    /*
+     * NB. Check of v->running happens /after/ setting migration flag
+     * because they both happen in (different) spinlock regions, and those
+     * regions are strictly serialised.
+     */
+    if ( v->is_running ||
+         !test_and_clear_bit(_VPF_migrating, &v->pause_flags) )
+    {
+        vcpu_schedule_unlock_irqrestore(v, flags);
+       local_irq_restore(my_flags);
+       put_cpu();
+        return;
+    }
+
+    /* Switch to new CPU, then unlock old CPU. */
+    old_cpu = v->processor;
+    v->processor = SCHED_OP(pick_cpu, v);
+
+    /* v->processor has changed, so we must not use
+     * vcpu_schedule_unlock_irqrestore() here.
+     */
+    BUG_ON(old_cpu != tmp_cpu);
+    spin_unlock(&per_cpu(schedule_data, old_cpu).schedule_lock);
+    pseudo_irq_restore(flags);
+    local_irq_restore(my_flags);
+    /* Wake on new CPU. */
+    vcpu_wake(v);
+    put_cpu();
+}
+
+/*
+ * Force a VCPU through a deschedule/reschedule path.
+ * For example, using this when setting the periodic timer period means that
+ * most periodic-timer state need only be touched from within the scheduler
+ * which can thus be done without need for synchronisation.
+ */
+void vcpu_force_reschedule(struct kvm_vcpu *v)
+{
+    printk("TODO: func %s called\n", __FUNCTION__);
+}
+
+int vcpu_set_affinity(struct kvm_vcpu *v, cpumask_t *affinity)
+{
+    printk("TODO: func %s called\n", __FUNCTION__);
+    return 0;
+}
+
+/* Block the currently-executing domain until a pertinent event occurs. */
+static long do_block(void)
+{
+    printk("TODO: func %s called\n", __FUNCTION__);
+    return 0;
+}
+
+/* Voluntarily yield the processor for this allocation. */
+static long do_yield(void)
+{
+    TRACE_1D(TRC_SCHED_YIELD, current_vcpu->vcpu_id);
+    tasklet_schedule(&per_cpu(schedule_data,
+               raw_smp_processor_id()).sched_tasklet);
+    return 0;
+}
+long do_sched_op_compat(int cmd, unsigned long arg)
+{
+    long ret = 0;
+    printk("TODO: unimplemented func %s \n", __FUNCTION__);
+    return ret;
+}
+
+typedef long ret_t;
+
+#endif /* !COMPAT */
+
+#ifndef COMPAT
+
+/* sched_id - fetch ID of current scheduler */
+int sched_id(void)
+{
+    return ops.sched_id;
+}
+
+
+void continue_running(struct kvm_vcpu *vcpu)
+{
+}
+void context_saved(struct kvm_vcpu *prev)
+{
+       prev->is_running = 0;
+       cmpxchg(&prev->status, VCPU_RUNNING, VCPU_YIELD);
+
+       if(unlikely(test_bit(_VPF_migrating, &prev->pause_flags))){
+               vcpu_migrate(prev);
+       }
+}
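+
+/*
+ * Software context switch: record that prev is no longer running (and
+ * migrate it if it was flagged for migration), then mark next as running
+ * and wake the thread waiting on its wait queue.
+ */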
+void vm_context_switch(struct kvm_vcpu *prev, struct kvm_vcpu *next)
+{
+       context_saved(prev);
+       cmpxchg(&next->status, VCPU_YIELD, VCPU_RUNNING);
+       wake_up(&next->wq);
+}
+/*
+ * Returns 0.  On completion the pseudo-IRQ state is left disabled.
+ */
+static int __vcpu_schedule(int my_cpu)
+{
+    struct kvm_vcpu          *prev = current_vcpu, *next = NULL;
+    s_time_t           now;
+    ktime_t            ktime_now;
+    struct schedule_data *sd;
+    struct task_slice     next_slice;
+    s32                   r_time;     /* time for new dom to run */
+    static int cnt=0;
+    unsigned long tmp_flags;
+
+    BUG_ON(thread_preemptible());
+
+    ASSERT(!in_irq());
+
+    sd = &per_cpu(schedule_data, my_cpu);
+
+    /* if the kvm module is to be unloaded, the lock might fail */
+    if(cmpxchg(&sd->in_use, false, true) != false)
+       return 0;
+
+    spin_lock(&sd->schedule_lock);
+
+    hrtimer_cancel(&sd->s_timer);
+    ktime_now = sd->s_timer.base->get_time();
+    now = ktime_to_ns(ktime_now);
+
+    /* get policy-specific decision on scheduling... */
+    next_slice = ops.do_schedule(now);
+
+    r_time = next_slice.time;
+    next = next_slice.task;
+
+    sd->curr = next;
+
+    if(unlikely(shutting_down)) {
+       spin_unlock(&sd->schedule_lock);
+       goto finished;
+    }
+
+    if ( unlikely(prev == next) )
+    {
+           spin_unlock(&sd->schedule_lock);
+
+           continue_running(prev);
+           goto finished;
+    }
+
+    TRACE_1D(TRC_SCHED_SWITCH_INFPREV,
+             // prev->domain->domain_id,
+             now - prev->runstate.state_entry_time);
+    TRACE_2D(TRC_SCHED_SWITCH_INFNEXT,
+             // next->domain->domain_id,
+             (next->runstate.state == RUNSTATE_runnable) ?
+             (now - next->runstate.state_entry_time) : 0,
+             r_time);
+
+    ASSERT(prev->runstate.state == RUNSTATE_running);
+    vcpu_runstate_change(
+        prev,
+        (test_bit(_VPF_blocked, &prev->pause_flags) ? RUNSTATE_blocked :
+         (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)),
+        now);
+
+    ASSERT(next->runstate.state != RUNSTATE_running);
+    vcpu_runstate_change(next, RUNSTATE_running, now);
+
+    ASSERT(!next->is_running);
+    next->is_running = 1;
+
+    spin_unlock(&sd->schedule_lock);
+
+    TRACE_3D(TRC_SCHED_SWITCH, __LINE__, prev->vcpu_id, next->vcpu_id);
+    vm_context_switch(prev, next);
+finished:
+    hrtimer_cancel(&sd->watchdog);
+    if (!shutting_down) {
+       hrtimer_start(&sd->s_timer, ktime_add_ns(ktime_now, r_time),
+               HRTIMER_MODE_ABS);
+
+       /* restart the watchdog */
+       ktime_now = sd->watchdog.base->get_time();
+       hrtimer_start(&sd->watchdog, ktime_add_ns(ktime_now, WATCHDOG_NS),
+               HRTIMER_MODE_ABS);
+    }
+    sd->in_use = false;
+    return 0;
+}
+/*
+ * The main function
+ * - deschedule the current domain (scheduler independent).
+ * - pick a new domain (scheduler dependent).
+ */
+static void vcpu_schedule(unsigned long _unused)
+{
+
+    int my_cpu = raw_smp_processor_id();
+    // tasklet_disable(&per_cpu(schedule_data, my_cpu).tick_tasklet);
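+    /*
+     * Serialise with other scheduling paths via sched_state: spin while a
+     * user-space operation (SCHEDULER_USER) is in progress, back off and
+     * rearm the timer if the kernel path (SCHEDULER_KERNEL) already owns
+     * it, and otherwise claim it with cmpxchg before scheduling.
+     */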
+try_again:
+    while(per_cpu(schedule_data, my_cpu).sched_state == SCHEDULER_USER)
+       ;
+    if(per_cpu(schedule_data, my_cpu).sched_state == SCHEDULER_KERNEL) {
+       /* another path has already entered the critical section, so abort
+        * this scheduling pass and let the rearmed timer retry shortly
+        */
+       struct schedule_data *sd = &per_cpu(schedule_data, my_cpu);
+       ktime_t now = sd->s_timer.base->get_time();
+
+       hrtimer_cancel(&sd->s_timer);
+       if(!shutting_down)
+           hrtimer_start(&sd->s_timer, ktime_add_ns(now, 10000),
+                   HRTIMER_MODE_ABS);
+       // tasklet_enable(&per_cpu(schedule_data, my_cpu).tick_tasklet);
+       return;
+    }
+    if(cmpxchg(&per_cpu(schedule_data, my_cpu).sched_state,
+               SCHEDULER_FREE, SCHEDULER_KERNEL) != SCHEDULER_FREE)
+       goto try_again;
+
+    preempt_disable();
+    __vcpu_schedule(my_cpu);
+    per_cpu(schedule_data, my_cpu).sched_state = SCHEDULER_FREE;
+
+    // tasklet_enable(&per_cpu(schedule_data, my_cpu).tick_tasklet);
+    preempt_enable();
+    // put_cpu();
+
+    return;
+}
+
+void kvm_force_tasklet_schedule(void *_unused)
+{
+    get_cpu();
+    tasklet_schedule(&per_cpu(schedule_data,
+               raw_smp_processor_id()).sched_tasklet);
+    put_cpu();
+}
+/* The scheduler timer: force a run through the scheduler */
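+/*
+ * If the callback fires on a CPU other than the timer's owner, the work is
+ * handed to the owning CPU via a pending IPI and its IPI helper thread is
+ * woken; otherwise the local scheduler tasklet is kicked directly.
+ */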
+static enum hrtimer_restart s_timer_fn(struct hrtimer* timer)
+{
+    int cpu = raw_smp_processor_id();
+    struct schedule_data *sd = container_of(timer, struct schedule_data,
+           s_timer);
+    if(cpu != sd->id){
+       int r;
+       r = insert_pending_ipi(cpu, cpumask_of_cpu(sd->id),
+               kvm_force_tasklet_schedule, NULL, 1);
+       // tasklet_schedule(&sd->ipi_tasklet);
+       wake_up(&sd->ipi_wq);
+       if(r){
+           dump_traces(NULL);
+           BUG_ON(1);
+       }
+    }else{
+       per_cpu(schedule_data, raw_smp_processor_id()).can_migrate =
+               !in_atomic_preempt_off();
+       tasklet_schedule(&per_cpu(schedule_data, cpu).sched_tasklet);
+    }
+    /* no need to restart the timer. the vcpu_schedule() will do it */
+    return HRTIMER_NORESTART;
+}
+static void per_cpu_stop_sched(int cpu)
+{
+    struct schedule_data* sd = &per_cpu(schedule_data, cpu);
+    hrtimer_cancel(&sd->s_timer);
+    hrtimer_cancel(&sd->watchdog);
+}
+static void per_cpu_kill_scheduler(int cpu)
+{
+    struct schedule_data* sd = &per_cpu(schedule_data, cpu);
+    tasklet_kill(&sd->sched_tasklet);
+
+}
+
+bool shutting_down = false;
+void stop_auto_schedule(void)
+{
+    int i;
+
+    shutting_down = true;
+    barrier();
+
+    for_each_online_cpu(i) {
+       struct schedule_data *sd = &per_cpu(schedule_data, i);
+
+       sd->ipi_quit = true;
+       wake_up(&sd->ipi_wq);
+       printk("waiting for ipi helper to stop!\n");
+       while(sd->ipi_quit) schedule();
+       printk("ipi helper stopped!\n");
+       SCHED_OP(stop_schedule, i);
+       per_cpu_stop_sched(i);
+
+    }
+}
+static inline void scheduler_disabled(int cpu)
+{
+    struct schedule_data *sd = &per_cpu(schedule_data, cpu);
+    per_cpu_kill_scheduler(cpu);
+}
+void wait_scheduler_stops(void)
+{
+    int i;
+    for_each_online_cpu(i) {
+       SCHED_OP(disable_scheduler, i);
+       scheduler_disabled(i);
+    }
+}
+void scheduler_start(void)
+{
+    int i;
+
+    for_each_online_cpu(i)  {
+       if(SCHED_OP(start_scheduler, i)) {
+           printk("error starting scheduler on cpu %d!\n", i);
+           BUG_ON(1);
+       }
+    }
+}
+void scheduler_destroy(void)
+{
+    int i;
+
+    if(kvm_scheduler_root)
+       remove_proc_entry("kvm", NULL);
+
+    for_each_online_cpu(i) {
+       destroy_pending_ipi_buf(i);
+    }
+}
+
+extern int inject_pending_ipi(void *data);
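+
+/*
+ * Per-CPU scheduler bring-up: spawn a pinned ipi_helper_<cpu> kthread,
+ * initialise the schedule lock, the scheduler timer and the watchdog
+ * timer, and start the watchdog.
+ */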
+static void per_cpu_init_sched(int cpu)
+{
+       struct pending_work * pd_work;
+       ktime_t now;
+       struct schedule_data *sd = &per_cpu(schedule_data, cpu);
+       struct task_struct *task;
+       char name[]="ipi_helper_0";
+
+       sd->sched_state = SCHEDULER_FREE;
+       tasklet_init(&sd->sched_tasklet, vcpu_schedule, (unsigned long)NULL);
+
+       name[strlen(name)-1] += cpu;
+       init_waitqueue_head(&sd->ipi_wq);
+       sd->ipi_quit = false;
+       task = kthread_run(inject_pending_ipi, (void *)(unsigned long)cpu, name);
+       if(IS_ERR(task)){
+           printk("error creating ipi helper thread for cpu %d\n", cpu);
+           BUG_ON(1);
+       }
+       kvm_sched_setaffinity(task->pid, cpumask_of_cpu(cpu));
+
+       rwlock_init(&per_cpu(pseudo_cli, cpu));
+       spin_lock_init(&sd->schedule_lock);
+       sd->in_use = false;
+       sd->can_migrate = false;
+       sd->id = cpu;
+
+       /* the scheduler timer */
+       hrtimer_init(&sd->s_timer,
+               CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+       sd->s_timer.function = s_timer_fn;
+       // hrtimer_data_pointer(&sd->s_timer);
+
+       /* the watchdog timer */
+       hrtimer_init(&sd->watchdog,
+               CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+       sd->watchdog.function = dump_cpu_trace;
+       // hrtimer_data_pointer(&sd->watchdog);
+
+       /* start the watchdog */
+       now = sd->watchdog.base->get_time();
+       if(hrtimer_start(&sd->watchdog,
+                   ktime_add_ns(now, WATCHDOG_NS), HRTIMER_MODE_ABS)) {
+           printk("start watchdog timer failed!\n");
+           BUG_ON(1);
+       }
+}
+
+/* Initialise the data structures. */
+void scheduler_init(void)
+{
+       int i;
+
+       INIT_LIST_HEAD(&sched_vm_list);
+       spin_lock_init(&sched_vm_list_lock);
+
+       kvm_scheduler_root = proc_mkdir(KVM_SCHEDULER, NULL);
+       if (!kvm_scheduler_root){
+           printk("failed to create the /proc/kvm procfs entry!\n");
+           printk("user control of the scheduler will be disabled!\n");
+       }
+
+
+       for_each_online_cpu(i)
+           per_cpu_init_sched(i);
+
+       ops = *schedulers[0];
+       SCHED_OP(init);
+}
+
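+/*
+ * Debug helper: dump the active scheduler's settings and, with each CPU's
+ * schedule_lock held, its per-CPU run queue state.
+ */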
+void dump_runq(unsigned char key)
+{
+    s_time_t      now = NOW();
+    int           i;
+    unsigned long flags;
+
+    local_irq_save(flags);
+
+    printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
+    SCHED_OP(dump_settings);
+    printk("NOW=0x%08X%08X\n",  (u32)(now>>32), (u32)now);
+
+    for_each_online_cpu ( i )
+    {
+        spin_lock(&per_cpu(schedule_data, i).schedule_lock);
+        printk("CPU[%02d] ", i);
+        SCHED_OP(dump_cpu_state, i);
+        spin_unlock(&per_cpu(schedule_data, i).schedule_lock);
+    }
+
+    local_irq_restore(flags);
+}
+