From patchwork Wed Apr 22 15:01:23 2009
X-Patchwork-Submitter: alex
X-Patchwork-Id: 19388
Date: Wed, 22 Apr 2009 23:01:23 +0800
Message-ID: <820ac2e90904220801yb2bbebah80390de935dbecc8@mail.gmail.com>
Subject: patch for virtual machine oriented scheduling(6)
From: alex
To: avi@redhat.com, anthony@codemonkey.ws, kvm@vger.kernel.org
X-Mailing-List: kvm@vger.kernel.org

the "myins" tool and the related header files:

---
diff --git a/arch/x86/myins b/arch/x86/myins
new file mode 100755
index 0000000..ce6d97e
--- /dev/null
+++ b/arch/x86/myins
@@ -0,0 +1,6 @@
+#!/bin/sh
+rmmod kvm_intel
+rmmod kvm
+
+insmod $1/kvm.ko
+insmod $1/kvm-intel.ko setaffinity=0x`grep -w sched_setaffinity /proc/kallsyms | cut -d" " -f1 `
diff --git a/include/linux/ipi.h b/include/linux/ipi.h
new file mode 100644
index 0000000..2dcc0f1
--- /dev/null
+++ b/include/linux/ipi.h
@@ -0,0 +1,9 @@
+#ifndef IPI_H
+#define IPI_H
+extern void init_pending_ipi_buf(int cpu);
+extern void destroy_pending_ipi_buf(int cpu);
+extern void preempt_safe_send_ipi(cpumask_t mask, void (*func)(void*), void* data, int sync);
+extern int insert_pending_ipi(int cpu, cpumask_t mask, void (*func)(void*), void* data, int sync);
+extern bool pending_ipi_buf_empty(int cpu);
+
+#endif
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index ee755e2..bb4d761 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -14,6 +14,11 @@
 #define KVM_API_VERSION 12
 
+/* any process' pid should be less than 0xffffffff */
+#define MAX_PROCESSID_LEN 11    // the length of string "4294967295"
+#define MAX_PROCESSID ((unsigned long)0x0ffffffff)
+
+
 /* for KVM_TRACE_ENABLE */
 struct kvm_user_trace_setup {
         __u32 buf_size; /* sub_buffer size of each per-cpu */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 095ebb6..e0fd68c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -63,6 +63,48 @@ struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus,
 void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
                              struct kvm_io_device *dev);
 
+
+/* VCPU is currently running on a physical CPU. */
+#define RUNSTATE_running  0
+
+/* VCPU is runnable, but not currently scheduled on any physical CPU. */
+#define RUNSTATE_runnable 1
+
+/* VCPU is blocked (a.k.a. idle). It is therefore not runnable. */
+#define RUNSTATE_blocked  2
+
+/*
+ * VCPU is not runnable, but it is not blocked.
+ * This is a 'catch all' state for things like hotplug and pauses by the
+ * system administrator (or for critical sections in the hypervisor).
+ * RUNSTATE_blocked dominates this state (it is the preferred state).
+ */
+#define RUNSTATE_offline  3
+
+
+/*
+ * vcpu status
+ * There are two variables representing the vcpu status.
+ * pause_flags: the scheduler uses this to choose a ready vcpu.
+ *              If a vcpu is about to run, its status is set to VCPU_RUNNING;
+ *              if a vcpu is scheduled out, its status is set to VCPU_YIELD.
+ * status:      the Linux kernel uses this variable to determine whether a
+ *              vcpu is runnable.
+ */
+#define VCPU_RUNNING 1  // the vcpu is currently running
+#define VCPU_YIELD   2  // the vcpu should give up the cpu
+
+struct vcpu_runstate_info {
+        /* VCPU's current state (RUNSTATE_*). */
+        int state;
+        /* When was current state entered (system time, ns)? */
+        uint64_t state_entry_time;
+        /*
+         * Time spent in each RUNSTATE_* (ns). The sum of these times is
+         * guaranteed not to drift from system time.
+         */
+        uint64_t time[4];
+};
+
 struct kvm_vcpu {
         struct kvm *kvm;
 #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -92,6 +134,21 @@ struct kvm_vcpu {
 #endif
         struct kvm_vcpu_arch arch;
+
+        /* Added data structures */
+        volatile unsigned int status;
+        bool is_running;        /* TODO: these two variables are identical! */
+
+        // struct hrtimer timer;
+        struct task_struct* thread;
+        cpumask_t cpu_affinity;
+
+        unsigned long pause_flags;
+        atomic_t pause_count;
+        struct vcpu_runstate_info runstate;
+        void *sched_priv;       /* scheduler-specific data */
+        bool set_rt;
+        int processor;          /* the processor the scheduler thinks this vcpu is on */
 };
 
 struct kvm_memory_slot {
@@ -122,6 +179,10 @@ struct kvm_kernel_irq_routing_entry {
         struct list_head link;
 };
 
+typedef pid_t vmid_t;
+#define IDLE_VM_ID  ((vmid_t)0x0ffffffff)
+#define HOST_VM_ID  1
+#define ANONY_VM_ID 0
 struct kvm {
         struct mutex lock; /* protects the vcpus array and APIC accesses */
         spinlock_t mmu_lock;
@@ -152,6 +213,12 @@ struct kvm {
         unsigned long mmu_notifier_seq;
         long mmu_notifier_count;
 #endif
+        void *sched_priv;       /* scheduler-specific data */
+        atomic_t pause_count;
+        bool is_paused_by_controller;
+        bool is_dying;
+        vmid_t vmid;
+        struct list_head vm_link;
 };
 
 /* The guest did something we don't support. */
@@ -165,6 +232,12 @@ struct kvm {
 #define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
 #define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
+static inline int vcpu_runnable(struct kvm_vcpu *v)
+{
+        return !(v->pause_flags |
+                 atomic_read(&v->pause_count) |
+                 atomic_read(&v->kvm->pause_count));
+}
 
 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
@@ -178,6 +251,12 @@ void kvm_exit(void);
 void kvm_get_kvm(struct kvm *kvm);
 void kvm_put_kvm(struct kvm *kvm);
 
+extern long (*sched_setaffinity_p)(pid_t pid, cpumask_t* new_mask);
+static inline long kvm_sched_setaffinity(pid_t pid, cpumask_t new_mask)
+{
+        return sched_setaffinity_p(pid, &new_mask);
+}
+
 #define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
 #define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
 static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
@@ -524,4 +603,11 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {}
 
 #endif
 
+#define test_and_set_bool(b)   xchg(&(b), 1)
+#define test_and_clear_bool(b) xchg(&(b), 0)
+static inline s64 NOW(void)
+{
+        struct timespec t = current_kernel_time();
+        return timespec_to_ns(&t);
+}
 #endif
diff --git a/include/linux/sched-if.h b/include/linux/sched-if.h
new file mode 100644
index 0000000..d966cd3
--- /dev/null
+++ b/include/linux/sched-if.h
@@ -0,0 +1,225 @@
+#ifndef SCHED_IF_H
+#define SCHED_IF_H
+
+#include
+
+#if 0
+extern long (*sched_setaffinity_p)(pid_t pid, cpumask_t new_mask);
+static inline long kvm_sched_setaffinity(pid_t pid, cpumask_t new_mask)
+{
+        return sched_setaffinity_p(pid, new_mask);
+}
+extern long (*sched_setaffinity_p)(pid_t pid, cpumask_t* in_mask);
+static inline long kvm_sched_setaffinity(pid_t pid, cpumask_t new_mask)
+{
+        return sched_setaffinity_p(pid, &new_mask);
+}
+#endif
+
+#define IDLE_VM   ((unsigned int)(-1))
+#define NORMAL_VM 0
+#define HOST_VM   1
+
+#define MAX_PARAMS 1
+DECLARE_PER_CPU(rwlock_t, pseudo_cli);
+
+#ifdef CONFIG_PREEMPT
+#define thread_preemptible() (preempt_count() == 0)
+#else
+#define thread_preemptible() 0
+#endif
+
+static inline int vcpu_schedule_try_lock(struct kvm_vcpu *v)
+{
+        unsigned int cpu;
+        struct schedule_data *sd;
+
+        for ( ; ; ) {
+                int r;
+                cpu = v->processor;
+                sd = &per_cpu(schedule_data, cpu);
+                r = spin_trylock(&sd->schedule_lock);
+                if (!r)
+                        return 0;
+                if (likely(v->processor == cpu))
+                        return 1;
+                spin_unlock(&sd->schedule_lock);
+        }
+}
+
+static inline struct schedule_data* vcpu_schedule_lock(struct kvm_vcpu *v)
+{
+        unsigned int cpu;
+        struct schedule_data *sd;
+
+        for ( ; ; ) {
+                cpu = v->processor;
+                sd = &per_cpu(schedule_data, cpu);
+                spin_lock(&sd->schedule_lock);
+                if (likely(v->processor == cpu))
+                        return sd;
+                spin_unlock(&sd->schedule_lock);
+        }
+}
+
+static inline int pseudo_irq_cli(void)
+{
+        int cpu = raw_smp_processor_id();
+        struct schedule_data *sd = &per_cpu(schedule_data, cpu);
+
+        tasklet_disable(&sd->sched_tasklet);
+        tasklet_disable(&sd->tick_tasklet);
+
+        /*
+         * The tasklet may already be running at this point; try to take
+         * sched_state so that we can detect this case.
+         */
+        while (cmpxchg(&sd->sched_state,
+                       SCHEDULER_FREE, SCHEDULER_USER) != SCHEDULER_FREE)
+                schedule();
+
+        return 1;
+}
+
+static inline int pseudo_irq_save(int flags)
+{
+        int cpu = raw_smp_processor_id();
+        struct schedule_data *sd = &per_cpu(schedule_data, cpu);
+
+        if (thread_preemptible()) {
+                BUG_ON(1);
+        };
+
+        tasklet_disable(&sd->sched_tasklet);
+        tasklet_disable(&sd->tick_tasklet);
+
+        /*
+         * The tasklet may already be running at this point; try to take
+         * sched_state so that we can detect this case.
+         */
+        while (cmpxchg(&sd->sched_state,
+                       SCHEDULER_FREE, SCHEDULER_USER) != SCHEDULER_FREE)
+                schedule();
+
+        return 1;
+}
+
+static inline void pseudo_irq_sti(void)
+{
+        int cpu = raw_smp_processor_id();
+        struct schedule_data *sd = &per_cpu(schedule_data, cpu);
+
+        if (thread_preemptible()) {
+                BUG_ON(1);
+        };
+        sd->sched_state = SCHEDULER_FREE;
+        barrier();
+        tasklet_enable(&sd->sched_tasklet);
+        tasklet_enable(&sd->tick_tasklet);
+}
+
+static inline void pseudo_irq_restore(int flags)
+{
+        pseudo_irq_sti();
+}
+
+#define vcpu_schedule_lock_irqsave(v, flags) do {       \
+        struct schedule_data *sd;                       \
+        int r = pseudo_irq_save(flags);                 \
+        BUG_ON(thread_preemptible());                   \
+        if (!r) {                                       \
+                BUG_ON(1);                              \
+        };                                              \
+        sd = vcpu_schedule_lock((v));                   \
+} while ( 0 )
+
+#define vcpu_schedule_lock_irq(v) do {                  \
+        struct schedule_data *sd;                       \
+        int r;                                          \
+        BUG_ON(thread_preemptible());                   \
+        r = pseudo_irq_cli();                           \
+        if (!r) {                                       \
+                BUG_ON(1);                              \
+        };                                              \
+        sd = vcpu_schedule_lock((v));                   \
+} while ( 0 )
+
+static inline void vcpu_schedule_unlock(struct kvm_vcpu *v)
+{
+        spin_unlock(&per_cpu(schedule_data, v->processor).schedule_lock);
+}
+
+#define vcpu_schedule_unlock_irq(v) do {                \
+        vcpu_schedule_unlock(v);                        \
+        pseudo_irq_sti();                               \
+} while ( 0 )
+
+#define vcpu_schedule_unlock_irqrestore(v, flags) do {  \
+        vcpu_schedule_unlock(v);                        \
+        pseudo_irq_restore(flags);                      \
+} while ( 0 )
+
+struct kvm;
+
+struct scheduler {
+        char *name;             /* full name for this scheduler */
+        char *opt_name;         /* option name for this scheduler */
+        unsigned int sched_id;  /* ID for this scheduler */
+
+        void (*init)            (void);
+
+        int  (*init_vm)         (struct kvm*);
+        void (*destroy_vm)      (struct kvm*);
+
+        int  (*init_vcpu)       (struct kvm_vcpu *);
+        void (*destroy_vcpu)    (struct kvm_vcpu *);
+
+        void (*sleep)           (struct kvm_vcpu *);
+        void (*wake)            (struct kvm_vcpu *);
+
+        struct task_slice (*do_schedule) (s_time_t);
+
+        void (*disable_scheduler)       (int cpu);
+        int  (*start_scheduler)         (int cpu);
+
+        void (*stop_schedule)           (int cpu);
+
+        int  (*pick_cpu)                (struct kvm_vcpu *);
+        int  (*read_schedule_info)      (struct kvm*, char*, int sz);
+        int  (*write_schedule_info)     (struct kvm*, char*);
+        void (*dump_settings)           (void);
+        void (*dump_cpu_state)          (int);
+};
+
+extern struct kvm *idle_vm_kvm;
+extern struct kvm *host_vm_kvm;
+#define is_idle_vm(kvm) ((kvm) == idle_vm_kvm)
+#define is_host_vm(kvm) ((kvm) == host_vm_kvm)
+
+extern bool shutting_down;
+
+#define is_idle_vcpu(vcpu) (is_idle_vm((vcpu)->kvm))
+#define is_host_vcpu(vcpu) (is_host_vm((vcpu)->kvm))
+
+#define _VPF_blocked 0
+#define VPF_blocked  (1UL<<_VPF_blocked)
+
+/* VCPU is offline. */
+#define _VPF_down 1
+#define VPF_down  (1UL<<_VPF_down)
+
+#define _VPF_migrating 3
+#define VPF_migrating  (1UL<<_VPF_migrating)
+
+extern void sched_destroy_vcpu(struct kvm_vcpu *v);
+extern void sched_destroy_vm(struct kvm *);
+extern int  sched_init_vcpu(struct kvm_vcpu *v, unsigned int processor);
+extern int  sched_init_vm(struct kvm *kvm);
+extern void vcpu_sleep_nosync(struct kvm_vcpu *v);
+extern void vcpu_sleep_sync(struct kvm_vcpu *v);
+extern void vcpu_wake(struct kvm_vcpu *v);
+extern void vm_pause(struct kvm *kvm);
+extern void vm_unpause(struct kvm *kvm);
+extern void scheduler_init(void);
+extern void wait_scheduler_stops(void);
+extern void scheduler_destroy(void);
+extern void scheduler_stop_tickers(void);
+extern void vm_pause_by_systemcontroller(struct kvm* kvm);
+extern void vm_unpause_by_systemcontroller(struct kvm* kvm);
+extern void stop_auto_schedule(void);
+extern void scheduler_start(void);
+
+#define current_vcpu (per_cpu(schedule_data, raw_smp_processor_id()).curr)
+#endif
diff --git a/include/linux/schedule.h b/include/linux/schedule.h
new file mode 100644
index 0000000..896da0a
--- /dev/null
+++ b/include/linux/schedule.h
@@ -0,0 +1,35 @@
+#ifndef _SCHEDULE_H
+#define _SCHEDULE_H
+#include
+
+typedef s64 s_time_t;
+
+struct task_slice {
+        struct kvm_vcpu *task;
+        s_time_t time;
+};
+
+#define SCHEDULER_FREE   0      /* no one is using the scheduler */
+#define SCHEDULER_USER   1      /* in use on behalf of a user request */
+#define SCHEDULER_KERNEL 2      /* in use by the scheduler or the ticker */
+
+struct schedule_data {
+        spinlock_t schedule_lock;               /* spinlock protecting curr */
+        struct kvm_vcpu *curr;                  /* current task */
+        struct kvm_vcpu *idle;                  /* idle task for this cpu */
+        void *sched_priv;
+        struct hrtimer s_timer;                 /* scheduling timer */
+        int id;                                 /* the cpu id */
+        struct hrtimer watchdog;                /* the watchdog timer */
+        struct tasklet_struct sched_tasklet;    /* per-cpu schedule tasklet */
+        wait_queue_head_t ipi_wq;               /* ipi helper thread waitqueue */
+        volatile bool ipi_quit;                 /* the ipi helper thread should quit */
+        struct tasklet_struct tick_tasklet;     /* per-cpu tick tasklet */
+        volatile int sched_state;               /* the scheduler state */
+        volatile bool in_use;                   /* indicates whether the scheduler can run */
+        volatile bool can_migrate;
+};
+
+DECLARE_PER_CPU(struct schedule_data, schedule_data);
+
+#endif
diff --git a/include/linux/trace.h b/include/linux/trace.h
new file mode 100755
index 0000000..3f64df6
--- /dev/null
+++ b/include/linux/trace.h
@@ -0,0 +1,60 @@
+#ifndef TRACE_H
+#define TRACE_H
+
+#define NR_TRACES 500
+struct t_rec {
+        u64 cycles;             /* local (and also global) cpu tsc */
+        u32 event;              /* event id */
+        unsigned long data[5];  /* event data items */
+};
+
+struct trace_logger {
+        struct t_rec* buf;
+        spinlock_t lock;
+        int ptr;
+};
+
+#define WATCHDOG_NS 5000000000
+
+extern int init_trace_buf(int cpu);
+extern void free_trace_buf(int cpu);
+extern enum hrtimer_restart dump_traces(void*);
+extern enum hrtimer_restart dump_cpu_trace(struct hrtimer*);
+extern void trace(u32 event, unsigned long d0, unsigned long d1,
+                  unsigned long d2, unsigned long d3, unsigned long d4);
+
+#define TRC_SCHED_VM_ADD                0
+#define TRC_SCHED_SLEEP                 1
+#define TRC_SCHED_WAKE                  2
+#define TRC_SCHED_YIELD                 3
+#define TRC_SCHED_SWITCH_INFPREV        4
+#define TRC_SCHED_SWITCH_INFNEXT        5
+#define TRC_SCHED_SWITCH                6
+
+#define TRC_SCHED_TIMER                 7
+#define TRC_SCHED_RELAYED_TIMER         8
+#define TRC_VCPU_SCHEDULE               9
+#define TRC_RUNQ_TICKLE                 10
+#define TRC_TASKLET_SCHEDULE            11
+#define TRC_CSCHED_TICK                 12
+#define TRC_CSCHED_SLEEP                13
+#define TRC_CSCHED_WAKE                 14
+#define TRC_INTERNAL_CSCHED_TICK        15
+#define TRC_DO_PENDING                  16
+#define TRC_PSEUDO_CLI                  17
+#define TRC_PSEUDO_STI                  18
+#define TRC_LOAD_BALANCE                19
+#define TRC_CSCHED_SCHEDULE             20
+#define TRC_PSEUDO_INTR                 21
+#define TRC_PSEUDO_EOI                  22
+#define TRC_PSEUDO_OPEN_INTR            23
+#define TRC_TIMER_FN                    24
+#define TRC_SEND_IPI                    25
+#define TRC_INJECT_PEND_IPI             26
+#define TRC_PEND_INTR                   27
+#define TRC_RUNQ_SORT                   28
+
+#define TRACE_0D(e)                     trace((e), 0, 0, 0, 0, 0)
+#define TRACE_1D(e,d)                   trace((e), (d), 0, 0, 0, 0)
+#define TRACE_2D(e,d1,d2)               trace((e), (d1), (d2), 0, 0, 0)
+#define TRACE_3D(e,d1,d2,d3)            trace((e), (d1), (d2), (d3), 0, 0)
+#define TRACE_4D(e,d1,d2,d3,d4)         trace((e), (d1), (d2), (d3), (d4), 0)
+#define TRACE_5D(e,d1,d2,d3,d4,d5)      trace((e), (d1), (d2), (d3), (d4), (d5))
+#endif
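
As a rough illustration of how the declarations above are meant to be combined on a
wake-up path (this sketch is not part of the patch; the 'ops' scheduler instance passed
in is hypothetical and example_vcpu_wake() is only an illustration), the per-vcpu
scheduler lock, the pause_flags bits, the per-scheduler wake hook and the trace macros
would typically be used together like this:

/*
 * Illustrative sketch only -- NOT part of the patch.  'ops' stands for
 * whichever struct scheduler instance is currently active; the helpers,
 * fields and trace macros all come from the headers introduced above.
 */
static void example_vcpu_wake(struct kvm_vcpu *v, struct scheduler *ops)
{
        vcpu_schedule_lock_irq(v);      /* tasklets off, per-CPU schedule_lock held */

        if (test_and_clear_bit(_VPF_blocked, &v->pause_flags) &&
            vcpu_runnable(v))
                ops->wake(v);           /* scheduler-specific: put v on a runqueue */

        TRACE_2D(TRC_SCHED_WAKE, v->kvm->vmid, v->vcpu_id);

        vcpu_schedule_unlock_irq(v);    /* drop the lock, re-enable the tasklets */
}

The real vcpu_wake() in the scheduler core does more bookkeeping (runstate accounting,
migration flags), but this is the locking and tracing pattern the helpers above exist for.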