From patchwork Wed Apr 22 15:01:23 2009
X-Patchwork-Submitter: alex
X-Patchwork-Id: 19388
Date: Wed, 22 Apr 2009 23:01:23 +0800
Message-ID: <820ac2e90904220801yb2bbebah80390de935dbecc8@mail.gmail.com>
Subject: patch for virtual machine oriented scheduling(6)
From: alex
To: avi@redhat.com, anthony@codemonkey.ws, kvm@vger.kernel.org
X-Mailing-List: kvm@vger.kernel.org

the "myins" tool and the related header files:

---
diff --git a/arch/x86/myins b/arch/x86/myins
new file mode 100755
index 0000000..ce6d97e
--- /dev/null
+++ b/arch/x86/myins
@@ -0,0 +1,6 @@
+#!/bin/sh
+rmmod kvm_intel
+rmmod kvm
+
+insmod $1/kvm.ko
+insmod $1/kvm-intel.ko setaffinity=0x`grep -w sched_setaffinity /proc/kallsyms | cut -d" " -f1 `
diff --git a/include/linux/ipi.h b/include/linux/ipi.h
new file mode 100644
index 0000000..2dcc0f1
--- /dev/null
+++ b/include/linux/ipi.h
@@ -0,0 +1,9 @@
+#ifndef IPI_H
+#define IPI_H
+extern void init_pending_ipi_buf(int cpu);
+extern void destroy_pending_ipi_buf(int cpu);
+extern void preempt_safe_send_ipi(cpumask_t mask, void (*func)(void*), void* data, int sync);
+extern int insert_pending_ipi(int cpu, cpumask_t mask, void (*func)(void*), void* data, int sync);
+extern bool pending_ipi_buf_empty(int cpu);
+
+#endif
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index ee755e2..bb4d761 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -14,6 +14,11 @@
 #define KVM_API_VERSION 12
 
+/* any process' pid should be less than 0xffffffff */
+#define MAX_PROCESSID_LEN 11    // the length of string "4294967295"
+#define MAX_PROCESSID ((unsigned long)0x0ffffffff)
+
+
 /* for KVM_TRACE_ENABLE */
 struct kvm_user_trace_setup {
         __u32 buf_size; /* sub_buffer size of each per-cpu */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 095ebb6..e0fd68c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -63,6 +63,48 @@ struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus,
 void kvm_io_bus_register_dev(struct kvm_io_bus *bus,
                              struct kvm_io_device *dev);
 
+
+/* VCPU is currently running on a physical CPU. */
+#define RUNSTATE_running  0
+
+/* VCPU is runnable, but not currently scheduled on any physical CPU. */
+#define RUNSTATE_runnable 1
+
+/* VCPU is blocked (a.k.a. idle). It is therefore not runnable. */
+#define RUNSTATE_blocked  2
+
+/*
+ * VCPU is not runnable, but it is not blocked.
+ * This is a 'catch all' state for things like hotplug and pauses by the
+ * system administrator (or for critical sections in the hypervisor).
+ * RUNSTATE_blocked dominates this state (it is the preferred state).
+ */
+#define RUNSTATE_offline  3
+
+
+/*
+ * vcpu status
+ * There are two variables representing the vcpu status.
+ * pause_flags: the scheduler uses this to choose a ready vcpu.
+ *              If a vcpu is about to run, its status is set to VCPU_RUNNING;
+ *              if a vcpu is scheduled out, its status is set to VCPU_YIELD.
+ * status:      the Linux kernel uses this variable to determine whether a
+ *              vcpu is runnable.
+ */
+#define VCPU_RUNNING 1  // the vcpu is currently running
+#define VCPU_YIELD   2  // the vcpu should give up the cpu
+
+struct vcpu_runstate_info {
+        /* VCPU's current state (RUNSTATE_*). */
+        int state;
+        /* When was current state entered (system time, ns)? */
+        uint64_t state_entry_time;
+        /*
+         * Time spent in each RUNSTATE_* (ns). The sum of these times is
+         * guaranteed not to drift from system time.
+         */
+        uint64_t time[4];
+};
+
 struct kvm_vcpu {
         struct kvm *kvm;
 #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -92,6 +134,21 @@ struct kvm_vcpu {
 #endif
         struct kvm_vcpu_arch arch;
+
+        /* Added data structures */
+        volatile unsigned int status;
+        bool is_running;        /* TODO: these two variables are identical! */
+
+        // struct hrtimer timer;
+        struct task_struct* thread;
+        cpumask_t cpu_affinity;
+
+        unsigned long pause_flags;
+        atomic_t pause_count;
+        struct vcpu_runstate_info runstate;
+        void *sched_priv;       /* scheduler-specific data */
+        bool set_rt;
+        int processor;          /* the processor the scheduler thinks this vcpu is on */
 };
 
 struct kvm_memory_slot {
@@ -122,6 +179,10 @@ struct kvm_kernel_irq_routing_entry {
         struct list_head link;
 };
 
+typedef pid_t vmid_t;
+#define IDLE_VM_ID  ((vmid_t)0x0ffffffff)
+#define HOST_VM_ID  1
+#define ANONY_VM_ID 0
 struct kvm {
         struct mutex lock; /* protects the vcpus array and APIC accesses */
         spinlock_t mmu_lock;
@@ -152,6 +213,12 @@ struct kvm {
         unsigned long mmu_notifier_seq;
         long mmu_notifier_count;
 #endif
+        void *sched_priv;       /* scheduler-specific data */
+        atomic_t pause_count;
+        bool is_paused_by_controller;
+        bool is_dying;
+        vmid_t vmid;
+        struct list_head vm_link;
 };
 
 /* The guest did something we don't support. */
@@ -165,6 +232,12 @@ struct kvm {
 #define kvm_printf(kvm, fmt ...) printk(KERN_DEBUG fmt)
 #define vcpu_printf(vcpu, fmt...) kvm_printf(vcpu->kvm, fmt)
+static inline int vcpu_runnable(struct kvm_vcpu *v)
+{
+        return !(v->pause_flags |
+                 atomic_read(&v->pause_count) |
+                 atomic_read(&v->kvm->pause_count));
+}
 
 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id);
 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
@@ -178,6 +251,12 @@ void kvm_exit(void);
 void kvm_get_kvm(struct kvm *kvm);
 void kvm_put_kvm(struct kvm *kvm);
 
+extern long (*sched_setaffinity_p)(pid_t pid, cpumask_t* new_mask);
+static inline long kvm_sched_setaffinity(pid_t pid, cpumask_t new_mask)
+{
+        return sched_setaffinity_p(pid, &new_mask);
+}
+
 #define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
 #define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
 static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
@@ -524,4 +603,11 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {}
 
 #endif
 
+#define test_and_set_bool(b)   xchg(&(b), 1)
+#define test_and_clear_bool(b) xchg(&(b), 0)
+static inline s64 NOW(void)
+{
+        struct timespec t = current_kernel_time();
+        return timespec_to_ns(&t);
+}
 #endif
diff --git a/include/linux/sched-if.h b/include/linux/sched-if.h
new file mode 100644
index 0000000..d966cd3
--- /dev/null
+++ b/include/linux/sched-if.h
@@ -0,0 +1,225 @@
+#ifndef SCHED_IF_H
+#define SCHED_IF_H
+
+#include
+
+#if 0
+extern long (*sched_setaffinity_p)(pid_t pid, cpumask_t new_mask);
+static inline long kvm_sched_setaffinity(pid_t pid, cpumask_t new_mask)
+{
+        return sched_setaffinity_p(pid, new_mask);
+}
+extern long (*sched_setaffinity_p)(pid_t pid, cpumask_t* in_mask);
+static inline long kvm_sched_setaffinity(pid_t pid, cpumask_t new_mask)
+{
+        return sched_setaffinity_p(pid, &new_mask);
+}
+#endif
+
+#define IDLE_VM   ((unsigned int)(-1))
+#define NORMAL_VM 0
+#define HOST_VM   1
+
+#define MAX_PARAMS 1
+DECLARE_PER_CPU(rwlock_t, pseudo_cli);
+
+#ifdef CONFIG_PREEMPT
+#define thread_preemptible() (preempt_count() == 0)
+#else
+#define thread_preemptible() 0
+#endif
+
+static inline int vcpu_schedule_try_lock(struct kvm_vcpu *v)
+{
+        unsigned int cpu;
+        struct schedule_data *sd;
+
+        for ( ; ; ) {
+                int r;
+                cpu = v->processor;
+                sd = &per_cpu(schedule_data, cpu);
+                r = spin_trylock(&sd->schedule_lock);
+                if (!r)
+                        return 0;
+                if (likely(v->processor == cpu))
+                        return 1;
+                spin_unlock(&sd->schedule_lock);
+        }
+}
+
+static inline struct schedule_data* vcpu_schedule_lock(struct kvm_vcpu *v)
+{
+        unsigned int cpu;
+        struct schedule_data *sd;
+
+        for ( ; ; ) {
+                cpu = v->processor;
+                sd = &per_cpu(schedule_data, cpu);
+                spin_lock(&sd->schedule_lock);
+                if (likely(v->processor == cpu))
+                        return sd;
+                spin_unlock(&sd->schedule_lock);
+        }
+}
+
+static inline int pseudo_irq_cli(void)
+{
+        int cpu = raw_smp_processor_id();
+        struct schedule_data *sd = &per_cpu(schedule_data, cpu);
+
+        tasklet_disable(&sd->sched_tasklet);
+        tasklet_disable(&sd->tick_tasklet);
+
+        /*
+         * The tasklet may already be running at this point; try to take
+         * sched_state so that we can detect this case.
+         */
+        while (cmpxchg(&sd->sched_state,
+                       SCHEDULER_FREE, SCHEDULER_USER) != SCHEDULER_FREE)
+                schedule();
+
+        return 1;
+}
+
+static inline int pseudo_irq_save(int flags)
+{
+        int cpu = raw_smp_processor_id();
+        struct schedule_data *sd = &per_cpu(schedule_data, cpu);
+
+        if (thread_preemptible()) {
+                BUG_ON(1);
+        };
+
+        tasklet_disable(&sd->sched_tasklet);
+        tasklet_disable(&sd->tick_tasklet);
+
+        /*
+         * The tasklet may already be running at this point; try to take
+         * sched_state so that we can detect this case.
+         */
+        while (cmpxchg(&sd->sched_state,
+                       SCHEDULER_FREE, SCHEDULER_USER) != SCHEDULER_FREE)
+                schedule();
+
+        return 1;
+}
+
+static inline void pseudo_irq_sti(void)
+{
+        int cpu = raw_smp_processor_id();
+        struct schedule_data *sd = &per_cpu(schedule_data, cpu);
+
+        if (thread_preemptible()) {
+                BUG_ON(1);
+        };
+        sd->sched_state = SCHEDULER_FREE;
+        barrier();
+        tasklet_enable(&sd->sched_tasklet);
+        tasklet_enable(&sd->tick_tasklet);
+}
+
+static inline void pseudo_irq_restore(int flags)
+{
+        pseudo_irq_sti();
+}
+
+#define vcpu_schedule_lock_irqsave(v, flags) do {       \
+        struct schedule_data *sd;                       \
+        int r = pseudo_irq_save(flags);                 \
+        BUG_ON(thread_preemptible());                   \
+        if (!r) {                                       \
+                BUG_ON(1);                              \
+        };                                              \
+        sd = vcpu_schedule_lock((v));                   \
+} while ( 0 )
+
+#define vcpu_schedule_lock_irq(v) do {                  \
+        struct schedule_data *sd;                       \
+        int r;                                          \
+        BUG_ON(thread_preemptible());                   \
+        r = pseudo_irq_cli();                           \
+        if (!r) {                                       \
+                BUG_ON(1);                              \
+        };                                              \
+        sd = vcpu_schedule_lock((v));                   \
+} while ( 0 )
+
+static inline void vcpu_schedule_unlock(struct kvm_vcpu *v)
+{
+        spin_unlock(&per_cpu(schedule_data, v->processor).schedule_lock);
+}
+
+#define vcpu_schedule_unlock_irq(v) do {                \
+        vcpu_schedule_unlock(v);                        \
+        pseudo_irq_sti();                               \
+} while ( 0 )
+
+#define vcpu_schedule_unlock_irqrestore(v, flags) do {  \
+        vcpu_schedule_unlock(v);                        \
+        pseudo_irq_restore(flags);                      \
+} while ( 0 )
+
+struct kvm;
+
+struct scheduler {
+        char *name;             /* full name for this scheduler */
+        char *opt_name;         /* option name for this scheduler */
+        unsigned int sched_id;  /* ID for this scheduler */
+
+        void (*init)            (void);
+
+        int  (*init_vm)         (struct kvm*);
+        void (*destroy_vm)      (struct kvm*);
+
+        int  (*init_vcpu)       (struct kvm_vcpu *);
+        void (*destroy_vcpu)    (struct kvm_vcpu *);
+
+        void (*sleep)           (struct kvm_vcpu *);
+        void (*wake)            (struct kvm_vcpu *);
+
+        struct task_slice (*do_schedule) (s_time_t);
+
+        void (*disable_scheduler)       (int cpu);
+        int  (*start_scheduler)         (int cpu);
+
+        void (*stop_schedule)           (int cpu);
+
+        int  (*pick_cpu)                (struct kvm_vcpu *);
+        int  (*read_schedule_info)      (struct kvm*, char*, int sz);
+        int  (*write_schedule_info)     (struct kvm*, char*);
+        void (*dump_settings)           (void);
+        void (*dump_cpu_state)          (int);
+};
+
+extern struct kvm *idle_vm_kvm;
+extern struct kvm *host_vm_kvm;
+#define is_idle_vm(kvm) ((kvm) == idle_vm_kvm)
+#define is_host_vm(kvm) ((kvm) == host_vm_kvm)
+
+extern bool shutting_down;
+
+#define is_idle_vcpu(vcpu) (is_idle_vm((vcpu)->kvm))
+#define is_host_vcpu(vcpu) (is_host_vm((vcpu)->kvm))
+
+#define _VPF_blocked 0
+#define VPF_blocked  (1UL<<_VPF_blocked)
+
+/* VCPU is offline. */
+#define _VPF_down 1
+#define VPF_down  (1UL<<_VPF_down)
+
+#define _VPF_migrating 3
+#define VPF_migrating  (1UL<<_VPF_migrating)
+
+extern void sched_destroy_vcpu(struct kvm_vcpu *v);
+extern void sched_destroy_vm(struct kvm *);
+extern int  sched_init_vcpu(struct kvm_vcpu *v, unsigned int processor);
+extern int  sched_init_vm(struct kvm *kvm);
+extern void vcpu_sleep_nosync(struct kvm_vcpu *v);
+extern void vcpu_sleep_sync(struct kvm_vcpu *v);
+extern void vcpu_wake(struct kvm_vcpu *v);
+extern void vm_pause(struct kvm *kvm);
+extern void vm_unpause(struct kvm *kvm);
+extern void scheduler_init(void);
+extern void wait_scheduler_stops(void);
+extern void scheduler_destroy(void);
+extern void scheduler_stop_tickers(void);
+extern void vm_pause_by_systemcontroller(struct kvm* kvm);
+extern void vm_unpause_by_systemcontroller(struct kvm* kvm);
+extern void stop_auto_schedule(void);
+extern void scheduler_start(void);
+
+#define current_vcpu (per_cpu(schedule_data, raw_smp_processor_id()).curr)
+#endif
diff --git a/include/linux/schedule.h b/include/linux/schedule.h
new file mode 100644
index 0000000..896da0a
--- /dev/null
+++ b/include/linux/schedule.h
@@ -0,0 +1,35 @@
+#ifndef _SCHEDULE_H
+#define _SCHEDULE_H
+#include
+
+typedef s64 s_time_t;
+
+struct task_slice {
+        struct kvm_vcpu *task;
+        s_time_t time;
+};
+
+#define SCHEDULER_FREE   0      /* no one is using the scheduler */
+#define SCHEDULER_USER   1      /* in use on behalf of a user request */
+#define SCHEDULER_KERNEL 2      /* in use by the scheduler or the ticker */
+
+struct schedule_data {
+        spinlock_t schedule_lock;               /* spinlock protecting curr */
+        struct kvm_vcpu *curr;                  /* current task */
+        struct kvm_vcpu *idle;                  /* idle task for this cpu */
+        void *sched_priv;
+        struct hrtimer s_timer;                 /* scheduling timer */
+        int id;                                 /* the cpu id */
+        struct hrtimer watchdog;                /* the watchdog timer */
+        struct tasklet_struct sched_tasklet;    /* per-cpu schedule tasklet */
+        wait_queue_head_t ipi_wq;               /* ipi helper thread waitqueue */
+        volatile bool ipi_quit;                 /* the ipi helper thread should quit */
+        struct tasklet_struct tick_tasklet;     /* per-cpu tick tasklet */
+        volatile int sched_state;               /* the scheduler state */
+        volatile bool in_use;                   /* indicates whether the scheduler can run */
+        volatile bool can_migrate;
+};
+
+DECLARE_PER_CPU(struct schedule_data, schedule_data);
+
+#endif
diff --git a/include/linux/trace.h b/include/linux/trace.h
new file mode 100755
index 0000000..3f64df6
--- /dev/null
+++ b/include/linux/trace.h
@@ -0,0 +1,60 @@
+#ifndef TRACE_H
+#define TRACE_H
+
+#define NR_TRACES 500
+struct t_rec {
+        u64 cycles;             /* local (and also global) cpu tsc */
+        u32 event;              /* event id */
+        unsigned long data[5];  /* event data items */
+};
+
+struct trace_logger {
+        struct t_rec* buf;
+        spinlock_t lock;
+        int ptr;
+};
+
+#define WATCHDOG_NS 5000000000
+
+extern int init_trace_buf(int cpu);
+extern void free_trace_buf(int cpu);
+extern enum hrtimer_restart dump_traces(void*);
+extern enum hrtimer_restart dump_cpu_trace(struct hrtimer*);
+extern void trace(u32 event, unsigned long d0, unsigned long d1,
+                  unsigned long d2, unsigned long d3, unsigned long d4);
+
+#define TRC_SCHED_VM_ADD                0
+#define TRC_SCHED_SLEEP                 1
+#define TRC_SCHED_WAKE                  2
+#define TRC_SCHED_YIELD                 3
+#define TRC_SCHED_SWITCH_INFPREV        4
+#define TRC_SCHED_SWITCH_INFNEXT        5
+#define TRC_SCHED_SWITCH                6
+
+#define TRC_SCHED_TIMER                 7
+#define TRC_SCHED_RELAYED_TIMER         8
+#define TRC_VCPU_SCHEDULE               9
+#define TRC_RUNQ_TICKLE                 10
+#define TRC_TASKLET_SCHEDULE            11
+#define TRC_CSCHED_TICK                 12
+#define TRC_CSCHED_SLEEP                13
+#define TRC_CSCHED_WAKE                 14
+#define TRC_INTERNAL_CSCHED_TICK        15
+#define TRC_DO_PENDING                  16
+#define TRC_PSEUDO_CLI                  17
+#define TRC_PSEUDO_STI                  18
+#define TRC_LOAD_BALANCE                19
+#define TRC_CSCHED_SCHEDULE             20
+#define TRC_PSEUDO_INTR                 21
+#define TRC_PSEUDO_EOI                  22
+#define TRC_PSEUDO_OPEN_INTR            23
+#define TRC_TIMER_FN                    24
+#define TRC_SEND_IPI                    25
+#define TRC_INJECT_PEND_IPI             26
+#define TRC_PEND_INTR                   27
+#define TRC_RUNQ_SORT                   28
+
+#define TRACE_0D(e)                     trace((e), 0, 0, 0, 0, 0)
+#define TRACE_1D(e,d)                   trace((e), (d), 0, 0, 0, 0)
+#define TRACE_2D(e,d1,d2)               trace((e), (d1), (d2), 0, 0, 0)
+#define TRACE_3D(e,d1,d2,d3)            trace((e), (d1), (d2), (d3), 0, 0)
+#define TRACE_4D(e,d1,d2,d3,d4)         trace((e), (d1), (d2), (d3), (d4), 0)
+#define TRACE_5D(e,d1,d2,d3,d4,d5)      trace((e), (d1), (d2), (d3), (d4), (d5))
+#endif
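
As a rough illustration of how the declarations above are meant to be combined on a
wake-up path (this sketch is not part of the patch; the 'ops' scheduler instance passed
in is hypothetical and example_vcpu_wake() is only an illustration), the per-vcpu
scheduler lock, the pause_flags bits, the per-scheduler wake hook and the trace macros
would typically be used together like this:

/*
 * Illustrative sketch only -- NOT part of the patch.  'ops' stands for
 * whichever struct scheduler instance is currently active; the helpers,
 * fields and trace macros all come from the headers introduced above.
 */
static void example_vcpu_wake(struct kvm_vcpu *v, struct scheduler *ops)
{
        vcpu_schedule_lock_irq(v);      /* tasklets off, per-CPU schedule_lock held */

        if (test_and_clear_bit(_VPF_blocked, &v->pause_flags) &&
            vcpu_runnable(v))
                ops->wake(v);           /* scheduler-specific: put v on a runqueue */

        TRACE_2D(TRC_SCHED_WAKE, v->kvm->vmid, v->vcpu_id);

        vcpu_schedule_unlock_irq(v);    /* drop the lock, re-enable the tasklets */
}

The real vcpu_wake() in the scheduler core does more bookkeeping (runstate accounting,
migration flags), but this is the locking and tracing pattern the helpers above exist for.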