[1/1] Improved RTDS scheduler

Message ID	1451555121-8085-2-git-send-email-tiche@seas.upenn.edu (mailing list archive)
State	New, archived
Headers	show Return-Path: <xen-devel-bounces@lists.xen.org> From: Tianyang Chen <tiche@seas.upenn.edu> To: xen-devel@lists.xenproject.org Date: Thu, 31 Dec 2015 04:45:21 -0500 Message-Id: <1451555121-8085-2-git-send-email-tiche@seas.upenn.edu> In-Reply-To: <1451555121-8085-1-git-send-email-tiche@seas.upenn.edu> References: <1451555121-8085-1-git-send-email-tiche@seas.upenn.edu> Cc: ian.campbell@citrix.com, dario.faggioli@citrix.com, Tianyang Chen <tiche@seas.upenn.edu>, ian.jackson@eu.citrix.com, george.dunlap@citrix.com, Meng Xu <mengxu@cis.upenn.edu>, jbeulich@suse.com, Dagaen Golomb <dgolomb@seas.upenn.edu> Subject: [Xen-devel] [PATCH 1/1] Improved RTDS scheduler Precedence: list MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Sender: xen-devel-bounces@lists.xen.org Errors-To: xen-devel-bounces@lists.xen.org

diff --git a/0000-cover-letter.patch b/0000-cover-letter.patch new file mode 100644 index 0000000..f5aca91 --- /dev/null +++ b/0000-cover-letter.patch @@ -0,0 +1,16 @@ +From 25ca27ea281885eb9873244a11f08e6987efb36e Mon Sep 17 00:00:00 2001 +From: Tianyang Chen <tiche@seas.upenn.edu> +Date: Thu, 31 Dec 2015 04:05:21 -0500 +Subject: [PATCH] *** SUBJECT HERE *** + +*** BLURB HERE *** + +Tianyang Chen (1): + Improved RTDS scheduler + + xen/common/sched_rt.c | 159 +++++++++++++++++++++++++++++-------------------- + 1 file changed, 95 insertions(+), 64 deletions(-) + +-- +1.7.9.5 + diff --git a/0001-Improved-RTDS-scheduler.patch b/0001-Improved-RTDS-scheduler.patch new file mode 100644 index 0000000..02cb1f7 --- /dev/null +++ b/0001-Improved-RTDS-scheduler.patch @@ -0,0 +1,280 @@ +From 25ca27ea281885eb9873244a11f08e6987efb36e Mon Sep 17 00:00:00 2001 +From: Tianyang Chen <tianyangpenn@gmail.com> +Date: Thu, 31 Dec 2015 01:55:19 -0500 +Subject: [PATCH] Improved RTDS scheduler + +Budget replenishment is now handled by a dedicated timer which is +triggered at the most imminent release time of all runnable vcpus. + +Signed-off-by: Tianyang Chen <tiche@seas.upenn.edu> +Signed-off-by: Meng Xu <mengxu@cis.upenn.edu> +Signed-off-by: Dagaen Golomb <dgolomb@seas.upenn.edu> +--- + xen/common/sched_rt.c | 159 +++++++++++++++++++++++++++++-------------------- + 1 file changed, 95 insertions(+), 64 deletions(-) + +diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c +index 4372486..d522272 100644 +--- a/xen/common/sched_rt.c ++++ b/xen/common/sched_rt.c +@@ -16,6 +16,7 @@ + #include <xen/delay.h> + #include <xen/event.h> + #include <xen/time.h> ++#include <xen/timer.h> + #include <xen/perfc.h> + #include <xen/sched-if.h> + #include <xen/softirq.h> +@@ -147,6 +148,16 @@ static unsigned int nr_rt_ops; + * Global lock is referenced by schedule_data.schedule_lock from all + * physical cpus. It can be grabbed via vcpu_schedule_lock_irq() + */ ++ ++/* dedicated timer for replenishment */ ++static struct timer repl_timer; ++ ++/* controls when to first start the timer*/ ++static int timer_started; ++ ++/* handler for the replenishment timer */ ++static void repl_handler(void *data); ++ + struct rt_private { + spinlock_t lock; /* the global coarse grand lock */ + struct list_head sdom; /* list of availalbe domains, used for dump */ +@@ -426,6 +437,7 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc) + static int + rt_init(struct scheduler *ops) + { ++ const int cpu = smp_processor_id(); + struct rt_private *prv = xzalloc(struct rt_private); + + printk("Initializing RTDS scheduler\n" +@@ -454,6 +466,8 @@ rt_init(struct scheduler *ops) + + ops->sched_data = prv; + ++ init_timer(&repl_timer, repl_handler, ops, 0); ++ + return 0; + + no_mem: +@@ -473,6 +487,9 @@ rt_deinit(const struct scheduler *ops) + xfree(_cpumask_scratch); + _cpumask_scratch = NULL; + } ++ ++ kill_timer(&repl_timer); ++ + xfree(prv); + } + +@@ -635,6 +652,13 @@ rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc) + + /* add rt_vcpu svc to scheduler-specific vcpu list of the dom */ + list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu); ++ ++ if(!timer_started) ++ { ++ /* the first vcpu starts the timer for the first time*/ ++ timer_started = 1; ++ set_timer(&repl_timer,svc->cur_deadline); ++ } + } + + /* +@@ -792,44 +816,6 @@ __runq_pick(const struct scheduler *ops, const cpumask_t *mask) + } + + /* +- * Update vcpu's budget and +- * sort runq by insert the modifed vcpu back to runq +- * lock is grabbed before calling this function +- */ +-static void +-__repl_update(const struct scheduler *ops, s_time_t now) +-{ +- struct list_head *runq = rt_runq(ops); +- struct list_head *depletedq = rt_depletedq(ops); +- struct list_head *iter; +- struct list_head *tmp; +- struct rt_vcpu *svc = NULL; +- +- list_for_each_safe(iter, tmp, runq) +- { +- svc = __q_elem(iter); +- if ( now < svc->cur_deadline ) +- break; +- +- rt_update_deadline(now, svc); +- /* reinsert the vcpu if its deadline is updated */ +- __q_remove(svc); +- __runq_insert(ops, svc); +- } +- +- list_for_each_safe(iter, tmp, depletedq) +- { +- svc = __q_elem(iter); +- if ( now >= svc->cur_deadline ) +- { +- rt_update_deadline(now, svc); +- __q_remove(svc); /* remove from depleted queue */ +- __runq_insert(ops, svc); /* add to runq */ +- } +- } +-} +- +-/* + * schedule function for rt scheduler. + * The lock is already grabbed in schedule.c, no need to lock here + */ +@@ -848,7 +834,6 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched + /* burn_budget would return for IDLE VCPU */ + burn_budget(ops, scurr, now); + +- __repl_update(ops, now); + + if ( tasklet_work_scheduled ) + { +@@ -889,7 +874,7 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched + } + } + +- ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */ ++ ret.time = snext->budget; /* invoke the scheduler next time */ + ret.task = snext->vcpu; + + /* TRACE */ +@@ -1033,10 +1018,6 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc) + { + struct rt_vcpu * const svc = rt_vcpu(vc); + s_time_t now = NOW(); +- struct rt_private *prv = rt_priv(ops); +- struct rt_vcpu *snext = NULL; /* highest priority on RunQ */ +- struct rt_dom *sdom = NULL; +- cpumask_t *online; + + BUG_ON( is_idle_vcpu(vc) ); + +@@ -1074,14 +1055,7 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc) + /* insert svc to runq/depletedq because svc is not in queue now */ + __runq_insert(ops, svc); + +- __repl_update(ops, now); +- +- ASSERT(!list_empty(&prv->sdom)); +- sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem); +- online = cpupool_scheduler_cpumask(sdom->dom->cpupool); +- snext = __runq_pick(ops, online); /* pick snext from ALL valid cpus */ +- +- runq_tickle(ops, snext); ++ runq_tickle(ops, svc); + + return; + } +@@ -1094,10 +1068,6 @@ static void + rt_context_saved(const struct scheduler *ops, struct vcpu *vc) + { + struct rt_vcpu *svc = rt_vcpu(vc); +- struct rt_vcpu *snext = NULL; +- struct rt_dom *sdom = NULL; +- struct rt_private *prv = rt_priv(ops); +- cpumask_t *online; + spinlock_t *lock = vcpu_schedule_lock_irq(vc); + + clear_bit(__RTDS_scheduled, &svc->flags); +@@ -1108,15 +1078,8 @@ rt_context_saved(const struct scheduler *ops, struct vcpu *vc) + if ( test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) && + likely(vcpu_runnable(vc)) ) + { ++ /* only insert the pre-empted vcpu back*/ + __runq_insert(ops, svc); +- __repl_update(ops, NOW()); +- +- ASSERT(!list_empty(&prv->sdom)); +- sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem); +- online = cpupool_scheduler_cpumask(sdom->dom->cpupool); +- snext = __runq_pick(ops, online); /* pick snext from ALL cpus */ +- +- runq_tickle(ops, snext); + } + out: + vcpu_schedule_unlock_irq(lock, vc); +@@ -1167,6 +1130,74 @@ rt_dom_cntl( + return rc; + } + ++static void repl_handler(void *data){ ++ unsigned long flags; ++ s_time_t now = NOW(); ++ s_time_t min_repl = LONG_MAX; /* max time used in comparison*/ ++ struct scheduler *ops = data; ++ struct rt_private *prv = rt_priv(ops); ++ struct list_head *runq = rt_runq(ops); ++ struct list_head *depletedq = rt_depletedq(ops); ++ struct list_head *iter; ++ struct list_head *tmp; ++ struct rt_vcpu *svc = NULL; ++ ++ spin_lock_irqsave(&prv->lock,flags); ++ ++ stop_timer(&repl_timer); ++ ++ list_for_each_safe(iter, tmp, runq) ++ { ++ svc = __q_elem(iter); ++ if ( now < svc->cur_deadline ) ++ break; ++ rt_update_deadline(now, svc); ++ /* scan the runq to find the min release time ++ * this happens when vcpus on runq miss deadline ++ */ ++ if( min_repl> svc->cur_deadline ) ++ { ++ min_repl = svc->cur_deadline; ++ } ++ /* reinsert the vcpu if its deadline is updated */ ++ __q_remove(svc); ++ __runq_insert(ops, svc); ++ } ++ ++ list_for_each_safe(iter, tmp, depletedq) ++ { ++ svc = __q_elem(iter); ++ if ( now >= svc->cur_deadline ) ++ { ++ rt_update_deadline(now, svc); ++ ++ /* scan the delp_q to find the minimal release time */ ++ if(min_repl > svc->cur_deadline) ++ { ++ min_repl = svc->cur_deadline; ++ } ++ __q_remove(svc); ++ __runq_insert(ops, svc); ++ runq_tickle(ops, svc); ++ } ++ } ++ ++ /* if timer was triggered but none of the vcpus ++ * need to be refilled, set the timer to be the ++ * default period + now ++ */ ++ if(min_repl == LONG_MAX) ++ { ++ set_timer(&repl_timer, now + RTDS_DEFAULT_PERIOD); ++ } ++ else ++ { ++ /* reprogram the timer using the most imminent release time*/ ++ set_timer(&repl_timer, min_repl); ++ } ++ spin_unlock_irqrestore(&prv->lock,flags); ++} ++ + static struct rt_private _rt_priv; + + const struct scheduler sched_rtds_def = { +-- +1.7.9.5 + diff --git a/bak b/bak new file mode 100644 index 0000000..fa7b197 --- /dev/null +++ b/bak @@ -0,0 +1,62 @@ +From 25ca27ea281885eb9873244a11f08e6987efb36e Mon Sep 17 00:00:00 2001 +From: Tianyang Chen <tiche@seas.upenn.edu> +Date: Thu, 31 Dec 2015 03:42:33 -0500 +Subject: [PATCH 0/1] Improved RTDS scheduler + +Current RTDS scheduler is time driven and is called every 1ms. During each scheduler call, the repl_update() scans both runq and depeletedq, which might not be necessary every 1ms. + +Since each vcpu is implemented as a deferable server, budget is preserved during its period and refilled in the next. It is not necessary to check every 1ms as the current design does. The replenishment is needed at the nearest next period(nearest current_deadline) of all runnable vcpus. + +This improved design tries to reduce scheduler invocation by using an event driven approach;rt_schedule() will return a value when the scheduler needs to be called next time. In addition, the sched_rt will have one dedicated timer to handle replenishment when necessary. In other words, the budget replenishment and scheduler decision(rt_schedule) are separated. + +Based on previous decision between Dario, Dagaen and Meng, the improved design can be implemented/modified as follows: + +rt_schedule(): picks the highest runnable vcpu based on cpu affinity and ret.time will be passed to schedule(). + +rt_vcpu_wake(): when a vcpu is awake, it tickles instead of picking one from runq. + +rt_context_saved(): when context switching is finished, the preempted vcpu will be put back into the runq. Picking from runq and tickling are removed. + +repl_handler(): a timer handler which is reprogrammed to fire at the nearest vcpu deadline to replenish vcpus on depeletedq while keeping the runq sorted. When the replenishment is done, each replenished vcpu in the runq should tickle a pcpu to see if it needs to preempt any running vcpus. + + +An extra field to record the last replenishing time will be added. + +schedule.c SCHEDULE_SOFTIRQ: + rt_schedule(): + [spin_lock] + burn_budget(scurr) + snext = runq_pick() + [spin_unlock] + + +sched_rt.c TIMER_SOFTIRQ + replenishment_timer_handler() + [spin_lock] + <for_each_depleted_vcpu(i, i.repl_time < NOW()) { + replenish(i) + runq_tickle(i) + }> + program_timer() + [spin_lock] + +The transient behavior should be noted. It happens between a vcpu tickles and a pcpu actually picks it. As previous discussions, this is unavoidable. + +Previous discussions: +http://lists.xenproject.org/archives/html/xen-devel/2015-06/msg02629.html + +Signed-off-by: Tianyang Chen <tiche@seas.upenn.edu> +Signed-off-by: Meng Xu <mengxu@cis.upenn.edu> +Signed-off-by: Dagaen Golomb <dgolomb@seas.upenn.edu> + + + +Tianyang Chen (1): + Improved RTDS scheduler + + xen/common/sched_rt.c | 159 +++++++++++++++++++++++++++++-------------------- + 1 file changed, 95 insertions(+), 64 deletions(-) + +-- +1.7.9.5 + diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c index 4372486..d522272 100644 --- a/xen/common/sched_rt.c +++ b/xen/common/sched_rt.c @@ -16,6 +16,7 @@ #include <xen/delay.h> #include <xen/event.h> #include <xen/time.h> +#include <xen/timer.h> #include <xen/perfc.h> #include <xen/sched-if.h> #include <xen/softirq.h> @@ -147,6 +148,16 @@ static unsigned int nr_rt_ops; * Global lock is referenced by schedule_data.schedule_lock from all * physical cpus. It can be grabbed via vcpu_schedule_lock_irq() */ + +/* dedicated timer for replenishment */ +static struct timer repl_timer; + +/* controls when to first start the timer*/ +static int timer_started; + +/* handler for the replenishment timer */ +static void repl_handler(void *data); + struct rt_private { spinlock_t lock; /* the global coarse grand lock */ struct list_head sdom; /* list of availalbe domains, used for dump */ @@ -426,6 +437,7 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc) static int rt_init(struct scheduler *ops) { + const int cpu = smp_processor_id(); struct rt_private *prv = xzalloc(struct rt_private); printk("Initializing RTDS scheduler\n" @@ -454,6 +466,8 @@ rt_init(struct scheduler *ops) ops->sched_data = prv; + init_timer(&repl_timer, repl_handler, ops, 0); + return 0; no_mem: @@ -473,6 +487,9 @@ rt_deinit(const struct scheduler *ops) xfree(_cpumask_scratch); _cpumask_scratch = NULL; } + + kill_timer(&repl_timer); + xfree(prv); } @@ -635,6 +652,13 @@ rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc) /* add rt_vcpu svc to scheduler-specific vcpu list of the dom */ list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu); + + if(!timer_started) + { + /* the first vcpu starts the timer for the first time*/ + timer_started = 1; + set_timer(&repl_timer,svc->cur_deadline); + } } /* @@ -792,44 +816,6 @@ __runq_pick(const struct scheduler *ops, const cpumask_t *mask) } /* - * Update vcpu's budget and - * sort runq by insert the modifed vcpu back to runq - * lock is grabbed before calling this function - */ -static void -__repl_update(const struct scheduler *ops, s_time_t now) -{ - struct list_head *runq = rt_runq(ops); - struct list_head *depletedq = rt_depletedq(ops); - struct list_head *iter; - struct list_head *tmp; - struct rt_vcpu *svc = NULL; - - list_for_each_safe(iter, tmp, runq) - { - svc = __q_elem(iter); - if ( now < svc->cur_deadline ) - break; - - rt_update_deadline(now, svc); - /* reinsert the vcpu if its deadline is updated */ - __q_remove(svc); - __runq_insert(ops, svc); - } - - list_for_each_safe(iter, tmp, depletedq) - { - svc = __q_elem(iter); - if ( now >= svc->cur_deadline ) - { - rt_update_deadline(now, svc); - __q_remove(svc); /* remove from depleted queue */ - __runq_insert(ops, svc); /* add to runq */ - } - } -} - -/* * schedule function for rt scheduler. * The lock is already grabbed in schedule.c, no need to lock here */ @@ -848,7 +834,6 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched /* burn_budget would return for IDLE VCPU */ burn_budget(ops, scurr, now); - __repl_update(ops, now); if ( tasklet_work_scheduled ) { @@ -889,7 +874,7 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched } } - ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */ + ret.time = snext->budget; /* invoke the scheduler next time */ ret.task = snext->vcpu; /* TRACE */ @@ -1033,10 +1018,6 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc) { struct rt_vcpu * const svc = rt_vcpu(vc); s_time_t now = NOW(); - struct rt_private *prv = rt_priv(ops); - struct rt_vcpu *snext = NULL; /* highest priority on RunQ */ - struct rt_dom *sdom = NULL; - cpumask_t *online; BUG_ON( is_idle_vcpu(vc) ); @@ -1074,14 +1055,7 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc) /* insert svc to runq/depletedq because svc is not in queue now */ __runq_insert(ops, svc); - __repl_update(ops, now); - - ASSERT(!list_empty(&prv->sdom)); - sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem); - online = cpupool_scheduler_cpumask(sdom->dom->cpupool); - snext = __runq_pick(ops, online); /* pick snext from ALL valid cpus */ - - runq_tickle(ops, snext); + runq_tickle(ops, svc); return; } @@ -1094,10 +1068,6 @@ static void rt_context_saved(const struct scheduler *ops, struct vcpu *vc) { struct rt_vcpu *svc = rt_vcpu(vc); - struct rt_vcpu *snext = NULL; - struct rt_dom *sdom = NULL; - struct rt_private *prv = rt_priv(ops); - cpumask_t *online; spinlock_t *lock = vcpu_schedule_lock_irq(vc); clear_bit(__RTDS_scheduled, &svc->flags); @@ -1108,15 +1078,8 @@ rt_context_saved(const struct scheduler *ops, struct vcpu *vc) if ( test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) && likely(vcpu_runnable(vc)) ) { + /* only insert the pre-empted vcpu back*/ __runq_insert(ops, svc); - __repl_update(ops, NOW()); - - ASSERT(!list_empty(&prv->sdom)); - sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem); - online = cpupool_scheduler_cpumask(sdom->dom->cpupool); - snext = __runq_pick(ops, online); /* pick snext from ALL cpus */ - - runq_tickle(ops, snext); } out: vcpu_schedule_unlock_irq(lock, vc); @@ -1167,6 +1130,74 @@ rt_dom_cntl( return rc; } +static void repl_handler(void *data){ + unsigned long flags; + s_time_t now = NOW(); + s_time_t min_repl = LONG_MAX; /* max time used in comparison*/ + struct scheduler *ops = data; + struct rt_private *prv = rt_priv(ops); + struct list_head *runq = rt_runq(ops); + struct list_head *depletedq = rt_depletedq(ops); + struct list_head *iter; + struct list_head *tmp; + struct rt_vcpu *svc = NULL; + + spin_lock_irqsave(&prv->lock,flags); + + stop_timer(&repl_timer); + + list_for_each_safe(iter, tmp, runq) + { + svc = __q_elem(iter); + if ( now < svc->cur_deadline ) + break; + rt_update_deadline(now, svc); + /* scan the runq to find the min release time + * this happens when vcpus on runq miss deadline + */ + if( min_repl> svc->cur_deadline ) + { + min_repl = svc->cur_deadline; + } + /* reinsert the vcpu if its deadline is updated */ + __q_remove(svc); + __runq_insert(ops, svc); + } + + list_for_each_safe(iter, tmp, depletedq) + { + svc = __q_elem(iter); + if ( now >= svc->cur_deadline ) + { + rt_update_deadline(now, svc); + + /* scan the delp_q to find the minimal release time */ + if(min_repl > svc->cur_deadline) + { + min_repl = svc->cur_deadline; + } + __q_remove(svc); + __runq_insert(ops, svc); + runq_tickle(ops, svc); + } + } + + /* if timer was triggered but none of the vcpus + * need to be refilled, set the timer to be the + * default period + now + */ + if(min_repl == LONG_MAX) + { + set_timer(&repl_timer, now + RTDS_DEFAULT_PERIOD); + } + else + { + /* reprogram the timer using the most imminent release time*/ + set_timer(&repl_timer, min_repl); + } + spin_unlock_irqrestore(&prv->lock,flags); +} + static struct rt_private _rt_priv; const struct scheduler sched_rtds_def = {

[1/1] Improved RTDS scheduler

Commit Message

Comments

Patch