diff mbox

[V2,1/1] Improved RTDS scheduler

Message ID 1451557219-8642-2-git-send-email-tiche@seas.upenn.edu (mailing list archive)
State New, archived
Headers show

Commit Message

Tianyang Chen Dec. 31, 2015, 10:20 a.m. UTC
Budget replenishment is now handled by a dedicated timer which is
triggered at the most imminent release time of all runnable vcpus.

Changes since V1:
    None

Signed-off-by: Tianyang Chen <tiche@seas.upenn.edu>
Signed-off-by: Meng Xu <mengxu@cis.upenn.edu>
Signed-off-by: Dagaen Golomb <dgolomb@seas.upenn.edu>
---
 xen/common/sched_rt.c |  159 +++++++++++++++++++++++++++++--------------------
 1 file changed, 95 insertions(+), 64 deletions(-)

Comments

Meng Xu Dec. 31, 2015, 1:44 p.m. UTC | #1
On Thu, Dec 31, 2015 at 6:20 PM, Tianyang Chen <tiche@seas.upenn.edu> wrote:
>
> Budget replenishment is now handled by a dedicated timer which is
> triggered at the most imminent release time of all runnable vcpus.
>
> Changes since V1:
>     None
>
> Signed-off-by: Tianyang Chen <tiche@seas.upenn.edu>
> Signed-off-by: Meng Xu <mengxu@cis.upenn.edu>
> Signed-off-by: Dagaen Golomb <dgolomb@seas.upenn.edu>
> ---
>  xen/common/sched_rt.c |  159 +++++++++++++++++++++++++++++--------------------
>  1 file changed, 95 insertions(+), 64 deletions(-)
>
> diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
> index 4372486..d522272 100644
> --- a/xen/common/sched_rt.c
> +++ b/xen/common/sched_rt.c
> @@ -16,6 +16,7 @@
>  #include <xen/delay.h>
>  #include <xen/event.h>
>  #include <xen/time.h>
> +#include <xen/timer.h>
>  #include <xen/perfc.h>
>  #include <xen/sched-if.h>
>  #include <xen/softirq.h>
> @@ -147,6 +148,16 @@ static unsigned int nr_rt_ops;
>   * Global lock is referenced by schedule_data.schedule_lock from all
>   * physical cpus. It can be grabbed via vcpu_schedule_lock_irq()
>   */
> +
> +/* dedicated timer for replenishment */
> +static struct timer repl_timer;
> +
> +/* controls when to first start the timer*/


missing a space between * and / at the end of this line.


>
> +static int timer_started;
> +
> +/* handler for the replenishment timer */
> +static void repl_handler(void *data);
> +
>  struct rt_private {
>      spinlock_t lock;            /* the global coarse grand lock */
>      struct list_head sdom;      /* list of availalbe domains, used for dump */
> @@ -426,6 +437,7 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
>  static int
>  rt_init(struct scheduler *ops)
>  {
> +    const int cpu = smp_processor_id();


Did the "cpu" be used? If not, please remove it here.


>
>      struct rt_private *prv = xzalloc(struct rt_private);
>
>      printk("Initializing RTDS scheduler\n"
> @@ -454,6 +466,8 @@ rt_init(struct scheduler *ops)
>
>      ops->sched_data = prv;
>
> +    init_timer(&repl_timer, repl_handler, ops, 0);
> +
>      return 0;
>
>   no_mem:
> @@ -473,6 +487,9 @@ rt_deinit(const struct scheduler *ops)
>          xfree(_cpumask_scratch);
>          _cpumask_scratch = NULL;
>      }
> +
> +    kill_timer(&repl_timer);
> +
>      xfree(prv);
>  }
>
> @@ -635,6 +652,13 @@ rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
>
>      /* add rt_vcpu svc to scheduler-specific vcpu list of the dom */
>      list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu);
> +
> +    if(!timer_started)


space should be added..


>
> +    {
> +        /* the first vcpu starts the timer for the first time*/
> +        timer_started = 1;
> +        set_timer(&repl_timer,svc->cur_deadline);
> +    }
>  }
>
>  /*
> @@ -792,44 +816,6 @@ __runq_pick(const struct scheduler *ops, const cpumask_t *mask)
>  }
>
>  /*
> - * Update vcpu's budget and
> - * sort runq by insert the modifed vcpu back to runq
> - * lock is grabbed before calling this function
> - */
> -static void
> -__repl_update(const struct scheduler *ops, s_time_t now)
> -{
> -    struct list_head *runq = rt_runq(ops);
> -    struct list_head *depletedq = rt_depletedq(ops);
> -    struct list_head *iter;
> -    struct list_head *tmp;
> -    struct rt_vcpu *svc = NULL;
> -
> -    list_for_each_safe(iter, tmp, runq)
> -    {
> -        svc = __q_elem(iter);
> -        if ( now < svc->cur_deadline )
> -            break;
> -
> -        rt_update_deadline(now, svc);
> -        /* reinsert the vcpu if its deadline is updated */
> -        __q_remove(svc);
> -        __runq_insert(ops, svc);
> -    }
> -
> -    list_for_each_safe(iter, tmp, depletedq)
> -    {
> -        svc = __q_elem(iter);
> -        if ( now >= svc->cur_deadline )
> -        {
> -            rt_update_deadline(now, svc);
> -            __q_remove(svc); /* remove from depleted queue */
> -            __runq_insert(ops, svc); /* add to runq */
> -        }
> -    }
> -}
> -
> -/*
>   * schedule function for rt scheduler.
>   * The lock is already grabbed in schedule.c, no need to lock here
>   */
> @@ -848,7 +834,6 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
>      /* burn_budget would return for IDLE VCPU */
>      burn_budget(ops, scurr, now);
>
> -    __repl_update(ops, now);
>
>      if ( tasklet_work_scheduled )
>      {
> @@ -889,7 +874,7 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
>          }
>      }
>
> -    ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */
> +    ret.time = snext->budget; /* invoke the scheduler next time */
>      ret.task = snext->vcpu;
>
>      /* TRACE */
> @@ -1033,10 +1018,6 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
>  {
>      struct rt_vcpu * const svc = rt_vcpu(vc);
>      s_time_t now = NOW();
> -    struct rt_private *prv = rt_priv(ops);
> -    struct rt_vcpu *snext = NULL; /* highest priority on RunQ */
> -    struct rt_dom *sdom = NULL;
> -    cpumask_t *online;
>
>      BUG_ON( is_idle_vcpu(vc) );
>
> @@ -1074,14 +1055,7 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
>      /* insert svc to runq/depletedq because svc is not in queue now */
>      __runq_insert(ops, svc);
>
> -    __repl_update(ops, now);
> -
> -    ASSERT(!list_empty(&prv->sdom));
> -    sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
> -    online = cpupool_scheduler_cpumask(sdom->dom->cpupool);
> -    snext = __runq_pick(ops, online); /* pick snext from ALL valid cpus */
> -
> -    runq_tickle(ops, snext);
> +    runq_tickle(ops, svc);
>
>      return;
>  }
> @@ -1094,10 +1068,6 @@ static void
>  rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
>  {
>      struct rt_vcpu *svc = rt_vcpu(vc);
> -    struct rt_vcpu *snext = NULL;
> -    struct rt_dom *sdom = NULL;
> -    struct rt_private *prv = rt_priv(ops);
> -    cpumask_t *online;
>      spinlock_t *lock = vcpu_schedule_lock_irq(vc);
>
>      clear_bit(__RTDS_scheduled, &svc->flags);
> @@ -1108,15 +1078,8 @@ rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
>      if ( test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) &&
>           likely(vcpu_runnable(vc)) )
>      {
> +        /* only insert the pre-empted vcpu back*/
>          __runq_insert(ops, svc);
> -        __repl_update(ops, NOW());
> -
> -        ASSERT(!list_empty(&prv->sdom));
> -        sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
> -        online = cpupool_scheduler_cpumask(sdom->dom->cpupool);
> -        snext = __runq_pick(ops, online); /* pick snext from ALL cpus */
> -
> -        runq_tickle(ops, snext);
>      }
>  out:
>      vcpu_schedule_unlock_irq(lock, vc);
> @@ -1167,6 +1130,74 @@ rt_dom_cntl(
>      return rc;
>  }
>
> +static void repl_handler(void *data){
> +    unsigned long flags;
> +    s_time_t now = NOW();
> +    s_time_t min_repl = LONG_MAX; /* max time used in comparison*/
> +    struct scheduler *ops = data;
> +    struct rt_private *prv = rt_priv(ops);
> +    struct list_head *runq = rt_runq(ops);
> +    struct list_head *depletedq = rt_depletedq(ops);
> +    struct list_head *iter;
> +    struct list_head *tmp;
> +    struct rt_vcpu *svc = NULL;
> +
> +    spin_lock_irqsave(&prv->lock,flags);
> +
> +    stop_timer(&repl_timer);


Move the stop_timer before spin_lock_irqsave() since no more than one
core will run this handler in the current design.

>
>
>
> +
> +    list_for_each_safe(iter, tmp, runq)
> +    {
> +        svc = __q_elem(iter);
> +        if ( now < svc->cur_deadline )
> +            break;
> +        rt_update_deadline(now, svc);
> +        /* scan the runq to find the min release time
> +         * this happens when vcpus on runq miss deadline
> +         */
> +        if( min_repl> svc->cur_deadline )
> +        {
> +            min_repl = svc->cur_deadline;
> +        }

The logic here is incorrect. The comment you put is correct, but the
code didn't reflect the comment.

When the timer's handler is called, now should be equal to min_repl,
shouldn't it?
If now = min_repl, this if statement will never be called because the
loop stops if ( now < svc->cur_deadline ).

Only after you have replenished the vcpus on runq and depletedq,
should you pick the earliest deadline of the front vcpus at the runq
and depletedq as the min_repl.

Please correct me  if I missed anything. :-)


>
> +        /* reinsert the vcpu if its deadline is updated */
> +        __q_remove(svc);
> +        __runq_insert(ops, svc);
> +    }
> +
> +    list_for_each_safe(iter, tmp, depletedq)
> +    {
> +        svc = __q_elem(iter);
> +        if ( now >= svc->cur_deadline )
> +        {
> +            rt_update_deadline(now, svc);
> +
> +            /* scan the delp_q to find the minimal release time */
> +            if(min_repl > svc->cur_deadline)
> +            {
> +                min_repl = svc->cur_deadline;
> +            }


Please see the above comment. It may have the same issue here.

In addition, the current running VCPUs on cores also need to replenish
their budgets at the beginning of their next periods. Those running
VCPUs are not in runq or depletedq in the current RTDS scheduler. So
this patch won't replenish the current running VCPUs' budget until
they are put back to runq or depletedq, which is incorrect. Think
about the following  scenario: a VCPU with very small period  has a
task always running on it to keep this vcpu always runnable; This VCPU
has its period equal to its budget. Suppose this VCPU is the only VCPU
on a 4 core machine. This VCPU should keep running on one core and
never be put back to runq. In the current code, this VCPU won't have
its budget replenished. Right? :-)

You can choose one of the following three approaches to fix it:
a) add the running VCPUs into the runq
b) use another queue to keep track of the current running VCPUs
c) just scan each core (in the cpupool) to get the current running VCPU on it.
I personally prefer c) but I'm looking forward to others' opinion. :-)

> +            __q_remove(svc);
> +            __runq_insert(ops, svc);
> +            runq_tickle(ops, svc);
> +        }
> +    }
> +
> +    /* if timer was triggered but none of the vcpus
> +     * need to be refilled, set the timer to be the
> +     * default period + now
> +     */


Hmm, when could this happen? If it could happen, please specify the situation.
Otherwise, it may be better to print a warning (or cause crash?) when
you set the timer to be default period + now?

I'm not quite sure which is better, warning or crash?


>
> +    if(min_repl == LONG_MAX)
> +    {
>
> +        set_timer(&repl_timer, now + RTDS_DEFAULT_PERIOD);
> +    }


Hmm,

when could this happen? The only situation in my mind is that there is
no vcpus in the cpupool. But if that is the case, shouldn't the timer
never be triggered? Then this handler will never be called.

> +    else
> +    {
> +        /* reprogram the timer using the most imminent release time*/
> +        set_timer(&repl_timer, min_repl);
> +    }
>
> +    spin_unlock_irqrestore(&prv->lock,flags);


Same here, need to move spin_unlock_ before the if statement.

> +}
> +
>  static struct rt_private _rt_priv;
>
>  const struct scheduler sched_rtds_def = {
> --
> 1.7.9.5
>

Thank you very much, Tianyang, for your hard working on this! The
cover letter is much much better and is very clear now.
Let's waiting for others' opinions and keep refining it until it gets
merged and further improve the efficiency of the current RTDS
scheduler. :-P


Best,

Meng

-----------
Meng Xu
PhD Student in Computer and Information Science
University of Pennsylvania
http://www.cis.upenn.edu/~mengxu/
Tianyang Chen Jan. 6, 2016, 7:57 a.m. UTC | #2
On 12/31/2015 8:44 AM, Meng Xu wrote:
> On Thu, Dec 31, 2015 at 6:20 PM, Tianyang Chen <tiche@seas.upenn.edu> wrote:
>>
>> Budget replenishment is now handled by a dedicated timer which is
>> triggered at the most imminent release time of all runnable vcpus.
>>
>> Changes since V1:
>>      None
>>
>> Signed-off-by: Tianyang Chen <tiche@seas.upenn.edu>
>> Signed-off-by: Meng Xu <mengxu@cis.upenn.edu>
>> Signed-off-by: Dagaen Golomb <dgolomb@seas.upenn.edu>
>> ---
>>   xen/common/sched_rt.c |  159 +++++++++++++++++++++++++++++--------------------
>>   1 file changed, 95 insertions(+), 64 deletions(-)
>>
>> diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
>> index 4372486..d522272 100644
>> --- a/xen/common/sched_rt.c
>> +++ b/xen/common/sched_rt.c
>> @@ -16,6 +16,7 @@
>>   #include <xen/delay.h>
>>   #include <xen/event.h>
>>   #include <xen/time.h>
>> +#include <xen/timer.h>
>>   #include <xen/perfc.h>
>>   #include <xen/sched-if.h>
>>   #include <xen/softirq.h>
>> @@ -147,6 +148,16 @@ static unsigned int nr_rt_ops;
>>    * Global lock is referenced by schedule_data.schedule_lock from all
>>    * physical cpus. It can be grabbed via vcpu_schedule_lock_irq()
>>    */
>> +
>> +/* dedicated timer for replenishment */
>> +static struct timer repl_timer;
>> +
>> +/* controls when to first start the timer*/
>
>
> missing a space between * and / at the end of this line.
>
>
>>
>> +static int timer_started;
>> +
>> +/* handler for the replenishment timer */
>> +static void repl_handler(void *data);
>> +
>>   struct rt_private {
>>       spinlock_t lock;            /* the global coarse grand lock */
>>       struct list_head sdom;      /* list of availalbe domains, used for dump */
>> @@ -426,6 +437,7 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
>>   static int
>>   rt_init(struct scheduler *ops)
>>   {
>> +    const int cpu = smp_processor_id();
>
>
> Did the "cpu" be used? If not, please remove it here.
>
>
>>
>>       struct rt_private *prv = xzalloc(struct rt_private);
>>
>>       printk("Initializing RTDS scheduler\n"
>> @@ -454,6 +466,8 @@ rt_init(struct scheduler *ops)
>>
>>       ops->sched_data = prv;
>>
>> +    init_timer(&repl_timer, repl_handler, ops, 0);
>> +
>>       return 0;
>>
>>    no_mem:
>> @@ -473,6 +487,9 @@ rt_deinit(const struct scheduler *ops)
>>           xfree(_cpumask_scratch);
>>           _cpumask_scratch = NULL;
>>       }
>> +
>> +    kill_timer(&repl_timer);
>> +
>>       xfree(prv);
>>   }
>>
>> @@ -635,6 +652,13 @@ rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
>>
>>       /* add rt_vcpu svc to scheduler-specific vcpu list of the dom */
>>       list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu);
>> +
>> +    if(!timer_started)
>
>
> space should be added..

Thanks, Meng. I will fix that.

>
>
>>
>> +    {
>> +        /* the first vcpu starts the timer for the first time*/
>> +        timer_started = 1;
>> +        set_timer(&repl_timer,svc->cur_deadline);
>> +    }
>>   }
>>
>>   /*
>> @@ -792,44 +816,6 @@ __runq_pick(const struct scheduler *ops, const cpumask_t *mask)
>>   }
>>
>>   /*
>> - * Update vcpu's budget and
>> - * sort runq by insert the modifed vcpu back to runq
>> - * lock is grabbed before calling this function
>> - */
>> -static void
>> -__repl_update(const struct scheduler *ops, s_time_t now)
>> -{
>> -    struct list_head *runq = rt_runq(ops);
>> -    struct list_head *depletedq = rt_depletedq(ops);
>> -    struct list_head *iter;
>> -    struct list_head *tmp;
>> -    struct rt_vcpu *svc = NULL;
>> -
>> -    list_for_each_safe(iter, tmp, runq)
>> -    {
>> -        svc = __q_elem(iter);
>> -        if ( now < svc->cur_deadline )
>> -            break;
>> -
>> -        rt_update_deadline(now, svc);
>> -        /* reinsert the vcpu if its deadline is updated */
>> -        __q_remove(svc);
>> -        __runq_insert(ops, svc);
>> -    }
>> -
>> -    list_for_each_safe(iter, tmp, depletedq)
>> -    {
>> -        svc = __q_elem(iter);
>> -        if ( now >= svc->cur_deadline )
>> -        {
>> -            rt_update_deadline(now, svc);
>> -            __q_remove(svc); /* remove from depleted queue */
>> -            __runq_insert(ops, svc); /* add to runq */
>> -        }
>> -    }
>> -}
>> -
>> -/*
>>    * schedule function for rt scheduler.
>>    * The lock is already grabbed in schedule.c, no need to lock here
>>    */
>> @@ -848,7 +834,6 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
>>       /* burn_budget would return for IDLE VCPU */
>>       burn_budget(ops, scurr, now);
>>
>> -    __repl_update(ops, now);
>>
>>       if ( tasklet_work_scheduled )
>>       {
>> @@ -889,7 +874,7 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
>>           }
>>       }
>>
>> -    ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */
>> +    ret.time = snext->budget; /* invoke the scheduler next time */
>>       ret.task = snext->vcpu;
>>
>>       /* TRACE */
>> @@ -1033,10 +1018,6 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
>>   {
>>       struct rt_vcpu * const svc = rt_vcpu(vc);
>>       s_time_t now = NOW();
>> -    struct rt_private *prv = rt_priv(ops);
>> -    struct rt_vcpu *snext = NULL; /* highest priority on RunQ */
>> -    struct rt_dom *sdom = NULL;
>> -    cpumask_t *online;
>>
>>       BUG_ON( is_idle_vcpu(vc) );
>>
>> @@ -1074,14 +1055,7 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
>>       /* insert svc to runq/depletedq because svc is not in queue now */
>>       __runq_insert(ops, svc);
>>
>> -    __repl_update(ops, now);
>> -
>> -    ASSERT(!list_empty(&prv->sdom));
>> -    sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
>> -    online = cpupool_scheduler_cpumask(sdom->dom->cpupool);
>> -    snext = __runq_pick(ops, online); /* pick snext from ALL valid cpus */
>> -
>> -    runq_tickle(ops, snext);
>> +    runq_tickle(ops, svc);
>>
>>       return;
>>   }
>> @@ -1094,10 +1068,6 @@ static void
>>   rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
>>   {
>>       struct rt_vcpu *svc = rt_vcpu(vc);
>> -    struct rt_vcpu *snext = NULL;
>> -    struct rt_dom *sdom = NULL;
>> -    struct rt_private *prv = rt_priv(ops);
>> -    cpumask_t *online;
>>       spinlock_t *lock = vcpu_schedule_lock_irq(vc);
>>
>>       clear_bit(__RTDS_scheduled, &svc->flags);
>> @@ -1108,15 +1078,8 @@ rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
>>       if ( test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) &&
>>            likely(vcpu_runnable(vc)) )
>>       {
>> +        /* only insert the pre-empted vcpu back*/
>>           __runq_insert(ops, svc);
>> -        __repl_update(ops, NOW());
>> -
>> -        ASSERT(!list_empty(&prv->sdom));
>> -        sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
>> -        online = cpupool_scheduler_cpumask(sdom->dom->cpupool);
>> -        snext = __runq_pick(ops, online); /* pick snext from ALL cpus */
>> -
>> -        runq_tickle(ops, snext);
>>       }
>>   out:
>>       vcpu_schedule_unlock_irq(lock, vc);
>> @@ -1167,6 +1130,74 @@ rt_dom_cntl(
>>       return rc;
>>   }
>>
>> +static void repl_handler(void *data){
>> +    unsigned long flags;
>> +    s_time_t now = NOW();
>> +    s_time_t min_repl = LONG_MAX; /* max time used in comparison*/
>> +    struct scheduler *ops = data;
>> +    struct rt_private *prv = rt_priv(ops);
>> +    struct list_head *runq = rt_runq(ops);
>> +    struct list_head *depletedq = rt_depletedq(ops);
>> +    struct list_head *iter;
>> +    struct list_head *tmp;
>> +    struct rt_vcpu *svc = NULL;
>> +
>> +    spin_lock_irqsave(&prv->lock,flags);
>> +
>> +    stop_timer(&repl_timer);
>
>
> Move the stop_timer before spin_lock_irqsave() since no more than one
> core will run this handler in the current design.

Right. We want the critical section as short as possible.

>
>>
>>
>>
>> +
>> +    list_for_each_safe(iter, tmp, runq)
>> +    {
>> +        svc = __q_elem(iter);
>> +        if ( now < svc->cur_deadline )
>> +            break;
>> +        rt_update_deadline(now, svc);
>> +        /* scan the runq to find the min release time
>> +         * this happens when vcpus on runq miss deadline
>> +         */
>> +        if( min_repl> svc->cur_deadline )
>> +        {
>> +            min_repl = svc->cur_deadline;
>> +        }
>
> The logic here is incorrect. The comment you put is correct, but the
> code didn't reflect the comment.
>
> When the timer's handler is called, now should be equal to min_repl,
> shouldn't it?
> If now = min_repl, this if statement will never be called because the
> loop stops if ( now < svc->cur_deadline ).
>

Yes the handler will be invoked when it was programmed to but the value 
of min_repl is set to be LONG_MAX initially. It doesn't preserve values. 
Therefore, the IF statement is feasible.

> Only after you have replenished the vcpus on runq and depletedq,
> should you pick the earliest deadline of the front vcpus at the runq
> and depletedq as the min_repl.
>
> Please correct me  if I missed anything. :-)
>
>
>>
>> +        /* reinsert the vcpu if its deadline is updated */
>> +        __q_remove(svc);
>> +        __runq_insert(ops, svc);
>> +    }
>> +
>> +    list_for_each_safe(iter, tmp, depletedq)
>> +    {
>> +        svc = __q_elem(iter);
>> +        if ( now >= svc->cur_deadline )
>> +        {
>> +            rt_update_deadline(now, svc);
>> +
>> +            /* scan the delp_q to find the minimal release time */
>> +            if(min_repl > svc->cur_deadline)
>> +            {
>> +                min_repl = svc->cur_deadline;
>> +            }
>
>
> Please see the above comment. It may have the same issue here.
>
> In addition, the current running VCPUs on cores also need to replenish
> their budgets at the beginning of their next periods. Those running
> VCPUs are not in runq or depletedq in the current RTDS scheduler. So
> this patch won't replenish the current running VCPUs' budget until
> they are put back to runq or depletedq, which is incorrect. Think
> about the following  scenario: a VCPU with very small period  has a
> task always running on it to keep this vcpu always runnable; This VCPU
> has its period equal to its budget. Suppose this VCPU is the only VCPU
> on a 4 core machine. This VCPU should keep running on one core and
> never be put back to runq. In the current code, this VCPU won't have
> its budget replenished. Right? :-)
>
> You can choose one of the following three approaches to fix it:
> a) add the running VCPUs into the runq
> b) use another queue to keep track of the current running VCPUs
> c) just scan each core (in the cpupool) to get the current running VCPU on it.
> I personally prefer c) but I'm looking forward to others' opinion. :-)
>
>> +            __q_remove(svc);
>> +            __runq_insert(ops, svc);
>> +            runq_tickle(ops, svc);
>> +        }
>> +    }
>> +
>> +    /* if timer was triggered but none of the vcpus
>> +     * need to be refilled, set the timer to be the
>> +     * default period + now
>> +     */
>
>
> Hmm, when could this happen? If it could happen, please specify the situation.
> Otherwise, it may be better to print a warning (or cause crash?) when
> you set the timer to be default period + now?
>
> I'm not quite sure which is better, warning or crash?
>
>
>>
>> +    if(min_repl == LONG_MAX)
>> +    {
>>
>> +        set_timer(&repl_timer, now + RTDS_DEFAULT_PERIOD);
>> +    }
>
>
> Hmm,
>
> when could this happen? The only situation in my mind is that there is
> no vcpus in the cpupool. But if that is the case, shouldn't the timer
> never be triggered? Then this handler will never be called.

This is actually still a question for me. When I was testing the code in 
a virtual machine, it happens when the system boots up. I am guessing 
it's related to the same problem in nested-virtualization (just 
guessing). When I tested it on a real machine, it locks up and won't 
even respond to any key-board events. After some debugging, it appeared 
that the timer was only triggered once and in the timer handler, none of 
the vcpus was replenished. After the the system boots up, I have not 
seen any debug prints indicating that the timer is not programmed based 
on the release time. Besides looking for why, I think we should set some 
maximum interval for the timer to do the replenishment.


>
>> +    else
>> +    {
>> +        /* reprogram the timer using the most imminent release time*/
>> +        set_timer(&repl_timer, min_repl);
>> +    }
>>
>> +    spin_unlock_irqrestore(&prv->lock,flags);
>
>
> Same here, need to move spin_unlock_ before the if statement.
>
>> +}
>> +
>>   static struct rt_private _rt_priv;
>>
>>   const struct scheduler sched_rtds_def = {
>> --
>> 1.7.9.5
>>
>
> Thank you very much, Tianyang, for your hard working on this! The
> cover letter is much much better and is very clear now.
> Let's waiting for others' opinions and keep refining it until it gets
> merged and further improve the efficiency of the current RTDS
> scheduler. :-P

Thanks Meng, I will try my best.

Tianyang Chen

>
>
> Best,
>
> Meng
>
> -----------
> Meng Xu
> PhD Student in Computer and Information Science
> University of Pennsylvania
> http://www.cis.upenn.edu/~mengxu/
>
Meng Xu Jan. 15, 2016, 3:33 p.m. UTC | #3
On Wed, Jan 6, 2016 at 2:57 AM, Tianyang Chen <tiche@seas.upenn.edu> wrote:
>
>
>
> On 12/31/2015 8:44 AM, Meng Xu wrote:
>>
>> On Thu, Dec 31, 2015 at 6:20 PM, Tianyang Chen <tiche@seas.upenn.edu> wrote:
>>>
>>>
>>> Budget replenishment is now handled by a dedicated timer which is
>>> triggered at the most imminent release time of all runnable vcpus.
>>>
>>> Changes since V1:
>>>      None
>>>
>>> Signed-off-by: Tianyang Chen <tiche@seas.upenn.edu>
>>> Signed-off-by: Meng Xu <mengxu@cis.upenn.edu>
>>> Signed-off-by: Dagaen Golomb <dgolomb@seas.upenn.edu>
>>> ---
>>>   xen/common/sched_rt.c |  159 +++++++++++++++++++++++++++++--------------------
>>>   1 file changed, 95 insertions(+), 64 deletions(-)
>>>
>>> diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
>>> index 4372486..d522272 100644
>>> --- a/xen/common/sched_rt.c
>>> +++ b/xen/common/sched_rt.c
>>> @@ -16,6 +16,7 @@
>>>   #include <xen/delay.h>
>>>   #include <xen/event.h>
>>>   #include <xen/time.h>
>>> +#include <xen/timer.h>
>>>   #include <xen/perfc.h>
>>>   #include <xen/sched-if.h>
>>>   #include <xen/softirq.h>
>>> @@ -147,6 +148,16 @@ static unsigned int nr_rt_ops;
>>>    * Global lock is referenced by schedule_data.schedule_lock from all
>>>    * physical cpus. It can be grabbed via vcpu_schedule_lock_irq()
>>>    */
>>> +
>>> +/* dedicated timer for replenishment */
>>> +static struct timer repl_timer;
>>> +
>>> +/* controls when to first start the timer*/
>>
>>
>>
>> missing a space between * and / at the end of this line.
>>
>>
>>>
>>> +static int timer_started;
>>> +
>>> +/* handler for the replenishment timer */
>>> +static void repl_handler(void *data);
>>> +
>>>   struct rt_private {
>>>       spinlock_t lock;            /* the global coarse grand lock */
>>>       struct list_head sdom;      /* list of availalbe domains, used for dump */
>>> @@ -426,6 +437,7 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
>>>   static int
>>>   rt_init(struct scheduler *ops)
>>>   {
>>> +    const int cpu = smp_processor_id();
>>
>>
>>
>> Did the "cpu" be used? If not, please remove it here.
>>
>>
>>>
>>>       struct rt_private *prv = xzalloc(struct rt_private);
>>>
>>>       printk("Initializing RTDS scheduler\n"
>>> @@ -454,6 +466,8 @@ rt_init(struct scheduler *ops)
>>>
>>>       ops->sched_data = prv;
>>>
>>> +    init_timer(&repl_timer, repl_handler, ops, 0);
>>> +
>>>       return 0;
>>>
>>>    no_mem:
>>> @@ -473,6 +487,9 @@ rt_deinit(const struct scheduler *ops)
>>>           xfree(_cpumask_scratch);
>>>           _cpumask_scratch = NULL;
>>>       }
>>> +
>>> +    kill_timer(&repl_timer);
>>> +
>>>       xfree(prv);
>>>   }
>>>
>>> @@ -635,6 +652,13 @@ rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
>>>
>>>       /* add rt_vcpu svc to scheduler-specific vcpu list of the dom */
>>>       list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu);
>>> +
>>> +    if(!timer_started)
>>
>>
>>
>> space should be added..
>
>
> Thanks, Meng. I will fix that.
>
>
>>
>>
>>>
>>> +    {
>>> +        /* the first vcpu starts the timer for the first time*/
>>> +        timer_started = 1;
>>> +        set_timer(&repl_timer,svc->cur_deadline);
>>> +    }
>>>   }
>>>
>>>   /*
>>> @@ -792,44 +816,6 @@ __runq_pick(const struct scheduler *ops, const cpumask_t *mask)
>>>   }
>>>
>>>   /*
>>> - * Update vcpu's budget and
>>> - * sort runq by insert the modifed vcpu back to runq
>>> - * lock is grabbed before calling this function
>>> - */
>>> -static void
>>> -__repl_update(const struct scheduler *ops, s_time_t now)
>>> -{
>>> -    struct list_head *runq = rt_runq(ops);
>>> -    struct list_head *depletedq = rt_depletedq(ops);
>>> -    struct list_head *iter;
>>> -    struct list_head *tmp;
>>> -    struct rt_vcpu *svc = NULL;
>>> -
>>> -    list_for_each_safe(iter, tmp, runq)
>>> -    {
>>> -        svc = __q_elem(iter);
>>> -        if ( now < svc->cur_deadline )
>>> -            break;
>>> -
>>> -        rt_update_deadline(now, svc);
>>> -        /* reinsert the vcpu if its deadline is updated */
>>> -        __q_remove(svc);
>>> -        __runq_insert(ops, svc);
>>> -    }
>>> -
>>> -    list_for_each_safe(iter, tmp, depletedq)
>>> -    {
>>> -        svc = __q_elem(iter);
>>> -        if ( now >= svc->cur_deadline )
>>> -        {
>>> -            rt_update_deadline(now, svc);
>>> -            __q_remove(svc); /* remove from depleted queue */
>>> -            __runq_insert(ops, svc); /* add to runq */
>>> -        }
>>> -    }
>>> -}
>>> -
>>> -/*
>>>    * schedule function for rt scheduler.
>>>    * The lock is already grabbed in schedule.c, no need to lock here
>>>    */
>>> @@ -848,7 +834,6 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
>>>       /* burn_budget would return for IDLE VCPU */
>>>       burn_budget(ops, scurr, now);
>>>
>>> -    __repl_update(ops, now);
>>>
>>>       if ( tasklet_work_scheduled )
>>>       {
>>> @@ -889,7 +874,7 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
>>>           }
>>>       }
>>>
>>> -    ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */
>>> +    ret.time = snext->budget; /* invoke the scheduler next time */
>>>       ret.task = snext->vcpu;
>>>
>>>       /* TRACE */
>>> @@ -1033,10 +1018,6 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
>>>   {
>>>       struct rt_vcpu * const svc = rt_vcpu(vc);
>>>       s_time_t now = NOW();
>>> -    struct rt_private *prv = rt_priv(ops);
>>> -    struct rt_vcpu *snext = NULL; /* highest priority on RunQ */
>>> -    struct rt_dom *sdom = NULL;
>>> -    cpumask_t *online;
>>>
>>>       BUG_ON( is_idle_vcpu(vc) );
>>>
>>> @@ -1074,14 +1055,7 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
>>>       /* insert svc to runq/depletedq because svc is not in queue now */
>>>       __runq_insert(ops, svc);
>>>
>>> -    __repl_update(ops, now);
>>> -
>>> -    ASSERT(!list_empty(&prv->sdom));
>>> -    sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
>>> -    online = cpupool_scheduler_cpumask(sdom->dom->cpupool);
>>> -    snext = __runq_pick(ops, online); /* pick snext from ALL valid cpus */
>>> -
>>> -    runq_tickle(ops, snext);
>>> +    runq_tickle(ops, svc);
>>>
>>>       return;
>>>   }
>>> @@ -1094,10 +1068,6 @@ static void
>>>   rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
>>>   {
>>>       struct rt_vcpu *svc = rt_vcpu(vc);
>>> -    struct rt_vcpu *snext = NULL;
>>> -    struct rt_dom *sdom = NULL;
>>> -    struct rt_private *prv = rt_priv(ops);
>>> -    cpumask_t *online;
>>>       spinlock_t *lock = vcpu_schedule_lock_irq(vc);
>>>
>>>       clear_bit(__RTDS_scheduled, &svc->flags);
>>> @@ -1108,15 +1078,8 @@ rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
>>>       if ( test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) &&
>>>            likely(vcpu_runnable(vc)) )
>>>       {
>>> +        /* only insert the pre-empted vcpu back*/
>>>           __runq_insert(ops, svc);
>>> -        __repl_update(ops, NOW());
>>> -
>>> -        ASSERT(!list_empty(&prv->sdom));
>>> -        sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
>>> -        online = cpupool_scheduler_cpumask(sdom->dom->cpupool);
>>> -        snext = __runq_pick(ops, online); /* pick snext from ALL cpus */
>>> -
>>> -        runq_tickle(ops, snext);
>>>       }
>>>   out:
>>>       vcpu_schedule_unlock_irq(lock, vc);
>>> @@ -1167,6 +1130,74 @@ rt_dom_cntl(
>>>       return rc;
>>>   }
>>>
>>> +static void repl_handler(void *data){
>>> +    unsigned long flags;
>>> +    s_time_t now = NOW();
>>> +    s_time_t min_repl = LONG_MAX; /* max time used in comparison*/
>>> +    struct scheduler *ops = data;
>>> +    struct rt_private *prv = rt_priv(ops);
>>> +    struct list_head *runq = rt_runq(ops);
>>> +    struct list_head *depletedq = rt_depletedq(ops);
>>> +    struct list_head *iter;
>>> +    struct list_head *tmp;
>>> +    struct rt_vcpu *svc = NULL;
>>> +
>>> +    spin_lock_irqsave(&prv->lock,flags);
>>> +
>>> +    stop_timer(&repl_timer);
>>
>>
>>
>> Move the stop_timer before spin_lock_irqsave() since no more than one
>> core will run this handler in the current design.
>
>
> Right. We want the critical section as short as possible.
>
>>
>>>
>>>
>>>
>>> +
>>> +    list_for_each_safe(iter, tmp, runq)
>>> +    {
>>> +        svc = __q_elem(iter);
>>> +        if ( now < svc->cur_deadline )
>>> +            break;
>>> +        rt_update_deadline(now, svc);
>>> +        /* scan the runq to find the min release time
>>> +         * this happens when vcpus on runq miss deadline
>>> +         */
>>> +        if( min_repl> svc->cur_deadline )
>>> +        {
>>> +            min_repl = svc->cur_deadline;
>>> +        }
>>
>>
>> The logic here is incorrect. The comment you put is correct, but the
>> code didn't reflect the comment.
>>
>> When the timer's handler is called, now should be equal to min_repl,
>> shouldn't it?
>> If now = min_repl, this if statement will never be called because the
>> loop stops if ( now < svc->cur_deadline ).
>>
>
> Yes the handler will be invoked when it was programmed to but the value of min_repl is set to be LONG_MAX initially. It doesn't preserve values. Therefore, the IF statement is feasible.


Hmm, let's have a look  at the logic...

the min_repl. will only be possibly updated to svc->cur_deadline when
now >= svc->cur_deadline, right?

In other words, min_repl will  only be possibly updated to a time
point before now. Shouldn't the min_repl be the earliest time *in the
future* that the handler should be triggered? If not, what is the
definition/meaning of min_repl here?

At the end of the function, you did reprogram the timer to min_repl.
       /* reprogram the timer using the most imminent release time*/
       set_timer(&repl_timer, min_repl);

So I think the logic is incorrect here...

You should first update the budget and deadline of each VCPUs in runq,
depletedq and running VCPUs, and (then) find the earliest time in the
future when the handler should be triggered.

>
>
>
>> Only after you have replenished the vcpus on runq and depletedq,
>> should you pick the earliest deadline of the front vcpus at the runq
>> and depletedq as the min_repl.
>>
>> Please correct me  if I missed anything. :-)
>>
>>
>>>
>>> +        /* reinsert the vcpu if its deadline is updated */
>>> +        __q_remove(svc);
>>> +        __runq_insert(ops, svc);
>>> +    }
>>> +
>>> +    list_for_each_safe(iter, tmp, depletedq)
>>> +    {
>>> +        svc = __q_elem(iter);
>>> +        if ( now >= svc->cur_deadline )
>>> +        {
>>> +            rt_update_deadline(now, svc);
>>> +
>>> +            /* scan the delp_q to find the minimal release time */
>>> +            if(min_repl > svc->cur_deadline)
>>> +            {
>>> +                min_repl = svc->cur_deadline;
>>> +            }
>>
>>
>>
>> Please see the above comment. It may have the same issue here.
>>
>> In addition, the current running VCPUs on cores also need to replenish
>> their budgets at the beginning of their next periods. Those running
>> VCPUs are not in runq or depletedq in the current RTDS scheduler. So
>> this patch won't replenish the current running VCPUs' budget until
>> they are put back to runq or depletedq, which is incorrect. Think
>> about the following  scenario: a VCPU with very small period  has a
>> task always running on it to keep this vcpu always runnable; This VCPU
>> has its period equal to its budget. Suppose this VCPU is the only VCPU
>> on a 4 core machine. This VCPU should keep running on one core and
>> never be put back to runq. In the current code, this VCPU won't have
>> its budget replenished. Right? :-)
>>
>> You can choose one of the following three approaches to fix it:
>> a) add the running VCPUs into the runq
>> b) use another queue to keep track of the current running VCPUs
>> c) just scan each core (in the cpupool) to get the current running VCPU on it.
>> I personally prefer c) but I'm looking forward to others' opinion. :-)
>>
>>> +            __q_remove(svc);
>>> +            __runq_insert(ops, svc);
>>> +            runq_tickle(ops, svc);
>>> +        }
>>> +    }
>>> +
>>> +    /* if timer was triggered but none of the vcpus
>>> +     * need to be refilled, set the timer to be the
>>> +     * default period + now
>>> +     */
>>
>>
>>
>> Hmm, when could this happen? If it could happen, please specify the situation.
>> Otherwise, it may be better to print a warning (or cause crash?) when
>> you set the timer to be default period + now?
>>
>> I'm not quite sure which is better, warning or crash?
>>
>>
>>>
>>> +    if(min_repl == LONG_MAX)
>>> +    {
>>>
>>> +        set_timer(&repl_timer, now + RTDS_DEFAULT_PERIOD);
>>> +    }
>>
>>
>>
>> Hmm,
>>
>> when could this happen? The only situation in my mind is that there is
>> no vcpus in the cpupool. But if that is the case, shouldn't the timer
>> never be triggered? Then this handler will never be called.
>
>
> This is actually still a question for me. When I was testing the code in a virtual machine, it happens when the system boots up. I am guessing it's related to the same problem in nested-virtualization (just guessing). When I tested it on a real machine, it locks up and won't even respond to any key-board events.



First, we care about the real machine scenario. It should always run
well on real machine. The scheduler design didn't consider the
nest-virt. situation, so it is ok for now to have performance
degradation or issues in nest-virt. situation, IMHO.

Second, the reason why it locks up is probably because the timer of
the replenish handler is not correctly set up as I pointed out above.
So try to fix the above issue first and see if this still occurs.


>
> After some debugging, it appeared that the timer was only triggered once and in the timer handler, none of the vcpus was replenished.


Because you set the timer to a past time above.


>
> After the the system boots up, I have not seen any debug prints indicating that the timer is not programmed based on the release time. Besides looking for why, I think we should set some maximum interval for the timer to do the replenishment.


Well, this is not correct. Replenishment should happen at some
specific time, i.e., the start of a new period of any VCPU; It should
not be an arbitrary value. Otherwise, the VCPU budget and deadline
won't be updated correctly...

>
>
>
>
>>
>>> +    else
>>> +    {
>>> +        /* reprogram the timer using the most imminent release time*/
>>> +        set_timer(&repl_timer, min_repl);
>>> +    }
>>>
>>> +    spin_unlock_irqrestore(&prv->lock,flags);
>>
>>
>>
>> Same here, need to move spin_unlock_ before the if statement.
>>
>>> +}
>>> +
>>>   static struct rt_private _rt_priv;
>>>
>>>   const struct scheduler sched_rtds_def = {
>>> --
>>> 1.7.9.5
>>>

Best,

Meng



-----------
Meng Xu
PhD Student in Computer and Information Science
University of Pennsylvania
http://www.cis.upenn.edu/~mengxu/
Tianyang Chen Jan. 19, 2016, 4:35 p.m. UTC | #4
On 1/15/2016 10:33 AM, Meng Xu wrote:
> On Wed, Jan 6, 2016 at 2:57 AM, Tianyang Chen <tiche@seas.upenn.edu> wrote:
>>
>>
>>
>> On 12/31/2015 8:44 AM, Meng Xu wrote:
>>>
>>> On Thu, Dec 31, 2015 at 6:20 PM, Tianyang Chen <tiche@seas.upenn.edu> wrote:
>>>>
>>>>
>>>> Budget replenishment is now handled by a dedicated timer which is
>>>> triggered at the most imminent release time of all runnable vcpus.
>>>>
>>>> Changes since V1:
>>>>       None
>>>>
>>>> Signed-off-by: Tianyang Chen <tiche@seas.upenn.edu>
>>>> Signed-off-by: Meng Xu <mengxu@cis.upenn.edu>
>>>> Signed-off-by: Dagaen Golomb <dgolomb@seas.upenn.edu>
>>>> ---
>>>>    xen/common/sched_rt.c |  159 +++++++++++++++++++++++++++++--------------------
>>>>    1 file changed, 95 insertions(+), 64 deletions(-)
>>>>
>>>> diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
>>>> index 4372486..d522272 100644
>>>> --- a/xen/common/sched_rt.c
>>>> +++ b/xen/common/sched_rt.c
>>>> @@ -16,6 +16,7 @@
>>>>    #include <xen/delay.h>
>>>>    #include <xen/event.h>
>>>>    #include <xen/time.h>
>>>> +#include <xen/timer.h>
>>>>    #include <xen/perfc.h>
>>>>    #include <xen/sched-if.h>
>>>>    #include <xen/softirq.h>
>>>> @@ -147,6 +148,16 @@ static unsigned int nr_rt_ops;
>>>>     * Global lock is referenced by schedule_data.schedule_lock from all
>>>>     * physical cpus. It can be grabbed via vcpu_schedule_lock_irq()
>>>>     */
>>>> +
>>>> +/* dedicated timer for replenishment */
>>>> +static struct timer repl_timer;
>>>> +
>>>> +/* controls when to first start the timer*/
>>>
>>>
>>>
>>> missing a space between * and / at the end of this line.
>>>
>>>
>>>>
>>>> +static int timer_started;
>>>> +
>>>> +/* handler for the replenishment timer */
>>>> +static void repl_handler(void *data);
>>>> +
>>>>    struct rt_private {
>>>>        spinlock_t lock;            /* the global coarse grand lock */
>>>>        struct list_head sdom;      /* list of availalbe domains, used for dump */
>>>> @@ -426,6 +437,7 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
>>>>    static int
>>>>    rt_init(struct scheduler *ops)
>>>>    {
>>>> +    const int cpu = smp_processor_id();
>>>
>>>
>>>
>>> Did the "cpu" be used? If not, please remove it here.
>>>
>>>
>>>>
>>>>        struct rt_private *prv = xzalloc(struct rt_private);
>>>>
>>>>        printk("Initializing RTDS scheduler\n"
>>>> @@ -454,6 +466,8 @@ rt_init(struct scheduler *ops)
>>>>
>>>>        ops->sched_data = prv;
>>>>
>>>> +    init_timer(&repl_timer, repl_handler, ops, 0);
>>>> +
>>>>        return 0;
>>>>
>>>>     no_mem:
>>>> @@ -473,6 +487,9 @@ rt_deinit(const struct scheduler *ops)
>>>>            xfree(_cpumask_scratch);
>>>>            _cpumask_scratch = NULL;
>>>>        }
>>>> +
>>>> +    kill_timer(&repl_timer);
>>>> +
>>>>        xfree(prv);
>>>>    }
>>>>
>>>> @@ -635,6 +652,13 @@ rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
>>>>
>>>>        /* add rt_vcpu svc to scheduler-specific vcpu list of the dom */
>>>>        list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu);
>>>> +
>>>> +    if(!timer_started)
>>>
>>>
>>>
>>> space should be added..
>>
>>
>> Thanks, Meng. I will fix that.
>>
>>
>>>
>>>
>>>>
>>>> +    {
>>>> +        /* the first vcpu starts the timer for the first time*/
>>>> +        timer_started = 1;
>>>> +        set_timer(&repl_timer,svc->cur_deadline);
>>>> +    }
>>>>    }
>>>>
>>>>    /*
>>>> @@ -792,44 +816,6 @@ __runq_pick(const struct scheduler *ops, const cpumask_t *mask)
>>>>    }
>>>>
>>>>    /*
>>>> - * Update vcpu's budget and
>>>> - * sort runq by insert the modifed vcpu back to runq
>>>> - * lock is grabbed before calling this function
>>>> - */
>>>> -static void
>>>> -__repl_update(const struct scheduler *ops, s_time_t now)
>>>> -{
>>>> -    struct list_head *runq = rt_runq(ops);
>>>> -    struct list_head *depletedq = rt_depletedq(ops);
>>>> -    struct list_head *iter;
>>>> -    struct list_head *tmp;
>>>> -    struct rt_vcpu *svc = NULL;
>>>> -
>>>> -    list_for_each_safe(iter, tmp, runq)
>>>> -    {
>>>> -        svc = __q_elem(iter);
>>>> -        if ( now < svc->cur_deadline )
>>>> -            break;
>>>> -
>>>> -        rt_update_deadline(now, svc);
>>>> -        /* reinsert the vcpu if its deadline is updated */
>>>> -        __q_remove(svc);
>>>> -        __runq_insert(ops, svc);
>>>> -    }
>>>> -
>>>> -    list_for_each_safe(iter, tmp, depletedq)
>>>> -    {
>>>> -        svc = __q_elem(iter);
>>>> -        if ( now >= svc->cur_deadline )
>>>> -        {
>>>> -            rt_update_deadline(now, svc);
>>>> -            __q_remove(svc); /* remove from depleted queue */
>>>> -            __runq_insert(ops, svc); /* add to runq */
>>>> -        }
>>>> -    }
>>>> -}
>>>> -
>>>> -/*
>>>>     * schedule function for rt scheduler.
>>>>     * The lock is already grabbed in schedule.c, no need to lock here
>>>>     */
>>>> @@ -848,7 +834,6 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
>>>>        /* burn_budget would return for IDLE VCPU */
>>>>        burn_budget(ops, scurr, now);
>>>>
>>>> -    __repl_update(ops, now);
>>>>
>>>>        if ( tasklet_work_scheduled )
>>>>        {
>>>> @@ -889,7 +874,7 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
>>>>            }
>>>>        }
>>>>
>>>> -    ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */
>>>> +    ret.time = snext->budget; /* invoke the scheduler next time */
>>>>        ret.task = snext->vcpu;
>>>>
>>>>        /* TRACE */
>>>> @@ -1033,10 +1018,6 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
>>>>    {
>>>>        struct rt_vcpu * const svc = rt_vcpu(vc);
>>>>        s_time_t now = NOW();
>>>> -    struct rt_private *prv = rt_priv(ops);
>>>> -    struct rt_vcpu *snext = NULL; /* highest priority on RunQ */
>>>> -    struct rt_dom *sdom = NULL;
>>>> -    cpumask_t *online;
>>>>
>>>>        BUG_ON( is_idle_vcpu(vc) );
>>>>
>>>> @@ -1074,14 +1055,7 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
>>>>        /* insert svc to runq/depletedq because svc is not in queue now */
>>>>        __runq_insert(ops, svc);
>>>>
>>>> -    __repl_update(ops, now);
>>>> -
>>>> -    ASSERT(!list_empty(&prv->sdom));
>>>> -    sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
>>>> -    online = cpupool_scheduler_cpumask(sdom->dom->cpupool);
>>>> -    snext = __runq_pick(ops, online); /* pick snext from ALL valid cpus */
>>>> -
>>>> -    runq_tickle(ops, snext);
>>>> +    runq_tickle(ops, svc);
>>>>
>>>>        return;
>>>>    }
>>>> @@ -1094,10 +1068,6 @@ static void
>>>>    rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
>>>>    {
>>>>        struct rt_vcpu *svc = rt_vcpu(vc);
>>>> -    struct rt_vcpu *snext = NULL;
>>>> -    struct rt_dom *sdom = NULL;
>>>> -    struct rt_private *prv = rt_priv(ops);
>>>> -    cpumask_t *online;
>>>>        spinlock_t *lock = vcpu_schedule_lock_irq(vc);
>>>>
>>>>        clear_bit(__RTDS_scheduled, &svc->flags);
>>>> @@ -1108,15 +1078,8 @@ rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
>>>>        if ( test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) &&
>>>>             likely(vcpu_runnable(vc)) )
>>>>        {
>>>> +        /* only insert the pre-empted vcpu back*/
>>>>            __runq_insert(ops, svc);
>>>> -        __repl_update(ops, NOW());
>>>> -
>>>> -        ASSERT(!list_empty(&prv->sdom));
>>>> -        sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
>>>> -        online = cpupool_scheduler_cpumask(sdom->dom->cpupool);
>>>> -        snext = __runq_pick(ops, online); /* pick snext from ALL cpus */
>>>> -
>>>> -        runq_tickle(ops, snext);
>>>>        }
>>>>    out:
>>>>        vcpu_schedule_unlock_irq(lock, vc);
>>>> @@ -1167,6 +1130,74 @@ rt_dom_cntl(
>>>>        return rc;
>>>>    }
>>>>
>>>> +static void repl_handler(void *data){
>>>> +    unsigned long flags;
>>>> +    s_time_t now = NOW();
>>>> +    s_time_t min_repl = LONG_MAX; /* max time used in comparison*/
>>>> +    struct scheduler *ops = data;
>>>> +    struct rt_private *prv = rt_priv(ops);
>>>> +    struct list_head *runq = rt_runq(ops);
>>>> +    struct list_head *depletedq = rt_depletedq(ops);
>>>> +    struct list_head *iter;
>>>> +    struct list_head *tmp;
>>>> +    struct rt_vcpu *svc = NULL;
>>>> +
>>>> +    spin_lock_irqsave(&prv->lock,flags);
>>>> +
>>>> +    stop_timer(&repl_timer);
>>>
>>>
>>>
>>> Move the stop_timer before spin_lock_irqsave() since no more than one
>>> core will run this handler in the current design.
>>
>>
>> Right. We want the critical section as short as possible.
>>
>>>
>>>>
>>>>
>>>>
>>>> +
>>>> +    list_for_each_safe(iter, tmp, runq)
>>>> +    {
>>>> +        svc = __q_elem(iter);
>>>> +        if ( now < svc->cur_deadline )
>>>> +            break;
>>>> +        rt_update_deadline(now, svc);
>>>> +        /* scan the runq to find the min release time
>>>> +         * this happens when vcpus on runq miss deadline
>>>> +         */
>>>> +        if( min_repl> svc->cur_deadline )
>>>> +        {
>>>> +            min_repl = svc->cur_deadline;
>>>> +        }
>>>
>>>
>>> The logic here is incorrect. The comment you put is correct, but the
>>> code didn't reflect the comment.
>>>
>>> When the timer's handler is called, now should be equal to min_repl,
>>> shouldn't it?
>>> If now = min_repl, this if statement will never be called because the
>>> loop stops if ( now < svc->cur_deadline ).
>>>
>>
>> Yes the handler will be invoked when it was programmed to but the value of min_repl is set to be LONG_MAX initially. It doesn't preserve values. Therefore, the IF statement is feasible.
>
>
> Hmm, let's have a look  at the logic...
>
> the min_repl. will only be possibly updated to svc->cur_deadline when
> now >= svc->cur_deadline, right?
>
> In other words, min_repl will  only be possibly updated to a time
> point before now. Shouldn't the min_repl be the earliest time *in the
> future* that the handler should be triggered? If not, what is the
> definition/meaning of min_repl here?
>
> At the end of the function, you did reprogram the timer to min_repl.
>         /* reprogram the timer using the most imminent release time*/
>         set_timer(&repl_timer, min_repl);
>
> So I think the logic is incorrect here...
>
> You should first update the budget and deadline of each VCPUs in runq,
> depletedq and running VCPUs, and (then) find the earliest time in the
> future when the handler should be triggered.
>

Yeah. Although the next release time was actually found after a vcpu was 
replenished. If the runq still has vcpus pending but not missing a 
deadline, the break statement on the first iteration will not check the 
release time.

Since the runq is sorted, it's easier to just pick the next one after 
both runq and depletedq are updated.

>>
>>
>>
>>> Only after you have replenished the vcpus on runq and depletedq,
>>> should you pick the earliest deadline of the front vcpus at the runq
>>> and depletedq as the min_repl.
>>>
>>> Please correct me  if I missed anything. :-)
>>>
>>>
>>>>
>>>> +        /* reinsert the vcpu if its deadline is updated */
>>>> +        __q_remove(svc);
>>>> +        __runq_insert(ops, svc);
>>>> +    }
>>>> +
>>>> +    list_for_each_safe(iter, tmp, depletedq)
>>>> +    {
>>>> +        svc = __q_elem(iter);
>>>> +        if ( now >= svc->cur_deadline )
>>>> +        {
>>>> +            rt_update_deadline(now, svc);
>>>> +
>>>> +            /* scan the delp_q to find the minimal release time */
>>>> +            if(min_repl > svc->cur_deadline)
>>>> +            {
>>>> +                min_repl = svc->cur_deadline;
>>>> +            }
>>>
>>>
>>>
>>> Please see the above comment. It may have the same issue here.
>>>
>>> In addition, the current running VCPUs on cores also need to replenish
>>> their budgets at the beginning of their next periods. Those running
>>> VCPUs are not in runq or depletedq in the current RTDS scheduler. So
>>> this patch won't replenish the current running VCPUs' budget until
>>> they are put back to runq or depletedq, which is incorrect. Think
>>> about the following  scenario: a VCPU with very small period  has a
>>> task always running on it to keep this vcpu always runnable; This VCPU
>>> has its period equal to its budget. Suppose this VCPU is the only VCPU
>>> on a 4 core machine. This VCPU should keep running on one core and
>>> never be put back to runq. In the current code, this VCPU won't have
>>> its budget replenished. Right? :-)
>>>
>>> You can choose one of the following three approaches to fix it:
>>> a) add the running VCPUs into the runq
>>> b) use another queue to keep track of the current running VCPUs
>>> c) just scan each core (in the cpupool) to get the current running VCPU on it.
>>> I personally prefer c) but I'm looking forward to others' opinion. :-)
>>>

I experimented a little bit with adding a runningq to keep track of all 
running vcpus. A vcpu will be inserted when it's picked as snext in 
rt_schedule(). It will be removed from runningq after the context switch 
is finished in context_saved(). Also, a currently running vcpu could be 
marked as RUNSTATE_offline in sleep() and added back to runq in wake(). 
Does this sound reasonable if we were to use a q to keep track of 
running vcpus?

>>>> +            __q_remove(svc);
>>>> +            __runq_insert(ops, svc);
>>>> +            runq_tickle(ops, svc);
>>>> +        }
>>>> +    }
>>>> +
>>>> +    /* if timer was triggered but none of the vcpus
>>>> +     * need to be refilled, set the timer to be the
>>>> +     * default period + now
>>>> +     */
>>>
>>>
>>>
>>> Hmm, when could this happen? If it could happen, please specify the situation.
>>> Otherwise, it may be better to print a warning (or cause crash?) when
>>> you set the timer to be default period + now?
>>>
>>> I'm not quite sure which is better, warning or crash?
>>>
>>>
>>>>
>>>> +    if(min_repl == LONG_MAX)
>>>> +    {
>>>>
>>>> +        set_timer(&repl_timer, now + RTDS_DEFAULT_PERIOD);
>>>> +    }
>>>
>>>
>>>
>>> Hmm,
>>>
>>> when could this happen? The only situation in my mind is that there is
>>> no vcpus in the cpupool. But if that is the case, shouldn't the timer
>>> never be triggered? Then this handler will never be called.
>>
>>
>> This is actually still a question for me. When I was testing the code in a virtual machine, it happens when the system boots up. I am guessing it's related to the same problem in nested-virtualization (just guessing). When I tested it on a real machine, it locks up and won't even respond to any key-board events.
>
>
>
> First, we care about the real machine scenario. It should always run
> well on real machine. The scheduler design didn't consider the
> nest-virt. situation, so it is ok for now to have performance
> degradation or issues in nest-virt. situation, IMHO.
>
> Second, the reason why it locks up is probably because the timer of
> the replenish handler is not correctly set up as I pointed out above.
> So try to fix the above issue first and see if this still occurs.
>
Right, sometimes there is no vcpus on q at all because when the timer is 
programmed to trigger, a running vcpu hasn't been added back to the q 
yet. In this situation, correct me if I'm wrong, the repl_handler is 
called before the context switch is finished. However, scanning the 
currently running vcpus can fix this.
>
>>
>> After some debugging, it appeared that the timer was only triggered once and in the timer handler, none of the vcpus was replenished.
>
>
> Because you set the timer to a past time above.
>
>
>>
>> After the the system boots up, I have not seen any debug prints indicating that the timer is not programmed based on the release time. Besides looking for why, I think we should set some maximum interval for the timer to do the replenishment.
>
>
> Well, this is not correct. Replenishment should happen at some
> specific time, i.e., the start of a new period of any VCPU; It should
> not be an arbitrary value. Otherwise, the VCPU budget and deadline
> won't be updated correctly...
>
>>
>>
>>
>>
>>>
>>>> +    else
>>>> +    {
>>>> +        /* reprogram the timer using the most imminent release time*/
>>>> +        set_timer(&repl_timer, min_repl);
>>>> +    }
>>>>
>>>> +    spin_unlock_irqrestore(&prv->lock,flags);
>>>
>>>
>>>
>>> Same here, need to move spin_unlock_ before the if statement.
>>>
>>>> +}
>>>> +
>>>>    static struct rt_private _rt_priv;
>>>>
>>>>    const struct scheduler sched_rtds_def = {
>>>> --
>>>> 1.7.9.5
>>>>
>
> Best,
>
> Meng
>
>
>
> -----------
> Meng Xu
> PhD Student in Computer and Information Science
> University of Pennsylvania
> http://www.cis.upenn.edu/~mengxu/
>
Thanks,
Tianyang Chen
Meng Xu Jan. 19, 2016, 4:53 p.m. UTC | #5
On Tue, Jan 19, 2016 at 11:35 AM, Tianyang Chen <tiche@seas.upenn.edu> wrote:
>
>
>
> On 1/15/2016 10:33 AM, Meng Xu wrote:
>>
>> On Wed, Jan 6, 2016 at 2:57 AM, Tianyang Chen <tiche@seas.upenn.edu> wrote:
>>>
>>>
>>>
>>>
>>> On 12/31/2015 8:44 AM, Meng Xu wrote:
>>>>
>>>>
>>>> On Thu, Dec 31, 2015 at 6:20 PM, Tianyang Chen <tiche@seas.upenn.edu> wrote:
>>>>>
>>>>>
>>>>>
>>>>> Budget replenishment is now handled by a dedicated timer which is
>>>>> triggered at the most imminent release time of all runnable vcpus.
>>>>>
>>>>> Changes since V1:
>>>>>       None
>>>>>
>>>>> Signed-off-by: Tianyang Chen <tiche@seas.upenn.edu>
>>>>> Signed-off-by: Meng Xu <mengxu@cis.upenn.edu>
>>>>> Signed-off-by: Dagaen Golomb <dgolomb@seas.upenn.edu>
>>>>> ---
>>>>>    xen/common/sched_rt.c |  159 +++++++++++++++++++++++++++++--------------------
>>>>>    1 file changed, 95 insertions(+), 64 deletions(-)
>>>>>
>>>>> diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
>>>>> index 4372486..d522272 100644
>>>>> --- a/xen/common/sched_rt.c
>>>>> +++ b/xen/common/sched_rt.c
>>>>> @@ -16,6 +16,7 @@
>>>>>    #include <xen/delay.h>
>>>>>    #include <xen/event.h>
>>>>>    #include <xen/time.h>
>>>>> +#include <xen/timer.h>
>>>>>    #include <xen/perfc.h>
>>>>>    #include <xen/sched-if.h>
>>>>>    #include <xen/softirq.h>
>>>>> @@ -147,6 +148,16 @@ static unsigned int nr_rt_ops;
>>>>>     * Global lock is referenced by schedule_data.schedule_lock from all
>>>>>     * physical cpus. It can be grabbed via vcpu_schedule_lock_irq()
>>>>>     */
>>>>> +
>>>>> +/* dedicated timer for replenishment */
>>>>> +static struct timer repl_timer;
>>>>> +
>>>>> +/* controls when to first start the timer*/
>>>>
>>>>
>>>>
>>>>
>>>> missing a space between * and / at the end of this line.
>>>>
>>>>
>>>>>
>>>>> +static int timer_started;
>>>>> +
>>>>> +/* handler for the replenishment timer */
>>>>> +static void repl_handler(void *data);
>>>>> +
>>>>>    struct rt_private {
>>>>>        spinlock_t lock;            /* the global coarse grand lock */
>>>>>        struct list_head sdom;      /* list of availalbe domains, used for dump */
>>>>> @@ -426,6 +437,7 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
>>>>>    static int
>>>>>    rt_init(struct scheduler *ops)
>>>>>    {
>>>>> +    const int cpu = smp_processor_id();
>>>>
>>>>
>>>>
>>>>
>>>> Did the "cpu" be used? If not, please remove it here.
>>>>
>>>>
>>>>>
>>>>>        struct rt_private *prv = xzalloc(struct rt_private);
>>>>>
>>>>>        printk("Initializing RTDS scheduler\n"
>>>>> @@ -454,6 +466,8 @@ rt_init(struct scheduler *ops)
>>>>>
>>>>>        ops->sched_data = prv;
>>>>>
>>>>> +    init_timer(&repl_timer, repl_handler, ops, 0);
>>>>> +
>>>>>        return 0;
>>>>>
>>>>>     no_mem:
>>>>> @@ -473,6 +487,9 @@ rt_deinit(const struct scheduler *ops)
>>>>>            xfree(_cpumask_scratch);
>>>>>            _cpumask_scratch = NULL;
>>>>>        }
>>>>> +
>>>>> +    kill_timer(&repl_timer);
>>>>> +
>>>>>        xfree(prv);
>>>>>    }
>>>>>
>>>>> @@ -635,6 +652,13 @@ rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
>>>>>
>>>>>        /* add rt_vcpu svc to scheduler-specific vcpu list of the dom */
>>>>>        list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu);
>>>>> +
>>>>> +    if(!timer_started)
>>>>
>>>>
>>>>
>>>>
>>>> space should be added..
>>>
>>>
>>>
>>> Thanks, Meng. I will fix that.
>>>
>>>
>>>>
>>>>
>>>>>
>>>>> +    {
>>>>> +        /* the first vcpu starts the timer for the first time*/
>>>>> +        timer_started = 1;
>>>>> +        set_timer(&repl_timer,svc->cur_deadline);
>>>>> +    }
>>>>>    }
>>>>>
>>>>>    /*
>>>>> @@ -792,44 +816,6 @@ __runq_pick(const struct scheduler *ops, const cpumask_t *mask)
>>>>>    }
>>>>>
>>>>>    /*
>>>>> - * Update vcpu's budget and
>>>>> - * sort runq by insert the modifed vcpu back to runq
>>>>> - * lock is grabbed before calling this function
>>>>> - */
>>>>> -static void
>>>>> -__repl_update(const struct scheduler *ops, s_time_t now)
>>>>> -{
>>>>> -    struct list_head *runq = rt_runq(ops);
>>>>> -    struct list_head *depletedq = rt_depletedq(ops);
>>>>> -    struct list_head *iter;
>>>>> -    struct list_head *tmp;
>>>>> -    struct rt_vcpu *svc = NULL;
>>>>> -
>>>>> -    list_for_each_safe(iter, tmp, runq)
>>>>> -    {
>>>>> -        svc = __q_elem(iter);
>>>>> -        if ( now < svc->cur_deadline )
>>>>> -            break;
>>>>> -
>>>>> -        rt_update_deadline(now, svc);
>>>>> -        /* reinsert the vcpu if its deadline is updated */
>>>>> -        __q_remove(svc);
>>>>> -        __runq_insert(ops, svc);
>>>>> -    }
>>>>> -
>>>>> -    list_for_each_safe(iter, tmp, depletedq)
>>>>> -    {
>>>>> -        svc = __q_elem(iter);
>>>>> -        if ( now >= svc->cur_deadline )
>>>>> -        {
>>>>> -            rt_update_deadline(now, svc);
>>>>> -            __q_remove(svc); /* remove from depleted queue */
>>>>> -            __runq_insert(ops, svc); /* add to runq */
>>>>> -        }
>>>>> -    }
>>>>> -}
>>>>> -
>>>>> -/*
>>>>>     * schedule function for rt scheduler.
>>>>>     * The lock is already grabbed in schedule.c, no need to lock here
>>>>>     */
>>>>> @@ -848,7 +834,6 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
>>>>>        /* burn_budget would return for IDLE VCPU */
>>>>>        burn_budget(ops, scurr, now);
>>>>>
>>>>> -    __repl_update(ops, now);
>>>>>
>>>>>        if ( tasklet_work_scheduled )
>>>>>        {
>>>>> @@ -889,7 +874,7 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
>>>>>            }
>>>>>        }
>>>>>
>>>>> -    ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */
>>>>> +    ret.time = snext->budget; /* invoke the scheduler next time */
>>>>>        ret.task = snext->vcpu;
>>>>>
>>>>>        /* TRACE */
>>>>> @@ -1033,10 +1018,6 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
>>>>>    {
>>>>>        struct rt_vcpu * const svc = rt_vcpu(vc);
>>>>>        s_time_t now = NOW();
>>>>> -    struct rt_private *prv = rt_priv(ops);
>>>>> -    struct rt_vcpu *snext = NULL; /* highest priority on RunQ */
>>>>> -    struct rt_dom *sdom = NULL;
>>>>> -    cpumask_t *online;
>>>>>
>>>>>        BUG_ON( is_idle_vcpu(vc) );
>>>>>
>>>>> @@ -1074,14 +1055,7 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
>>>>>        /* insert svc to runq/depletedq because svc is not in queue now */
>>>>>        __runq_insert(ops, svc);
>>>>>
>>>>> -    __repl_update(ops, now);
>>>>> -
>>>>> -    ASSERT(!list_empty(&prv->sdom));
>>>>> -    sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
>>>>> -    online = cpupool_scheduler_cpumask(sdom->dom->cpupool);
>>>>> -    snext = __runq_pick(ops, online); /* pick snext from ALL valid cpus */
>>>>> -
>>>>> -    runq_tickle(ops, snext);
>>>>> +    runq_tickle(ops, svc);
>>>>>
>>>>>        return;
>>>>>    }
>>>>> @@ -1094,10 +1068,6 @@ static void
>>>>>    rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
>>>>>    {
>>>>>        struct rt_vcpu *svc = rt_vcpu(vc);
>>>>> -    struct rt_vcpu *snext = NULL;
>>>>> -    struct rt_dom *sdom = NULL;
>>>>> -    struct rt_private *prv = rt_priv(ops);
>>>>> -    cpumask_t *online;
>>>>>        spinlock_t *lock = vcpu_schedule_lock_irq(vc);
>>>>>
>>>>>        clear_bit(__RTDS_scheduled, &svc->flags);
>>>>> @@ -1108,15 +1078,8 @@ rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
>>>>>        if ( test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) &&
>>>>>             likely(vcpu_runnable(vc)) )
>>>>>        {
>>>>> +        /* only insert the pre-empted vcpu back*/
>>>>>            __runq_insert(ops, svc);
>>>>> -        __repl_update(ops, NOW());
>>>>> -
>>>>> -        ASSERT(!list_empty(&prv->sdom));
>>>>> -        sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
>>>>> -        online = cpupool_scheduler_cpumask(sdom->dom->cpupool);
>>>>> -        snext = __runq_pick(ops, online); /* pick snext from ALL cpus */
>>>>> -
>>>>> -        runq_tickle(ops, snext);
>>>>>        }
>>>>>    out:
>>>>>        vcpu_schedule_unlock_irq(lock, vc);
>>>>> @@ -1167,6 +1130,74 @@ rt_dom_cntl(
>>>>>        return rc;
>>>>>    }
>>>>>
>>>>> +static void repl_handler(void *data){
>>>>> +    unsigned long flags;
>>>>> +    s_time_t now = NOW();
>>>>> +    s_time_t min_repl = LONG_MAX; /* max time used in comparison*/
>>>>> +    struct scheduler *ops = data;
>>>>> +    struct rt_private *prv = rt_priv(ops);
>>>>> +    struct list_head *runq = rt_runq(ops);
>>>>> +    struct list_head *depletedq = rt_depletedq(ops);
>>>>> +    struct list_head *iter;
>>>>> +    struct list_head *tmp;
>>>>> +    struct rt_vcpu *svc = NULL;
>>>>> +
>>>>> +    spin_lock_irqsave(&prv->lock,flags);
>>>>> +
>>>>> +    stop_timer(&repl_timer);
>>>>
>>>>
>>>>
>>>>
>>>> Move the stop_timer before spin_lock_irqsave() since no more than one
>>>> core will run this handler in the current design.
>>>
>>>
>>>
>>> Right. We want the critical section as short as possible.
>>>
>>>>
>>>>>
>>>>>
>>>>>
>>>>> +
>>>>> +    list_for_each_safe(iter, tmp, runq)
>>>>> +    {
>>>>> +        svc = __q_elem(iter);
>>>>> +        if ( now < svc->cur_deadline )
>>>>> +            break;
>>>>> +        rt_update_deadline(now, svc);
>>>>> +        /* scan the runq to find the min release time
>>>>> +         * this happens when vcpus on runq miss deadline
>>>>> +         */
>>>>> +        if( min_repl> svc->cur_deadline )
>>>>> +        {
>>>>> +            min_repl = svc->cur_deadline;
>>>>> +        }
>>>>
>>>>
>>>>
>>>> The logic here is incorrect. The comment you put is correct, but the
>>>> code didn't reflect the comment.
>>>>
>>>> When the timer's handler is called, now should be equal to min_repl,
>>>> shouldn't it?
>>>> If now = min_repl, this if statement will never be called because the
>>>> loop stops if ( now < svc->cur_deadline ).
>>>>
>>>
>>> Yes the handler will be invoked when it was programmed to but the value of min_repl is set to be LONG_MAX initially. It doesn't preserve values. Therefore, the IF statement is feasible.
>>
>>
>>
>> Hmm, let's have a look  at the logic...
>>
>> the min_repl. will only be possibly updated to svc->cur_deadline when
>> now >= svc->cur_deadline, right?
>>
>> In other words, min_repl will  only be possibly updated to a time
>> point before now. Shouldn't the min_repl be the earliest time *in the
>> future* that the handler should be triggered? If not, what is the
>> definition/meaning of min_repl here?
>>
>> At the end of the function, you did reprogram the timer to min_repl.
>>         /* reprogram the timer using the most imminent release time*/
>>         set_timer(&repl_timer, min_repl);
>>
>> So I think the logic is incorrect here...
>>
>> You should first update the budget and deadline of each VCPUs in runq,
>> depletedq and running VCPUs, and (then) find the earliest time in the
>> future when the handler should be triggered.
>>
>
> Yeah. Although the next release time was actually found after a vcpu was replenished. If the runq still has vcpus pending but not missing a deadline, the break statement on the first iteration will not check the release time.
>
> Since the runq is sorted, it's easier to just pick the next one after both runq and depletedq are updated.
>
>
>>>
>>>
>>>
>>>> Only after you have replenished the vcpus on runq and depletedq,
>>>> should you pick the earliest deadline of the front vcpus at the runq
>>>> and depletedq as the min_repl.
>>>>
>>>> Please correct me  if I missed anything. :-)
>>>>
>>>>
>>>>>
>>>>> +        /* reinsert the vcpu if its deadline is updated */
>>>>> +        __q_remove(svc);
>>>>> +        __runq_insert(ops, svc);
>>>>> +    }
>>>>> +
>>>>> +    list_for_each_safe(iter, tmp, depletedq)
>>>>> +    {
>>>>> +        svc = __q_elem(iter);
>>>>> +        if ( now >= svc->cur_deadline )
>>>>> +        {
>>>>> +            rt_update_deadline(now, svc);
>>>>> +
>>>>> +            /* scan the delp_q to find the minimal release time */
>>>>> +            if(min_repl > svc->cur_deadline)
>>>>> +            {
>>>>> +                min_repl = svc->cur_deadline;
>>>>> +            }
>>>>
>>>>
>>>>
>>>>
>>>> Please see the above comment. It may have the same issue here.
>>>>
>>>> In addition, the current running VCPUs on cores also need to replenish
>>>> their budgets at the beginning of their next periods. Those running
>>>> VCPUs are not in runq or depletedq in the current RTDS scheduler. So
>>>> this patch won't replenish the current running VCPUs' budget until
>>>> they are put back to runq or depletedq, which is incorrect. Think
>>>> about the following  scenario: a VCPU with very small period  has a
>>>> task always running on it to keep this vcpu always runnable; This VCPU
>>>> has its period equal to its budget. Suppose this VCPU is the only VCPU
>>>> on a 4 core machine. This VCPU should keep running on one core and
>>>> never be put back to runq. In the current code, this VCPU won't have
>>>> its budget replenished. Right? :-)
>>>>
>>>> You can choose one of the following three approaches to fix it:
>>>> a) add the running VCPUs into the runq
>>>> b) use another queue to keep track of the current running VCPUs
>>>> c) just scan each core (in the cpupool) to get the current running VCPU on it.
>>>> I personally prefer c) but I'm looking forward to others' opinion. :-)
>>>>
>
> I experimented a little bit with adding a runningq to keep track of all running vcpus. A vcpu will be inserted when it's picked as snext in rt_schedule(). It will be removed from runningq after the context switch is finished in context_saved(). Also, a currently running vcpu could be marked as RUNSTATE_offline in sleep() and added back to runq in wake(). Does this sound reasonable if we were to use a q to keep track of running vcpus?


I think so. The name of that queue can be runningq, IMHO.


>
>
>
>>>>> +            __q_remove(svc);
>>>>> +            __runq_insert(ops, svc);
>>>>> +            runq_tickle(ops, svc);
>>>>> +        }
>>>>> +    }
>>>>> +
>>>>> +    /* if timer was triggered but none of the vcpus
>>>>> +     * need to be refilled, set the timer to be the
>>>>> +     * default period + now
>>>>> +     */
>>>>
>>>>
>>>>
>>>>
>>>> Hmm, when could this happen? If it could happen, please specify the situation.
>>>> Otherwise, it may be better to print a warning (or cause crash?) when
>>>> you set the timer to be default period + now?
>>>>
>>>> I'm not quite sure which is better, warning or crash?
>>>>
>>>>
>>>>>
>>>>> +    if(min_repl == LONG_MAX)
>>>>> +    {
>>>>>
>>>>> +        set_timer(&repl_timer, now + RTDS_DEFAULT_PERIOD);
>>>>> +    }
>>>>
>>>>
>>>>
>>>>
>>>> Hmm,
>>>>
>>>> when could this happen? The only situation in my mind is that there is
>>>> no vcpus in the cpupool. But if that is the case, shouldn't the timer
>>>> never be triggered? Then this handler will never be called.
>>>
>>>
>>>
>>> This is actually still a question for me. When I was testing the code in a virtual machine, it happens when the system boots up. I am guessing it's related to the same problem in nested-virtualization (just guessing). When I tested it on a real machine, it locks up and won't even respond to any key-board events.
>>
>>
>>
>>
>> First, we care about the real machine scenario. It should always run
>> well on real machine. The scheduler design didn't consider the
>> nest-virt. situation, so it is ok for now to have performance
>> degradation or issues in nest-virt. situation, IMHO.
>>
>> Second, the reason why it locks up is probably because the timer of
>> the replenish handler is not correctly set up as I pointed out above.
>> So try to fix the above issue first and see if this still occurs.
>>
> Right, sometimes there is no vcpus on q at all because when the timer is programmed to trigger, a running vcpu hasn't been added back to the q yet. In this situation, correct me if I'm wrong, the repl_handler is called before the context switch is finished. However, scanning the currently running vcpus can fix this.


Yes.

Meng
Dario Faggioli Jan. 25, 2016, 9 a.m. UTC | #6
On Thu, 2015-12-31 at 05:20 -0500, Tianyang Chen wrote:

> @@ -147,6 +148,16 @@ static unsigned int nr_rt_ops;
>   * Global lock is referenced by schedule_data.schedule_lock from all
>   * physical cpus. It can be grabbed via vcpu_schedule_lock_irq()
>   */
> +
> +/* dedicated timer for replenishment */
> +static struct timer repl_timer;
> +
So, there's always only one timer... Even if we have multiple cpupool
with RTDS as their scheduler, they share the replenishment timer? I
think it makes more sense to make this per-scheduler.

> +/* controls when to first start the timer*/
> +static int timer_started;
> +
I don't like this, and I don't think we need it. In fact, you removed
it yourself from v3, AFAICT.

> @@ -635,6 +652,13 @@ rt_vcpu_insert(const struct scheduler *ops,
> struct vcpu *vc)
>  
>      /* add rt_vcpu svc to scheduler-specific vcpu list of the dom */
>      list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu);
> +
> +    if(!timer_started)
> +    {   
> +        /* the first vcpu starts the timer for the first time*/
> +        timer_started = 1;
> +        set_timer(&repl_timer,svc->cur_deadline);
> +    }
>  }
>  
This also seems to be gone in v3, which is good. In fact, it uses
timer_started, which I already said I didn't like.

About the actual startup of the timer (no matter whether for first time
or not). Here, you were doing it in _vcpu_insert() and not in
_vcpu_wake(); in v3 you're doing it in _vcpu_wake() and not in
_runq_insert()... Which one is the proper way?

> @@ -792,44 +816,6 @@ __runq_pick(const struct scheduler *ops, const
> cpumask_t *mask)
>  }
>  
>  /*
> - * Update vcpu's budget and
> - * sort runq by insert the modifed vcpu back to runq
> - * lock is grabbed before calling this function
> - */
> -static void
> -__repl_update(const struct scheduler *ops, s_time_t now)
> -{
> 
Please, allow me to say that seeing this function going away, fills my
heart with pure joy!! :-D

> @@ -889,7 +874,7 @@ rt_schedule(const struct scheduler *ops, s_time_t
> now, bool_t tasklet_work_sched
>          }
>      }
>  
> -    ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */
> +    ret.time = snext->budget; /* invoke the scheduler next time */
>      ret.task = snext->vcpu;
>  
This is ok as it is done in v3 (i.e., snext->budget if !idle, -1 if
idle).

> @@ -1074,14 +1055,7 @@ rt_vcpu_wake(const struct scheduler *ops,
> struct vcpu *vc)
>      /* insert svc to runq/depletedq because svc is not in queue now
> */
>      __runq_insert(ops, svc);
>  
> -    __repl_update(ops, now);
> -
> -    ASSERT(!list_empty(&prv->sdom));
> -    sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
> -    online = cpupool_scheduler_cpumask(sdom->dom->cpupool);
> -    snext = __runq_pick(ops, online); /* pick snext from ALL valid
> cpus */
> -
> -    runq_tickle(ops, snext);
> +    runq_tickle(ops, svc);
> 
And this is another thing I especially like of this patch: it makes the
wakeup path a lot simpler and a lot more similar to how it looks like
in the other schedulers.

Good job with this. :-)

> @@ -1108,15 +1078,8 @@ rt_context_saved(const struct scheduler *ops,
> struct vcpu *vc)
>      if ( test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) &&
>           likely(vcpu_runnable(vc)) )
>      {
> +        /* only insert the pre-empted vcpu back*/
>          __runq_insert(ops, svc);
> -        __repl_update(ops, NOW());
> -
> -        ASSERT(!list_empty(&prv->sdom));
> -        sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
> -        online = cpupool_scheduler_cpumask(sdom->dom->cpupool);
> -        snext = __runq_pick(ops, online); /* pick snext from ALL
> cpus */
> -
> -        runq_tickle(ops, snext);
>      }
Mmm... I'll think about this more and let you know... But out of the
top of my head, I think the tickling has to stay? You preempted a vcpu
from the pcpu where it was running, maybe some other pcpu is either
idle or running a vcpu with a later deadline, and should come and pick
this one up?

> @@ -1167,6 +1130,74 @@ rt_dom_cntl(
>      return rc;
>  }
>  
> +static void repl_handler(void *data){
> +    unsigned long flags;
> +    s_time_t now = NOW();
> +    s_time_t min_repl = LONG_MAX; /* max time used in comparison*/
> +    struct scheduler *ops = data; 
> +    struct rt_private *prv = rt_priv(ops);
> +    struct list_head *runq = rt_runq(ops);
> +    struct list_head *depletedq = rt_depletedq(ops);
> +    struct list_head *iter;
> +    struct list_head *tmp;
> +    struct rt_vcpu *svc = NULL;
> +
> +    spin_lock_irqsave(&prv->lock,flags);
> +
> +    stop_timer(&repl_timer);
> +
> +    list_for_each_safe(iter, tmp, runq)
> +    {
> 
So, I'm a bit lost here. Why does the _replenishment_ timer's handler
scans the runqueue (and, in v3, the running-queue as well)?

I'd expect the replenishment timer to do, ehm, replenishments... And I
wouldn't expect a vcpu that is either in the runqueue or running to
need a replenishment (and, in case it would, I don't think we should
take care of that here).

> +        svc = __q_elem(iter);
> +        if ( now < svc->cur_deadline )
> +            break;
> +        rt_update_deadline(now, svc);
> +        /* scan the runq to find the min release time 
> +         * this happens when vcpus on runq miss deadline
> +         */
> 
This is exactly my point. It looks to me that you're trying to catch
ready or running vcpus missing their deadline in here, in the
replenishment timer. I don't think this is appropriate... It makes the
logic of the timer handler a lot more complicated than it should be.

Oh, and one thing: the use of the term "release time" is IMO a bit
misleading. Release of what? Typically, the release time of an RT task
(or job) is when the task (or job) is declared ready to run... But I
don't think it's used like this in here.

I propose to just get rid of it.

> +        if( min_repl> svc->cur_deadline )
> +        {
> +            min_repl = svc->cur_deadline;
> +        }
> +        /* reinsert the vcpu if its deadline is updated */
> +        __q_remove(svc);
> +        __runq_insert(ops, svc);
> 
One more proof of what I was trying to say. Is it really this handler's
job to --basically-- re-sort the runqueue? I don't think so.

What is the specific situation that you are trying to handle like this?

In fact, this is also why I'm not convinced of the fact that we need
the additional queue for running vcpus. Later in the thread, Meng
says: 

 "the current running VCPUs on cores also need to replenish their 
  budgets at the beginning of their next periods."

And he makes the following example:

 "[a backlogged] VCPU has its period equal to its budget. Suppose this
  VCPU is the only VCPU on a 4 core machine. This VCPU should keep
  running on one core and never be put back to runq. In the current
  code, this VCPU won't have its budget replenished."

But I don't think I understand. When a vcpu runs out of budget, either:
 a. it needs an immediate replenishment
 b. it needs to go to depletedq, and a replenishment event for it 
    programmed (which may or may not require re-arming the 
    replenishment timer)

Meng's example falls in a., AFAICT, and we can just deal with that when
we handle the 'budget exhausted' event (in rt_schedule(), in this case,
I think).

The case you refer to in the comment above ("when vcpus on runq miss
deadline") can either fall in a. or in b., but in both cases it seems
to me that you can handle it when it happens, instead than inside this
timer handling routine.

> +
> +    /* if timer was triggered but none of the vcpus
> +     * need to be refilled, set the timer to be the   
> +     * default period + now
> +     */
> +    if(min_repl == LONG_MAX)
> +    {
> +        set_timer(&repl_timer, now + RTDS_DEFAULT_PERIOD);
> 
I agree with Meng's point in this thread: this should not be necessary.
If it is, it's most likely because of a bug or to something else.

Let's figure out what it is, and fix it properly. (I see that in v3
this logic is gone, so hopefully you found and fixed the issue
already.)

Thanks and Regards,
Dario
-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
Tianyang Chen Jan. 25, 2016, 10:04 p.m. UTC | #7
I have removed some of the Ccs so they won't get bothered as we 
discussed previously.

On 1/25/2016 4:00 AM, Dario Faggioli wrote:
> On Thu, 2015-12-31 at 05:20 -0500, Tianyang Chen wrote:
>>
>> @@ -147,6 +148,16 @@ static unsigned int nr_rt_ops;
>>    * Global lock is referenced by schedule_data.schedule_lock from all
>>    * physical cpus. It can be grabbed via vcpu_schedule_lock_irq()
>>    */
>> +
>> +/* dedicated timer for replenishment */
>> +static struct timer repl_timer;
>> +
> So, there's always only one timer... Even if we have multiple cpupool
> with RTDS as their scheduler, they share the replenishment timer? I
> think it makes more sense to make this per-scheduler.
>
Yeah, I totally ignored the case for cpu-pools. It looks like when a 
cpu-pool is created, it copies the scheduler struct and calls rt_init() 
where a private field is initialized. So I assume the timer should be 
put inside the scheduler private struct? Now that I think about it, the 
timer is hard-coded to run on cpu0. If there're lots of cpu-pools but 
the replenishment can only be done on the same pcpu, would that be a 
problem? Should we keep track of all instances of schedulers (nr_rt_ops 
counts how many) and just put times on different pcpus?

>> +/* controls when to first start the timer*/
>> +static int timer_started;
>> +
> I don't like this, and I don't think we need it. In fact, you removed
> it yourself from v3, AFAICT.
>
>> @@ -635,6 +652,13 @@ rt_vcpu_insert(const struct scheduler *ops,
>> struct vcpu *vc)
>>
>>       /* add rt_vcpu svc to scheduler-specific vcpu list of the dom */
>>       list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu);
>> +
>> +    if(!timer_started)
>> +    {
>> +        /* the first vcpu starts the timer for the first time*/
>> +        timer_started = 1;
>> +        set_timer(&repl_timer,svc->cur_deadline);
>> +    }
>>   }
>>
> This also seems to be gone in v3, which is good. In fact, it uses
> timer_started, which I already said I didn't like.
>
> About the actual startup of the timer (no matter whether for first time
> or not). Here, you were doing it in _vcpu_insert() and not in
> _vcpu_wake(); in v3 you're doing it in _vcpu_wake() and not in
> _runq_insert()... Which one is the proper way?
>

Correct me if I'm wrong, at the beginning of the boot process, all vcpus 
are put to sleep/not_runnable after insertions. Therefore, the timer 
should start when the first vcpu wakes up. I think the wake() in v3 
should be correct.

>> @@ -792,44 +816,6 @@ __runq_pick(const struct scheduler *ops, const
>> cpumask_t *mask)
>>   }
>>
>>   /*
>> - * Update vcpu's budget and
>> - * sort runq by insert the modifed vcpu back to runq
>> - * lock is grabbed before calling this function
>> - */
>> -static void
>> -__repl_update(const struct scheduler *ops, s_time_t now)
>> -{
>>
> Please, allow me to say that seeing this function going away, fills my
> heart with pure joy!! :-D
>
>> @@ -889,7 +874,7 @@ rt_schedule(const struct scheduler *ops, s_time_t
>> now, bool_t tasklet_work_sched
>>           }
>>       }
>>
>> -    ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */
>> +    ret.time = snext->budget; /* invoke the scheduler next time */
>>       ret.task = snext->vcpu;
>>
> This is ok as it is done in v3 (i.e., snext->budget if !idle, -1 if
> idle).
>
>> @@ -1074,14 +1055,7 @@ rt_vcpu_wake(const struct scheduler *ops,
>> struct vcpu *vc)
>>       /* insert svc to runq/depletedq because svc is not in queue now
>> */
>>       __runq_insert(ops, svc);
>>
>> -    __repl_update(ops, now);
>> -
>> -    ASSERT(!list_empty(&prv->sdom));
>> -    sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
>> -    online = cpupool_scheduler_cpumask(sdom->dom->cpupool);
>> -    snext = __runq_pick(ops, online); /* pick snext from ALL valid
>> cpus */
>> -
>> -    runq_tickle(ops, snext);
>> +    runq_tickle(ops, svc);
>>
> And this is another thing I especially like of this patch: it makes the
> wakeup path a lot simpler and a lot more similar to how it looks like
> in the other schedulers.
>
> Good job with this. :-)
>
>> @@ -1108,15 +1078,8 @@ rt_context_saved(const struct scheduler *ops,
>> struct vcpu *vc)
>>       if ( test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) &&
>>            likely(vcpu_runnable(vc)) )
>>       {
>> +        /* only insert the pre-empted vcpu back*/
>>           __runq_insert(ops, svc);
>> -        __repl_update(ops, NOW());
>> -
>> -        ASSERT(!list_empty(&prv->sdom));
>> -        sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
>> -        online = cpupool_scheduler_cpumask(sdom->dom->cpupool);
>> -        snext = __runq_pick(ops, online); /* pick snext from ALL
>> cpus */
>> -
>> -        runq_tickle(ops, snext);
>>       }
> Mmm... I'll think about this more and let you know... But out of the
> top of my head, I think the tickling has to stay? You preempted a vcpu
> from the pcpu where it was running, maybe some other pcpu is either
> idle or running a vcpu with a later deadline, and should come and pick
> this one up?
>
gEDF allows this but there is overhead and may not be worth it. I have 
no stats to support this but there are some papers on restricting what 
tasks can migrate. We can discuss more if we need extra logic here.

>> @@ -1167,6 +1130,74 @@ rt_dom_cntl(
>>       return rc;
>>   }
>>
>> +static void repl_handler(void *data){
>> +    unsigned long flags;
>> +    s_time_t now = NOW();
>> +    s_time_t min_repl = LONG_MAX; /* max time used in comparison*/
>> +    struct scheduler *ops = data;
>> +    struct rt_private *prv = rt_priv(ops);
>> +    struct list_head *runq = rt_runq(ops);
>> +    struct list_head *depletedq = rt_depletedq(ops);
>> +    struct list_head *iter;
>> +    struct list_head *tmp;
>> +    struct rt_vcpu *svc = NULL;
>> +
>> +    spin_lock_irqsave(&prv->lock,flags);
>> +
>> +    stop_timer(&repl_timer);
>> +
>> +    list_for_each_safe(iter, tmp, runq)
>> +    {
>>
> So, I'm a bit lost here. Why does the _replenishment_ timer's handler
> scans the runqueue (and, in v3, the running-queue as well)?
>
> I'd expect the replenishment timer to do, ehm, replenishments... And I
> wouldn't expect a vcpu that is either in the runqueue or running to
> need a replenishment (and, in case it would, I don't think we should
> take care of that here).
>
>> +        svc = __q_elem(iter);
>> +        if ( now < svc->cur_deadline )
>> +            break;
>> +        rt_update_deadline(now, svc);
>> +        /* scan the runq to find the min release time
>> +         * this happens when vcpus on runq miss deadline
>> +         */
>>
> This is exactly my point. It looks to me that you're trying to catch
> ready or running vcpus missing their deadline in here, in the
> replenishment timer. I don't think this is appropriate... It makes the
> logic of the timer handler a lot more complicated than it should be.
>
> Oh, and one thing: the use of the term "release time" is IMO a bit
> misleading. Release of what? Typically, the release time of an RT task
> (or job) is when the task (or job) is declared ready to run... But I
> don't think it's used like this in here.
>
> I propose to just get rid of it.
>
The "release time" here means the next time when a deferrable server is 
released and ready to serve. It happens every period. Maybe the term 
"inter-release time" is more appropriate?
>> +        if( min_repl> svc->cur_deadline )
>> +        {
>> +            min_repl = svc->cur_deadline;
>> +        }
>> +        /* reinsert the vcpu if its deadline is updated */
>> +        __q_remove(svc);
>> +        __runq_insert(ops, svc);
>>
> One more proof of what I was trying to say. Is it really this handler's
> job to --basically-- re-sort the runqueue? I don't think so.
>
> What is the specific situation that you are trying to handle like this?
>
Right, if we want to count deadline misses, it could be done when a vcpu 
is picked. However, when selecting the most imminent "inter-release 
time" of all runnable vcpu, the head of the runq could be missing its 
deadline and the cur-deadline could be in the past. How do we handle 
this situation? We still need to scan the runq right?

> In fact, this is also why I'm not convinced of the fact that we need
> the additional queue for running vcpus. Later in the thread, Meng
> says:
>
>   "the current running VCPUs on cores also need to replenish their
>    budgets at the beginning of their next periods."
>
> And he makes the following example:
>
>   "[a backlogged] VCPU has its period equal to its budget. Suppose this
>    VCPU is the only VCPU on a 4 core machine. This VCPU should keep
>    running on one core and never be put back to runq. In the current
>    code, this VCPU won't have its budget replenished."
>
> But I don't think I understand. When a vcpu runs out of budget, either:
>   a. it needs an immediate replenishment
>   b. it needs to go to depletedq, and a replenishment event for it
>      programmed (which may or may not require re-arming the
>      replenishment timer)
>
> Meng's example falls in a., AFAICT, and we can just deal with that when
> we handle the 'budget exhausted' event (in rt_schedule(), in this case,
> I think).
>
> The case you refer to in the comment above ("when vcpus on runq miss
> deadline") can either fall in a. or in b., but in both cases it seems
> to me that you can handle it when it happens, instead than inside this
> timer handling routine.
>
This discussion was before I figured out things about idle_vcpu[] and 
tasklet. A vcpu could be preempted and put back to either runq or 
depletedq if a tasklet is scheduled. It could also end up in a depletedq 
in other situations. I guess Meng's point is this vcpu should be running 
constantly without being taken off if there is no tasklet, in an effort 
to follow EDF.
>> +
>> +    /* if timer was triggered but none of the vcpus
>> +     * need to be refilled, set the timer to be the
>> +     * default period + now
>> +     */
>> +    if(min_repl == LONG_MAX)
>> +    {
>> +        set_timer(&repl_timer, now + RTDS_DEFAULT_PERIOD);
>>
> I agree with Meng's point in this thread: this should not be necessary.
> If it is, it's most likely because of a bug or to something else.
>
> Let's figure out what it is, and fix it properly. (I see that in v3
> this logic is gone, so hopefully you found and fixed the issue
> already.)
>
Yeah. Like I said the timer is originally programmed to fire when the 
first vcpu is inserted but all vcpus are not runnable at the beginning 
of boot process. If the timer is triggered before any vcpu wakes up, 
there is nothing on queue at all. This should be fixed with wake() in V3.

Thanks,
Tianyang Chen
Meng Xu Jan. 25, 2016, 11 p.m. UTC | #8
On Mon, Jan 25, 2016 at 5:04 PM, Tianyang Chen <tiche@seas.upenn.edu> wrote:
> I have removed some of the Ccs so they won't get bothered as we discussed
> previously.
>
> On 1/25/2016 4:00 AM, Dario Faggioli wrote:
>>
>> On Thu, 2015-12-31 at 05:20 -0500, Tianyang Chen wrote:
>>>
>>>
>>> @@ -147,6 +148,16 @@ static unsigned int nr_rt_ops;
>>>    * Global lock is referenced by schedule_data.schedule_lock from all
>>>    * physical cpus. It can be grabbed via vcpu_schedule_lock_irq()
>>>    */
>>> +
>>> +/* dedicated timer for replenishment */
>>> +static struct timer repl_timer;
>>> +
>>
>> So, there's always only one timer... Even if we have multiple cpupool
>> with RTDS as their scheduler, they share the replenishment timer? I
>> think it makes more sense to make this per-scheduler.
>>
> Yeah, I totally ignored the case for cpu-pools. It looks like when a
> cpu-pool is created, it copies the scheduler struct and calls rt_init()
> where a private field is initialized. So I assume the timer should be put
> inside the scheduler private struct? Now that I think about it, the timer is
> hard-coded to run on cpu0. If there're lots of cpu-pools but the
> replenishment can only be done on the same pcpu, would that be a problem?
> Should we keep track of all instances of schedulers (nr_rt_ops counts how
> many) and just put times on different pcpus?
>
>>> +/* controls when to first start the timer*/
>>> +static int timer_started;
>>> +
>>
>> I don't like this, and I don't think we need it. In fact, you removed
>> it yourself from v3, AFAICT.
>>
>>> @@ -635,6 +652,13 @@ rt_vcpu_insert(const struct scheduler *ops,
>>> struct vcpu *vc)
>>>
>>>       /* add rt_vcpu svc to scheduler-specific vcpu list of the dom */
>>>       list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu);
>>> +
>>> +    if(!timer_started)
>>> +    {
>>> +        /* the first vcpu starts the timer for the first time*/
>>> +        timer_started = 1;
>>> +        set_timer(&repl_timer,svc->cur_deadline);
>>> +    }
>>>   }
>>>
>> This also seems to be gone in v3, which is good. In fact, it uses
>> timer_started, which I already said I didn't like.
>>
>> About the actual startup of the timer (no matter whether for first time
>> or not). Here, you were doing it in _vcpu_insert() and not in
>> _vcpu_wake(); in v3 you're doing it in _vcpu_wake() and not in
>> _runq_insert()... Which one is the proper way?
>>
>
> Correct me if I'm wrong, at the beginning of the boot process, all vcpus are
> put to sleep/not_runnable after insertions. Therefore, the timer should
> start when the first vcpu wakes up. I think the wake() in v3 should be
> correct.
>
>
>>> @@ -792,44 +816,6 @@ __runq_pick(const struct scheduler *ops, const
>>> cpumask_t *mask)
>>>   }
>>>
>>>   /*
>>> - * Update vcpu's budget and
>>> - * sort runq by insert the modifed vcpu back to runq
>>> - * lock is grabbed before calling this function
>>> - */
>>> -static void
>>> -__repl_update(const struct scheduler *ops, s_time_t now)
>>> -{
>>>
>> Please, allow me to say that seeing this function going away, fills my
>> heart with pure joy!! :-D
>>
>>> @@ -889,7 +874,7 @@ rt_schedule(const struct scheduler *ops, s_time_t
>>> now, bool_t tasklet_work_sched
>>>           }
>>>       }
>>>
>>> -    ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */
>>> +    ret.time = snext->budget; /* invoke the scheduler next time */
>>>       ret.task = snext->vcpu;
>>>
>> This is ok as it is done in v3 (i.e., snext->budget if !idle, -1 if
>> idle).
>>
>>> @@ -1074,14 +1055,7 @@ rt_vcpu_wake(const struct scheduler *ops,
>>> struct vcpu *vc)
>>>       /* insert svc to runq/depletedq because svc is not in queue now
>>> */
>>>       __runq_insert(ops, svc);
>>>
>>> -    __repl_update(ops, now);
>>> -
>>> -    ASSERT(!list_empty(&prv->sdom));
>>> -    sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
>>> -    online = cpupool_scheduler_cpumask(sdom->dom->cpupool);
>>> -    snext = __runq_pick(ops, online); /* pick snext from ALL valid
>>> cpus */
>>> -
>>> -    runq_tickle(ops, snext);
>>> +    runq_tickle(ops, svc);
>>>
>> And this is another thing I especially like of this patch: it makes the
>> wakeup path a lot simpler and a lot more similar to how it looks like
>> in the other schedulers.
>>
>> Good job with this. :-)
>>
>>> @@ -1108,15 +1078,8 @@ rt_context_saved(const struct scheduler *ops,
>>> struct vcpu *vc)
>>>       if ( test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) &&
>>>            likely(vcpu_runnable(vc)) )
>>>       {
>>> +        /* only insert the pre-empted vcpu back*/
>>>           __runq_insert(ops, svc);
>>> -        __repl_update(ops, NOW());
>>> -
>>> -        ASSERT(!list_empty(&prv->sdom));
>>> -        sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
>>> -        online = cpupool_scheduler_cpumask(sdom->dom->cpupool);
>>> -        snext = __runq_pick(ops, online); /* pick snext from ALL
>>> cpus */
>>> -
>>> -        runq_tickle(ops, snext);
>>>       }
>>
>> Mmm... I'll think about this more and let you know... But out of the
>> top of my head, I think the tickling has to stay? You preempted a vcpu
>> from the pcpu where it was running, maybe some other pcpu is either
>> idle or running a vcpu with a later deadline, and should come and pick
>> this one up?

If that's the case, why should we preempt this VCPU? We should use the
top VCPu in the runq to preempt the lowest priority VCPU, right?

>>
> gEDF allows this but there is overhead and may not be worth it. I have no
> stats to support this but there are some papers on restricting what tasks
> can migrate. We can discuss more if we need extra logic here.

As to gEDF, the scheduling policy does not restrict what tasks can
migrate, except for the VCPU's hard affinity set by users.  So
migrating is an option. but we should avoid the unnecessary
preemption/migration.

>
>
>>> @@ -1167,6 +1130,74 @@ rt_dom_cntl(
>>>       return rc;
>>>   }
>>>
>>> +static void repl_handler(void *data){
>>> +    unsigned long flags;
>>> +    s_time_t now = NOW();
>>> +    s_time_t min_repl = LONG_MAX; /* max time used in comparison*/
>>> +    struct scheduler *ops = data;
>>> +    struct rt_private *prv = rt_priv(ops);
>>> +    struct list_head *runq = rt_runq(ops);
>>> +    struct list_head *depletedq = rt_depletedq(ops);
>>> +    struct list_head *iter;
>>> +    struct list_head *tmp;
>>> +    struct rt_vcpu *svc = NULL;
>>> +
>>> +    spin_lock_irqsave(&prv->lock,flags);
>>> +
>>> +    stop_timer(&repl_timer);
>>> +
>>> +    list_for_each_safe(iter, tmp, runq)
>>> +    {
>>>
>> So, I'm a bit lost here. Why does the _replenishment_ timer's handler
>> scans the runqueue (and, in v3, the running-queue as well)?
>>
>> I'd expect the replenishment timer to do, ehm, replenishments... And I
>> wouldn't expect a vcpu that is either in the runqueue or running to
>> need a replenishment (and, in case it would, I don't think we should
>> take care of that here).
>>
>>> +        svc = __q_elem(iter);
>>> +        if ( now < svc->cur_deadline )
>>> +            break;
>>> +        rt_update_deadline(now, svc);
>>> +        /* scan the runq to find the min release time
>>> +         * this happens when vcpus on runq miss deadline
>>> +         */
>>>
>> This is exactly my point. It looks to me that you're trying to catch
>> ready or running vcpus missing their deadline in here, in the
>> replenishment timer. I don't think this is appropriate... It makes the
>> logic of the timer handler a lot more complicated than it should be.
>>
>> Oh, and one thing: the use of the term "release time" is IMO a bit
>> misleading. Release of what? Typically, the release time of an RT task
>> (or job) is when the task (or job) is declared ready to run... But I
>> don't think it's used like this in here.
>>
>> I propose to just get rid of it.
>>
> The "release time" here means the next time when a deferrable server is
> released and ready to serve. It happens every period. Maybe the term
> "inter-release time" is more appropriate?
>>>
>>> +        if( min_repl> svc->cur_deadline )
>>> +        {
>>> +            min_repl = svc->cur_deadline;
>>> +        }
>>> +        /* reinsert the vcpu if its deadline is updated */
>>> +        __q_remove(svc);
>>> +        __runq_insert(ops, svc);
>>>
>> One more proof of what I was trying to say. Is it really this handler's
>> job to --basically-- re-sort the runqueue? I don't think so.
>>
>> What is the specific situation that you are trying to handle like this?
>>
> Right, if we want to count deadline misses, it could be done when a vcpu is
> picked. However, when selecting the most imminent "inter-release time" of
> all runnable vcpu, the head of the runq could be missing its deadline and
> the cur-deadline could be in the past. How do we handle this situation? We
> still need to scan the runq right?

I think Dario's point is that:
All VCPUs on runq should still have remaining budget, so they should
not have come into the situation of replenishing their budget. So in
the replenishment handler, runq should not be called. I agree the runq
should not be called to replenish the budget. But the top VCPU in the
runq may be the next earliest one that should get its budget
replenished. Probably, we can reset the replenish timer whenever a
VCPU is inserted into the depletedq, because that is the point when a
replenish timer should be armed for the new depleted VCPU and the
replenish time may be changed.

>
>> In fact, this is also why I'm not convinced of the fact that we need
>> the additional queue for running vcpus. Later in the thread, Meng
>> says:
>>
>>   "the current running VCPUs on cores also need to replenish their
>>    budgets at the beginning of their next periods."
>>
>> And he makes the following example:
>>
>>   "[a backlogged] VCPU has its period equal to its budget. Suppose this
>>    VCPU is the only VCPU on a 4 core machine. This VCPU should keep
>>    running on one core and never be put back to runq. In the current
>>    code, this VCPU won't have its budget replenished."
>>
>> But I don't think I understand. When a vcpu runs out of budget, either:
>>   a. it needs an immediate replenishment
>>   b. it needs to go to depletedq, and a replenishment event for it
>>      programmed (which may or may not require re-arming the
>>      replenishment timer)
>>
>> Meng's example falls in a., AFAICT, and we can just deal with that when
>> we handle the 'budget exhausted' event (in rt_schedule(), in this case,
>> I think).
>>
>> The case you refer to in the comment above ("when vcpus on runq miss
>> deadline") can either fall in a. or in b., but in both cases it seems
>> to me that you can handle it when it happens, instead than inside this
>> timer handling routine.
>>
> This discussion was before I figured out things about idle_vcpu[] and
> tasklet. A vcpu could be preempted and put back to either runq or depletedq
> if a tasklet is scheduled. It could also end up in a depletedq in other
> situations. I guess Meng's point is this vcpu should be running constantly
> without being taken off if there is no tasklet, in an effort to follow EDF.
Hi Tianyang,

I think Dario made a good point. In order to avoid vcpu being taken
off from the core, we can handle it in the rt_schedule(), the budget
enforcement function. This could probably make the code cleaner.

>>>
>>> +
>>> +    /* if timer was triggered but none of the vcpus
>>> +     * need to be refilled, set the timer to be the
>>> +     * default period + now
>>> +     */
>>> +    if(min_repl == LONG_MAX)
>>> +    {
>>> +        set_timer(&repl_timer, now + RTDS_DEFAULT_PERIOD);
>>>
>> I agree with Meng's point in this thread: this should not be necessary.
>> If it is, it's most likely because of a bug or to something else.
>>
>> Let's figure out what it is, and fix it properly. (I see that in v3
>> this logic is gone, so hopefully you found and fixed the issue
>> already.)
>>
> Yeah. Like I said the timer is originally programmed to fire when the first
> vcpu is inserted but all vcpus are not runnable at the beginning of boot
> process. If the timer is triggered before any vcpu wakes up, there is
> nothing on queue at all. This should be fixed with wake() in V3.

Great! I'm wondering if we should look at the v3 version, which should
have fixed many issues? We can decide if the runningq is needed in v3.
Dario, What do you think? Right now, I'm kind of lost how we should
proceed next step? Should we modify based on this version or the
latest posted version v3?

Best,

Meng


-----------
Meng Xu
PhD Student in Computer and Information Science
University of Pennsylvania
http://www.cis.upenn.edu/~mengxu/
Dario Faggioli Jan. 26, 2016, 11:52 a.m. UTC | #9
On Mon, 2016-01-25 at 17:04 -0500, Tianyang Chen wrote:
> I have removed some of the Ccs so they won't get bothered as we 
> discussed previously.
> 
Yeah... I said you should have done that in the first place, and then
Cc-ed them myself! Sorry... :-P

> On 1/25/2016 4:00 AM, Dario Faggioli wrote:
> > On Thu, 2015-12-31 at 05:20 -0500, Tianyang Chen wrote:
> > > 
> > So, there's always only one timer... Even if we have multiple
> > cpupool
> > with RTDS as their scheduler, they share the replenishment timer? I
> > think it makes more sense to make this per-scheduler.
> > 
> Yeah, I totally ignored the case for cpu-pools. It looks like when a 
> cpu-pool is created, it copies the scheduler struct and calls
> rt_init() 
> where a private field is initialized. So I assume the timer should
> be 
> put inside the scheduler private struct? 
>
Yes, I think it should be there. We certainly don't want different
cpupools to share the timer.

> Now that I think about it, the 
> timer is hard-coded to run on cpu0.
>
It is. Well, in your patch it is "hard-coded" to cpu0. When considering
cpupools, you just hard-code it to one cpu of the pool.

In fact, the fact that the timer is sort-of pinned to a pcpu is a
(potential) issue (overhead on that pcpu, what happens if that pcpu
goes offline), but let's deal with all this later. For now, make the
code cpupools-safe.

>  If there're lots of cpu-pools but 
> the replenishment can only be done on the same pcpu, would that be a 
> problem? Should we keep track of all instances of schedulers
> (nr_rt_ops 
> counts how many) and just put times on different pcpus?
>
One timer per cpupool is what we want, at least for now.

> > About the actual startup of the timer (no matter whether for first
> > time
> > or not). Here, you were doing it in _vcpu_insert() and not in
> > _vcpu_wake(); in v3 you're doing it in _vcpu_wake() and not in
> > _runq_insert()... Which one is the proper way?
> > 
> 
> Correct me if I'm wrong, at the beginning of the boot process, all
> vcpus 
> are put to sleep/not_runnable after insertions. Therefore, the timer 
> should start when the first vcpu wakes up. I think the wake() in v3 
> should be correct.
> 
Check when the insert_vcpu is called in schedule.c (hint, this also has
to do with cpupools()). I think that starting it in wake() is ok, but,
really, do double check (and, once you're ready for that, test things
by creating multiple pools and moving domains around between them).

> > Mmm... I'll think about this more and let you know... But out of
> > the
> > top of my head, I think the tickling has to stay? You preempted a
> > vcpu
> > from the pcpu where it was running, maybe some other pcpu is either
> > idle or running a vcpu with a later deadline, and should come and
> > pick
> > this one up?
> > 
> gEDF allows this but there is overhead and may not be worth it. I
> have 
> no stats to support this but there are some papers on restricting
> what 
> tasks can migrate. We can discuss more if we need extra logic here.
> 
Ok (more on this in the reply to Meng's email).

> > Oh, and one thing: the use of the term "release time" is IMO a bit
> > misleading. Release of what? Typically, the release time of an RT
> > task
> > (or job) is when the task (or job) is declared ready to run... But
> > I
> > don't think it's used like this in here.
> > 
> > I propose to just get rid of it.
> > 
> The "release time" here means the next time when a deferrable server
> is 
> released and ready to serve. It happens every period. Maybe the term 
> "inter-release time" is more appropriate?
>
Perhaps, but I think this part of the DS algorithm can be implemented
in a smart enough way to avoid having to deal with this "explicitly"
(and in particular, having to scan the running or ready queues during
replenishment).

> > > +        if( min_repl> svc->cur_deadline )
> > > +        {
> > > +            min_repl = svc->cur_deadline;
> > > +        }
> > > +        /* reinsert the vcpu if its deadline is updated */
> > > +        __q_remove(svc);
> > > +        __runq_insert(ops, svc);
> > > 
> > One more proof of what I was trying to say. Is it really this
> > handler's
> > job to --basically-- re-sort the runqueue? I don't think so.
> > 
> > What is the specific situation that you are trying to handle like
> > this?
> > 
> Right, if we want to count deadline misses, it could be done when a
> vcpu 
> is picked. However, when selecting the most imminent "inter-release 
> time" of all runnable vcpu, the head of the runq could be missing
> its 
> deadline and the cur-deadline could be in the past. How do we handle 
> this situation? We still need to scan the runq right?
> 
I'll do my best to avoid that we'll end up scanning the runqueue in the
replenishment timer handler, and in fact I still don't think this is
going to be necessary.

Let's discuss more about this specific point when replying to Meng's
email.

> > But I don't think I understand. When a vcpu runs out of budget,
> > either:
> >   a. it needs an immediate replenishment
> >   b. it needs to go to depletedq, and a replenishment event for it
> >      programmed (which may or may not require re-arming the
> >      replenishment timer)
> > 
> > Meng's example falls in a., AFAICT, and we can just deal with that
> > when
> > we handle the 'budget exhausted' event (in rt_schedule(), in this
> > case,
> > I think).
> > 
> > The case you refer to in the comment above ("when vcpus on runq
> > miss
> > deadline") can either fall in a. or in b., but in both cases it
> > seems
> > to me that you can handle it when it happens, instead than inside
> > this
> > timer handling routine.
> > 
> This discussion was before I figured out things about idle_vcpu[]
> and 
> tasklet. A vcpu could be preempted and put back to either runq or 
> depletedq if a tasklet is scheduled. It could also end up in a
> depletedq 
> in other situations. I guess Meng's point is this vcpu should be
> running 
> constantly without being taken off if there is no tasklet, in an
> effort 
> to follow EDF.
>
So, whatever it is the reason that triggered the call to schedule() --
either a tasklet or anything-- a vcpu that has exhausted its budget
should either:
 - be replenished immediately, and hence continue to run (as soon as 
   possible), if it has a replenishment pending that has not been 
   performed yet.
 - go to the depleted queue and wait for one.

Isn't this the case?

Regards,
Dario
Dario Faggioli Jan. 26, 2016, 2:06 p.m. UTC | #10
On Mon, 2016-01-25 at 18:00 -0500, Meng Xu wrote:
> On Mon, Jan 25, 2016 at 5:04 PM, Tianyang Chen <tiche@seas.upenn.edu>
> wrote:
> > I have removed some of the Ccs so they won't get bothered as we
> > discussed
> > previously.
> > 
> > On 1/25/2016 4:00 AM, Dario Faggioli wrote:
> > > 
> > > On Thu, 2015-12-31 at 05:20 -0500, Tianyang Chen wrote:
> > > > 
> > > > 
> > > > @@ -147,6 +148,16 @@ static unsigned int nr_rt_ops;
> > > >    * Global lock is referenced by schedule_data.schedule_lock
> > > > from all
> > > >    * physical cpus. It can be grabbed via
> > > > vcpu_schedule_lock_irq()
> > > >    */
> > > > +
> > > > +/* dedicated timer for replenishment */
> > > > +static struct timer repl_timer;
> > > > +
> > > 
> > > So, there's always only one timer... Even if we have multiple
> > > cpupool
> > > with RTDS as their scheduler, they share the replenishment timer?
> > > I
> > > think it makes more sense to make this per-scheduler.
> > > 
> > Yeah, I totally ignored the case for cpu-pools. It looks like when
> > a
> > cpu-pool is created, it copies the scheduler struct and calls
> > rt_init()
> > where a private field is initialized. So I assume the timer should
> > be put
> > inside the scheduler private struct? Now that I think about it, the
> > timer is
> > hard-coded to run on cpu0. If there're lots of cpu-pools but the
> > replenishment can only be done on the same pcpu, would that be a
> > problem?
> > Should we keep track of all instances of schedulers (nr_rt_ops
> > counts how
> > many) and just put times on different pcpus?
> > 
> > > > +/* controls when to first start the timer*/
> > > > +static int timer_started;
> > > > +
> > > 
> > > I don't like this, and I don't think we need it. In fact, you
> > > removed
> > > it yourself from v3, AFAICT.
> > > 
> > > > @@ -635,6 +652,13 @@ rt_vcpu_insert(const struct scheduler
> > > > *ops,
> > > > struct vcpu *vc)
> > > > 
> > > >       /* add rt_vcpu svc to scheduler-specific vcpu list of the
> > > > dom */
> > > >       list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu);
> > > > +
> > > > +    if(!timer_started)
> > > > +    {
> > > > +        /* the first vcpu starts the timer for the first
> > > > time*/
> > > > +        timer_started = 1;
> > > > +        set_timer(&repl_timer,svc->cur_deadline);
> > > > +    }
> > > >   }
> > > > 
> > > This also seems to be gone in v3, which is good. In fact, it uses
> > > timer_started, which I already said I didn't like.
> > > 
> > > About the actual startup of the timer (no matter whether for
> > > first time
> > > or not). Here, you were doing it in _vcpu_insert() and not in
> > > _vcpu_wake(); in v3 you're doing it in _vcpu_wake() and not in
> > > _runq_insert()... Which one is the proper way?
> > > 
> > 
> > Correct me if I'm wrong, at the beginning of the boot process, all
> > vcpus are
> > put to sleep/not_runnable after insertions. Therefore, the timer
> > should
> > start when the first vcpu wakes up. I think the wake() in v3 should
> > be
> > correct.
> > 
> > 
> > > > @@ -792,44 +816,6 @@ __runq_pick(const struct scheduler *ops,
> > > > const
> > > > cpumask_t *mask)
> > > >   }
> > > > 
> > > >   /*
> > > > - * Update vcpu's budget and
> > > > - * sort runq by insert the modifed vcpu back to runq
> > > > - * lock is grabbed before calling this function
> > > > - */
> > > > -static void
> > > > -__repl_update(const struct scheduler *ops, s_time_t now)
> > > > -{
> > > > 
> > > Please, allow me to say that seeing this function going away,
> > > fills my
> > > heart with pure joy!! :-D
> > > 
> > > > @@ -889,7 +874,7 @@ rt_schedule(const struct scheduler *ops,
> > > > s_time_t
> > > > now, bool_t tasklet_work_sched
> > > >           }
> > > >       }
> > > > 
> > > > -    ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched
> > > > quantum */
> > > > +    ret.time = snext->budget; /* invoke the scheduler next
> > > > time */
> > > >       ret.task = snext->vcpu;
> > > > 
> > > This is ok as it is done in v3 (i.e., snext->budget if !idle, -1
> > > if
> > > idle).
> > > 
> > > > @@ -1074,14 +1055,7 @@ rt_vcpu_wake(const struct scheduler
> > > > *ops,
> > > > struct vcpu *vc)
> > > >       /* insert svc to runq/depletedq because svc is not in
> > > > queue now
> > > > */
> > > >       __runq_insert(ops, svc);
> > > > 
> > > > -    __repl_update(ops, now);
> > > > -
> > > > -    ASSERT(!list_empty(&prv->sdom));
> > > > -    sdom = list_entry(prv->sdom.next, struct rt_dom,
> > > > sdom_elem);
> > > > -    online = cpupool_scheduler_cpumask(sdom->dom->cpupool);
> > > > -    snext = __runq_pick(ops, online); /* pick snext from ALL
> > > > valid
> > > > cpus */
> > > > -
> > > > -    runq_tickle(ops, snext);
> > > > +    runq_tickle(ops, svc);
> > > > 
> > > And this is another thing I especially like of this patch: it
> > > makes the
> > > wakeup path a lot simpler and a lot more similar to how it looks
> > > like
> > > in the other schedulers.
> > > 
> > > Good job with this. :-)
> > > 
> > > > @@ -1108,15 +1078,8 @@ rt_context_saved(const struct scheduler
> > > > *ops,
> > > > struct vcpu *vc)
> > > >       if ( test_and_clear_bit(__RTDS_delayed_runq_add, &svc-
> > > > >flags) &&
> > > >            likely(vcpu_runnable(vc)) )
> > > >       {
> > > > +        /* only insert the pre-empted vcpu back*/
> > > >           __runq_insert(ops, svc);
> > > > -        __repl_update(ops, NOW());
> > > > -
> > > > -        ASSERT(!list_empty(&prv->sdom));
> > > > -        sdom = list_entry(prv->sdom.next, struct rt_dom,
> > > > sdom_elem);
> > > > -        online = cpupool_scheduler_cpumask(sdom->dom-
> > > > >cpupool);
> > > > -        snext = __runq_pick(ops, online); /* pick snext from
> > > > ALL
> > > > cpus */
> > > > -
> > > > -        runq_tickle(ops, snext);
> > > >       }
> > > 
> > > Mmm... I'll think about this more and let you know... But out of
> > > the
> > > top of my head, I think the tickling has to stay? You preempted a
> > > vcpu
> > > from the pcpu where it was running, maybe some other pcpu is
> > > either
> > > idle or running a vcpu with a later deadline, and should come and
> > > pick
> > > this one up?
> 
> If that's the case, why should we preempt this VCPU? We should use
> the
> top VCPu in the runq to preempt the lowest priority VCPU, right?
> 
Yeah, in theory, and as far as things goes by "just" looking at
runq_tickle() in sched_rt.c, you're right.

Tickling is (like everything in life :-P) not perfect, though. At least
it's not "instantaneously" that a tickled pcpu come and pick up work...
Something may happen that perturbates the scenario one depicted in his
head when thinking at what will happen after tickling (e.g., the pcpu
being tickled may be doing something which can't be interrupted for a
while).

The scheduler should be robust, and don't explode or don't exhibit
wrong behavior in case, and I think the current code is ok in that
sense.

My idea was that adding one more "tickling point" would make it more
robust, exactly in that regard, i.e., in tolerating anomalies due to
tickling resulting in the pcpu that picked up the work was a different
one from what we expected.

But this indeed introduces more overhead... So, I agree, let's not do
that and see if we encounter problems. We'll come back here if we do.
:-)

> > gEDF allows this but there is overhead and may not be worth it. I
> > have no
> > stats to support this but there are some papers on restricting what
> > tasks
> > can migrate. We can discuss more if we need extra logic here.
> 
> As to gEDF, the scheduling policy does not restrict what tasks can
> migrate, except for the VCPU's hard affinity set by users.  So
> migrating is an option. but we should avoid the unnecessary
> preemption/migration.
> 
Agreed.

> > > > +        if( min_repl> svc->cur_deadline )
> > > > +        {
> > > > +            min_repl = svc->cur_deadline;
> > > > +        }
> > > > +        /* reinsert the vcpu if its deadline is updated */
> > > > +        __q_remove(svc);
> > > > +        __runq_insert(ops, svc);
> > > > 
> > > One more proof of what I was trying to say. Is it really this
> > > handler's
> > > job to --basically-- re-sort the runqueue? I don't think so.
> > > 
> > > What is the specific situation that you are trying to handle like
> > > this?
> > > 
> > Right, if we want to count deadline misses, it could be done when a
> > vcpu is
> > picked. However, when selecting the most imminent "inter-release
> > time" of
> > all runnable vcpu, the head of the runq could be missing its
> > deadline and
> > the cur-deadline could be in the past. How do we handle this
> > situation? We
> > still need to scan the runq right?
> 
> I think Dario's point is that:
> All VCPUs on runq should still have remaining budget, so they should
> not have come into the situation of replenishing their budget. So in
> the replenishment handler, runq should not be called. 
>
Exactly.

> I agree the runq
> should not be called to replenish the budget. But the top VCPU in the
> runq may be the next earliest one that should get its budget
> replenished. 
>
With "the top VCPU in the runq" you mean the one that is running on a
pCPU? No, I don't think you refer to that one since, as you said,
running vCPUs are not in the runqueues.

And then why it is only the first vCPU in the runqueue  you seem to
care about. I appreciate it has the shortest deadline, but I can't tell
if that's enough to assume we only need to care about it. I probably do
not recall with 100% accuracy the details of the DS algorithm that you
want to implement... In particular, whether or not a replenishment need
to be programmed when the task/vcpu becomes running, so do feel free to
summarize it here for me. :-)

In any case, whatever the algorithm says, what I'm proposing is
something completely different and general enough that would,
hopefully, make it easier to reason on things.

Can we have, together with the timer, a list of replenishment events?
I'm talking about an actual list of entries, where each entry contains
the following information:
 - time at which the replenishment should happen
 - amount of replenishment
 - vcpu to be replenished

The list will be kept in order of replenishment time, and the timer
will always be programmed to fire at the most imminent replenishment
time instant (which will correspond to the first entry in the list).

When the timer fires, it picks up the first entry, takes it out of the
list, does the replenishment and takes care of the effects of the
replenishment itself (deadline update, preemption, runq re-insertion,
moving from depletedq to runq), depending on what the status of the
vcpu being replenished was.

After doing all this, the timer reprograms itself to fire at the
replenishment time of the next (which just became the new first) entry
in the list.

At least, this is the idea. Now, for the implementation:

 1. instead of only checking the first entry, it's wise that the timer
    handler start to walk the list, and, considering the current time
    (what NOW() gives you) takes care of all entries whose
    replenishment time has passed. This is to deal with cases where 
    the handler may fire a little bit later than expected, and more 
    than just one replenishment event should have occurred already. 
    Since the list is in order, it is ok to stop as soon as the first
    entry with a replenishment time which is actually in the future is
    found;

 2. embedded lists give us the opportunity to place one data structure 
    in more than one list/queue. So, instead of actually dynamically 
    allocating and using a dedicated data structure for each 
    replenishment event, I think we can:
     - just add another embedded list element in rt_vcpu (just like 
       q_elem),
     - use that to queue the rt_vcpu-sin the replenishment events list
     - add the fields necessary to handle replenishment directly in 
       rt_vcpu (assuming we need any... If next replenishment time is 
       the next absolute deadline and amount is the budget, everything
       we need should already be there)

So.. What do you think?

> > This discussion was before I figured out things about idle_vcpu[]
> > and
> > tasklet. A vcpu could be preempted and put back to either runq or
> > depletedq
> > if a tasklet is scheduled. It could also end up in a depletedq in
> > other
> > situations. I guess Meng's point is this vcpu should be running
> > constantly
> > without being taken off if there is no tasklet, in an effort to
> > follow EDF.
> Hi Tianyang,
> 
> I think Dario made a good point. In order to avoid vcpu being taken
> off from the core, we can handle it in the rt_schedule(), the budget
> enforcement function. This could probably make the code cleaner.
> 
Exactly. Budget enforcement is perfectly fine being done in
rt_schedule().

> > > > 
> > > > +
> > > > +    /* if timer was triggered but none of the vcpus
> > > > +     * need to be refilled, set the timer to be the
> > > > +     * default period + now
> > > > +     */
> > > > +    if(min_repl == LONG_MAX)
> > > > +    {
> > > > +        set_timer(&repl_timer, now + RTDS_DEFAULT_PERIOD);
> > > > 
> > > I agree with Meng's point in this thread: this should not be
> > > necessary.
> > > If it is, it's most likely because of a bug or to something else.
> > > 
> > > Let's figure out what it is, and fix it properly. (I see that in
> > > v3
> > > this logic is gone, so hopefully you found and fixed the issue
> > > already.)
> > > 
> > Yeah. Like I said the timer is originally programmed to fire when
> > the first
> > vcpu is inserted but all vcpus are not runnable at the beginning of
> > boot
> > process. If the timer is triggered before any vcpu wakes up, there
> > is
> > nothing on queue at all. This should be fixed with wake() in V3.
> 
> Great! I'm wondering if we should look at the v3 version, which
> should
> have fixed many issues? We can decide if the runningq is needed in
> v3.
>
The fact that v3 added runningq is the reason why I'm commenting on
this version: I don't think it's necessary at all, and it was easier to
focus on other issues with that out of the way.

What I've seen in v3 seems ok to me. I can take another look, but I
guess the best thing to do would be to put together a v4 with the fixes
to the issue v2 had taken from v3, the runningq taken away, and the
replenished queue implemented as I described (if you agree with it).

> Dario, What do you think? Right now, I'm kind of lost how we should
> proceed next step? Should we modify based on this version or the
> latest posted version v3?
> 
As you wish, but I'd base the new version on this version, and
"backport" good stuff from v3 to here (again, as it doesn't have the
runningq in the way)

Regards,
Dario
Meng Xu Jan. 26, 2016, 5:28 p.m. UTC | #11
Hi Dario and Tianyang,

On Tue, Jan 26, 2016 at 9:06 AM, Dario Faggioli
<dario.faggioli@citrix.com> wrote:
> On Mon, 2016-01-25 at 18:00 -0500, Meng Xu wrote:
>> On Mon, Jan 25, 2016 at 5:04 PM, Tianyang Chen <tiche@seas.upenn.edu>
>> wrote:
>> > I have removed some of the Ccs so they won't get bothered as we
>> > discussed
>> > previously.
>> >
>> > On 1/25/2016 4:00 AM, Dario Faggioli wrote:
>> > >
>> > > On Thu, 2015-12-31 at 05:20 -0500, Tianyang Chen wrote:
>> > > >
>> > > >
>> > > > @@ -147,6 +148,16 @@ static unsigned int nr_rt_ops;
>> > > >    * Global lock is referenced by schedule_data.schedule_lock
>> > > > from all
>> > > >    * physical cpus. It can be grabbed via
>> > > > vcpu_schedule_lock_irq()
>> > > >    */
>> > > > +
>> > > > +/* dedicated timer for replenishment */
>> > > > +static struct timer repl_timer;
>> > > > +
>> > >
>> > > So, there's always only one timer... Even if we have multiple
>> > > cpupool
>> > > with RTDS as their scheduler, they share the replenishment timer?
>> > > I
>> > > think it makes more sense to make this per-scheduler.
>> > >
>> > Yeah, I totally ignored the case for cpu-pools. It looks like when
>> > a
>> > cpu-pool is created, it copies the scheduler struct and calls
>> > rt_init()
>> > where a private field is initialized. So I assume the timer should
>> > be put
>> > inside the scheduler private struct? Now that I think about it, the
>> > timer is
>> > hard-coded to run on cpu0. If there're lots of cpu-pools but the
>> > replenishment can only be done on the same pcpu, would that be a
>> > problem?
>> > Should we keep track of all instances of schedulers (nr_rt_ops
>> > counts how
>> > many) and just put times on different pcpus?
>> >
>> > > > +/* controls when to first start the timer*/
>> > > > +static int timer_started;
>> > > > +
>> > >
>> > > I don't like this, and I don't think we need it. In fact, you
>> > > removed
>> > > it yourself from v3, AFAICT.
>> > >
>> > > > @@ -635,6 +652,13 @@ rt_vcpu_insert(const struct scheduler
>> > > > *ops,
>> > > > struct vcpu *vc)
>> > > >
>> > > >       /* add rt_vcpu svc to scheduler-specific vcpu list of the
>> > > > dom */
>> > > >       list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu);
>> > > > +
>> > > > +    if(!timer_started)
>> > > > +    {
>> > > > +        /* the first vcpu starts the timer for the first
>> > > > time*/
>> > > > +        timer_started = 1;
>> > > > +        set_timer(&repl_timer,svc->cur_deadline);
>> > > > +    }
>> > > >   }
>> > > >
>> > > This also seems to be gone in v3, which is good. In fact, it uses
>> > > timer_started, which I already said I didn't like.
>> > >
>> > > About the actual startup of the timer (no matter whether for
>> > > first time
>> > > or not). Here, you were doing it in _vcpu_insert() and not in
>> > > _vcpu_wake(); in v3 you're doing it in _vcpu_wake() and not in
>> > > _runq_insert()... Which one is the proper way?
>> > >
>> >
>> > Correct me if I'm wrong, at the beginning of the boot process, all
>> > vcpus are
>> > put to sleep/not_runnable after insertions. Therefore, the timer
>> > should
>> > start when the first vcpu wakes up. I think the wake() in v3 should
>> > be
>> > correct.
>> >
>> >
>> > > > @@ -792,44 +816,6 @@ __runq_pick(const struct scheduler *ops,
>> > > > const
>> > > > cpumask_t *mask)
>> > > >   }
>> > > >
>> > > >   /*
>> > > > - * Update vcpu's budget and
>> > > > - * sort runq by insert the modifed vcpu back to runq
>> > > > - * lock is grabbed before calling this function
>> > > > - */
>> > > > -static void
>> > > > -__repl_update(const struct scheduler *ops, s_time_t now)
>> > > > -{
>> > > >
>> > > Please, allow me to say that seeing this function going away,
>> > > fills my
>> > > heart with pure joy!! :-D
>> > >
>> > > > @@ -889,7 +874,7 @@ rt_schedule(const struct scheduler *ops,
>> > > > s_time_t
>> > > > now, bool_t tasklet_work_sched
>> > > >           }
>> > > >       }
>> > > >
>> > > > -    ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched
>> > > > quantum */
>> > > > +    ret.time = snext->budget; /* invoke the scheduler next
>> > > > time */
>> > > >       ret.task = snext->vcpu;
>> > > >
>> > > This is ok as it is done in v3 (i.e., snext->budget if !idle, -1
>> > > if
>> > > idle).
>> > >
>> > > > @@ -1074,14 +1055,7 @@ rt_vcpu_wake(const struct scheduler
>> > > > *ops,
>> > > > struct vcpu *vc)
>> > > >       /* insert svc to runq/depletedq because svc is not in
>> > > > queue now
>> > > > */
>> > > >       __runq_insert(ops, svc);
>> > > >
>> > > > -    __repl_update(ops, now);
>> > > > -
>> > > > -    ASSERT(!list_empty(&prv->sdom));
>> > > > -    sdom = list_entry(prv->sdom.next, struct rt_dom,
>> > > > sdom_elem);
>> > > > -    online = cpupool_scheduler_cpumask(sdom->dom->cpupool);
>> > > > -    snext = __runq_pick(ops, online); /* pick snext from ALL
>> > > > valid
>> > > > cpus */
>> > > > -
>> > > > -    runq_tickle(ops, snext);
>> > > > +    runq_tickle(ops, svc);
>> > > >
>> > > And this is another thing I especially like of this patch: it
>> > > makes the
>> > > wakeup path a lot simpler and a lot more similar to how it looks
>> > > like
>> > > in the other schedulers.
>> > >
>> > > Good job with this. :-)
>> > >
>> > > > @@ -1108,15 +1078,8 @@ rt_context_saved(const struct scheduler
>> > > > *ops,
>> > > > struct vcpu *vc)
>> > > >       if ( test_and_clear_bit(__RTDS_delayed_runq_add, &svc-
>> > > > >flags) &&
>> > > >            likely(vcpu_runnable(vc)) )
>> > > >       {
>> > > > +        /* only insert the pre-empted vcpu back*/
>> > > >           __runq_insert(ops, svc);
>> > > > -        __repl_update(ops, NOW());
>> > > > -
>> > > > -        ASSERT(!list_empty(&prv->sdom));
>> > > > -        sdom = list_entry(prv->sdom.next, struct rt_dom,
>> > > > sdom_elem);
>> > > > -        online = cpupool_scheduler_cpumask(sdom->dom-
>> > > > >cpupool);
>> > > > -        snext = __runq_pick(ops, online); /* pick snext from
>> > > > ALL
>> > > > cpus */
>> > > > -
>> > > > -        runq_tickle(ops, snext);
>> > > >       }
>> > >
>> > > Mmm... I'll think about this more and let you know... But out of
>> > > the
>> > > top of my head, I think the tickling has to stay? You preempted a
>> > > vcpu
>> > > from the pcpu where it was running, maybe some other pcpu is
>> > > either
>> > > idle or running a vcpu with a later deadline, and should come and
>> > > pick
>> > > this one up?
>>
>> If that's the case, why should we preempt this VCPU? We should use
>> the
>> top VCPu in the runq to preempt the lowest priority VCPU, right?
>>
> Yeah, in theory, and as far as things goes by "just" looking at
> runq_tickle() in sched_rt.c, you're right.
>
> Tickling is (like everything in life :-P) not perfect, though. At least
> it's not "instantaneously" that a tickled pcpu come and pick up work...
> Something may happen that perturbates the scenario one depicted in his
> head when thinking at what will happen after tickling (e.g., the pcpu
> being tickled may be doing something which can't be interrupted for a
> while).
>
> The scheduler should be robust, and don't explode or don't exhibit
> wrong behavior in case, and I think the current code is ok in that
> sense.
>
> My idea was that adding one more "tickling point" would make it more
> robust, exactly in that regard, i.e., in tolerating anomalies due to
> tickling resulting in the pcpu that picked up the work was a different
> one from what we expected.
>
> But this indeed introduces more overhead... So, I agree, let's not do
> that and see if we encounter problems. We'll come back here if we do.
> :-)

I see the point. OK. We can do some test to see when the extra
tickling point should be used by adding some temporary warning print
in the code and see if it's called and if we can avoid it. If it's not
called too frequently, it may be ok, (which I'm not so sure), that we
use extra tickle for it. (Maybe have some fast path in the tickle for
the common cases.)

>
>> > gEDF allows this but there is overhead and may not be worth it. I
>> > have no
>> > stats to support this but there are some papers on restricting what
>> > tasks
>> > can migrate. We can discuss more if we need extra logic here.
>>
>> As to gEDF, the scheduling policy does not restrict what tasks can
>> migrate, except for the VCPU's hard affinity set by users.  So
>> migrating is an option. but we should avoid the unnecessary
>> preemption/migration.
>>
> Agreed.
>
>> > > > +        if( min_repl> svc->cur_deadline )
>> > > > +        {
>> > > > +            min_repl = svc->cur_deadline;
>> > > > +        }
>> > > > +        /* reinsert the vcpu if its deadline is updated */
>> > > > +        __q_remove(svc);
>> > > > +        __runq_insert(ops, svc);
>> > > >
>> > > One more proof of what I was trying to say. Is it really this
>> > > handler's
>> > > job to --basically-- re-sort the runqueue? I don't think so.
>> > >
>> > > What is the specific situation that you are trying to handle like
>> > > this?
>> > >
>> > Right, if we want to count deadline misses, it could be done when a
>> > vcpu is
>> > picked. However, when selecting the most imminent "inter-release
>> > time" of
>> > all runnable vcpu, the head of the runq could be missing its
>> > deadline and
>> > the cur-deadline could be in the past. How do we handle this
>> > situation? We
>> > still need to scan the runq right?
>>
>> I think Dario's point is that:
>> All VCPUs on runq should still have remaining budget, so they should
>> not have come into the situation of replenishing their budget. So in
>> the replenishment handler, runq should not be called.
>>
> Exactly.
>
>> I agree the runq
>> should not be called to replenish the budget. But the top VCPU in the
>> runq may be the next earliest one that should get its budget
>> replenished.
>>
> With "the top VCPU in the runq" you mean the one that is running on a
> pCPU? No, I don't think you refer to that one since, as you said,
> running vCPUs are not in the runqueues.
>
> And then why it is only the first vCPU in the runqueue  you seem to
> care about. I appreciate it has the shortest deadline, but I can't tell
> if that's enough to assume we only need to care about it. I probably do
> not recall with 100% accuracy the details of the DS algorithm that you
> want to implement... In particular, whether or not a replenishment need
> to be programmed when the task/vcpu becomes running, so do feel free to
> summarize it here for me. :-)

Ah, you are right. I forgot that a VCPU with a large deadlien may have
an earliest replenish time in the future. My fault. :-( The replenish
time of the vcpu should be decided once the VCPU got running or once
the VCPU is waked up or once the VCPU got its current budget in the
current period. So the top VCPU in the runq actually should have its
replenish time decided when/before it's added into the runq, since it
has  to have some budget to stay in the runq.

I think you are correct and I like the design you describe later that
uses a separate "queue" to record the replenish time.

>
> In any case, whatever the algorithm says, what I'm proposing is
> something completely different and general enough that would,
> hopefully, make it easier to reason on things.
>
> Can we have, together with the timer, a list of replenishment events?

Sure! this is a good idea and it will be easier for us to include the
RM scheduling later, if needed. Reusing the runq and depeletedq will
make the RM scheduling policy cause many changes to the RTDS
scheduling framework.

> I'm talking about an actual list of entries, where each entry contains
> the following information:
>  - time at which the replenishment should happen
>  - amount of replenishment
>  - vcpu to be replenished
>
> The list will be kept in order of replenishment time, and the timer
> will always be programmed to fire at the most imminent replenishment
> time instant (which will correspond to the first entry in the list).
>
> When the timer fires, it picks up the first entry, takes it out of the
> list, does the replenishment and takes care of the effects of the
> replenishment itself (deadline update, preemption, runq re-insertion,
> moving from depletedq to runq), depending on what the status of the
> vcpu being replenished was.
>
> After doing all this, the timer reprograms itself to fire at the
> replenishment time of the next (which just became the new first) entry
> in the list.
>
> At least, this is the idea. Now, for the implementation:
>
>  1. instead of only checking the first entry, it's wise that the timer
>     handler start to walk the list, and, considering the current time
>     (what NOW() gives you) takes care of all entries whose
>     replenishment time has passed. This is to deal with cases where
>     the handler may fire a little bit later than expected, and more
>     than just one replenishment event should have occurred already.
>     Since the list is in order, it is ok to stop as soon as the first
>     entry with a replenishment time which is actually in the future is
>     found;
>
>  2. embedded lists give us the opportunity to place one data structure
>     in more than one list/queue. So, instead of actually dynamically
>     allocating and using a dedicated data structure for each
>     replenishment event, I think we can:
>      - just add another embedded list element in rt_vcpu (just like
>        q_elem),
>      - use that to queue the rt_vcpu-sin the replenishment events list
>      - add the fields necessary to handle replenishment directly in
>        rt_vcpu (assuming we need any... If next replenishment time is
>        the next absolute deadline and amount is the budget, everything
>        we need should already be there)
>
> So.. What do you think?

It's nice. Thank you so much for drafting this. :-)
What do you think, Tianyang?

>
>> > This discussion was before I figured out things about idle_vcpu[]
>> > and
>> > tasklet. A vcpu could be preempted and put back to either runq or
>> > depletedq
>> > if a tasklet is scheduled. It could also end up in a depletedq in
>> > other
>> > situations. I guess Meng's point is this vcpu should be running
>> > constantly
>> > without being taken off if there is no tasklet, in an effort to
>> > follow EDF.
>> Hi Tianyang,
>>
>> I think Dario made a good point. In order to avoid vcpu being taken
>> off from the core, we can handle it in the rt_schedule(), the budget
>> enforcement function. This could probably make the code cleaner.
>>
> Exactly. Budget enforcement is perfectly fine being done in
> rt_schedule().
>
>> > > >
>> > > > +
>> > > > +    /* if timer was triggered but none of the vcpus
>> > > > +     * need to be refilled, set the timer to be the
>> > > > +     * default period + now
>> > > > +     */
>> > > > +    if(min_repl == LONG_MAX)
>> > > > +    {
>> > > > +        set_timer(&repl_timer, now + RTDS_DEFAULT_PERIOD);
>> > > >
>> > > I agree with Meng's point in this thread: this should not be
>> > > necessary.
>> > > If it is, it's most likely because of a bug or to something else.
>> > >
>> > > Let's figure out what it is, and fix it properly. (I see that in
>> > > v3
>> > > this logic is gone, so hopefully you found and fixed the issue
>> > > already.)
>> > >
>> > Yeah. Like I said the timer is originally programmed to fire when
>> > the first
>> > vcpu is inserted but all vcpus are not runnable at the beginning of
>> > boot
>> > process. If the timer is triggered before any vcpu wakes up, there
>> > is
>> > nothing on queue at all. This should be fixed with wake() in V3.
>>
>> Great! I'm wondering if we should look at the v3 version, which
>> should
>> have fixed many issues? We can decide if the runningq is needed in
>> v3.
>>
> The fact that v3 added runningq is the reason why I'm commenting on
> this version: I don't think it's necessary at all, and it was easier to
> focus on other issues with that out of the way.
>
> What I've seen in v3 seems ok to me. I can take another look, but I
> guess the best thing to do would be to put together a v4 with the fixes
> to the issue v2 had taken from v3, the runningq taken away, and the
> replenished queue implemented as I described (if you agree with it).

I see the point. Tianyang, what do you think? Maybe we can just put
together a v4 quickly and let Dario comment on v4?

>
>> Dario, What do you think? Right now, I'm kind of lost how we should
>> proceed next step? Should we modify based on this version or the
>> latest posted version v3?
>>
> As you wish, but I'd base the new version on this version, and
> "backport" good stuff from v3 to here (again, as it doesn't have the
> runningq in the way)

I see. Tianyang, let's do what Dario suggests. Actually, the code
changed from v2 to v3 will be a useful knowledge when we are working
on the v4. What do you think? or we can talk in person today. :-)

Thank you very much for your time on this, Dario!

Best regards,

Meng
Tianyang Chen Jan. 29, 2016, 10:40 p.m. UTC | #12
On 1/26/2016 12:28 PM, Meng Xu wrote:
> Hi Dario and Tianyang,
>
> On Tue, Jan 26, 2016 at 9:06 AM, Dario Faggioli
> <dario.faggioli@citrix.com> wrote:
>> On Mon, 2016-01-25 at 18:00 -0500, Meng Xu wrote:
>>> On Mon, Jan 25, 2016 at 5:04 PM, Tianyang Chen <tiche@seas.upenn.edu>
>>> wrote:
>>>> I have removed some of the Ccs so they won't get bothered as we
>>>> discussed
>>>> previously.
>>>>
>>>> On 1/25/2016 4:00 AM, Dario Faggioli wrote:
>>>>>
>>>>> On Thu, 2015-12-31 at 05:20 -0500, Tianyang Chen wrote:
>>>>>>
>>>>>>
>>>>>> @@ -147,6 +148,16 @@ static unsigned int nr_rt_ops;
>>>>>>     * Global lock is referenced by schedule_data.schedule_lock
>>>>>> from all
>>>>>>     * physical cpus. It can be grabbed via
>>>>>> vcpu_schedule_lock_irq()
>>>>>>     */
>>>>>> +
>>>>>> +/* dedicated timer for replenishment */
>>>>>> +static struct timer repl_timer;
>>>>>> +
>>>>>
>>>>> So, there's always only one timer... Even if we have multiple
>>>>> cpupool
>>>>> with RTDS as their scheduler, they share the replenishment timer?
>>>>> I
>>>>> think it makes more sense to make this per-scheduler.
>>>>>
>>>> Yeah, I totally ignored the case for cpu-pools. It looks like when
>>>> a
>>>> cpu-pool is created, it copies the scheduler struct and calls
>>>> rt_init()
>>>> where a private field is initialized. So I assume the timer should
>>>> be put
>>>> inside the scheduler private struct? Now that I think about it, the
>>>> timer is
>>>> hard-coded to run on cpu0. If there're lots of cpu-pools but the
>>>> replenishment can only be done on the same pcpu, would that be a
>>>> problem?
>>>> Should we keep track of all instances of schedulers (nr_rt_ops
>>>> counts how
>>>> many) and just put times on different pcpus?
>>>>
>>>>>> +/* controls when to first start the timer*/
>>>>>> +static int timer_started;
>>>>>> +
>>>>>
>>>>> I don't like this, and I don't think we need it. In fact, you
>>>>> removed
>>>>> it yourself from v3, AFAICT.
>>>>>
>>>>>> @@ -635,6 +652,13 @@ rt_vcpu_insert(const struct scheduler
>>>>>> *ops,
>>>>>> struct vcpu *vc)
>>>>>>
>>>>>>        /* add rt_vcpu svc to scheduler-specific vcpu list of the
>>>>>> dom */
>>>>>>        list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu);
>>>>>> +
>>>>>> +    if(!timer_started)
>>>>>> +    {
>>>>>> +        /* the first vcpu starts the timer for the first
>>>>>> time*/
>>>>>> +        timer_started = 1;
>>>>>> +        set_timer(&repl_timer,svc->cur_deadline);
>>>>>> +    }
>>>>>>    }
>>>>>>
>>>>> This also seems to be gone in v3, which is good. In fact, it uses
>>>>> timer_started, which I already said I didn't like.
>>>>>
>>>>> About the actual startup of the timer (no matter whether for
>>>>> first time
>>>>> or not). Here, you were doing it in _vcpu_insert() and not in
>>>>> _vcpu_wake(); in v3 you're doing it in _vcpu_wake() and not in
>>>>> _runq_insert()... Which one is the proper way?
>>>>>
>>>>
>>>> Correct me if I'm wrong, at the beginning of the boot process, all
>>>> vcpus are
>>>> put to sleep/not_runnable after insertions. Therefore, the timer
>>>> should
>>>> start when the first vcpu wakes up. I think the wake() in v3 should
>>>> be
>>>> correct.
>>>>
>>>>
>>>>>> @@ -792,44 +816,6 @@ __runq_pick(const struct scheduler *ops,
>>>>>> const
>>>>>> cpumask_t *mask)
>>>>>>    }
>>>>>>
>>>>>>    /*
>>>>>> - * Update vcpu's budget and
>>>>>> - * sort runq by insert the modifed vcpu back to runq
>>>>>> - * lock is grabbed before calling this function
>>>>>> - */
>>>>>> -static void
>>>>>> -__repl_update(const struct scheduler *ops, s_time_t now)
>>>>>> -{
>>>>>>
>>>>> Please, allow me to say that seeing this function going away,
>>>>> fills my
>>>>> heart with pure joy!! :-D
>>>>>
>>>>>> @@ -889,7 +874,7 @@ rt_schedule(const struct scheduler *ops,
>>>>>> s_time_t
>>>>>> now, bool_t tasklet_work_sched
>>>>>>            }
>>>>>>        }
>>>>>>
>>>>>> -    ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched
>>>>>> quantum */
>>>>>> +    ret.time = snext->budget; /* invoke the scheduler next
>>>>>> time */
>>>>>>        ret.task = snext->vcpu;
>>>>>>
>>>>> This is ok as it is done in v3 (i.e., snext->budget if !idle, -1
>>>>> if
>>>>> idle).
>>>>>
>>>>>> @@ -1074,14 +1055,7 @@ rt_vcpu_wake(const struct scheduler
>>>>>> *ops,
>>>>>> struct vcpu *vc)
>>>>>>        /* insert svc to runq/depletedq because svc is not in
>>>>>> queue now
>>>>>> */
>>>>>>        __runq_insert(ops, svc);
>>>>>>
>>>>>> -    __repl_update(ops, now);
>>>>>> -
>>>>>> -    ASSERT(!list_empty(&prv->sdom));
>>>>>> -    sdom = list_entry(prv->sdom.next, struct rt_dom,
>>>>>> sdom_elem);
>>>>>> -    online = cpupool_scheduler_cpumask(sdom->dom->cpupool);
>>>>>> -    snext = __runq_pick(ops, online); /* pick snext from ALL
>>>>>> valid
>>>>>> cpus */
>>>>>> -
>>>>>> -    runq_tickle(ops, snext);
>>>>>> +    runq_tickle(ops, svc);
>>>>>>
>>>>> And this is another thing I especially like of this patch: it
>>>>> makes the
>>>>> wakeup path a lot simpler and a lot more similar to how it looks
>>>>> like
>>>>> in the other schedulers.
>>>>>
>>>>> Good job with this. :-)
>>>>>
>>>>>> @@ -1108,15 +1078,8 @@ rt_context_saved(const struct scheduler
>>>>>> *ops,
>>>>>> struct vcpu *vc)
>>>>>>        if ( test_and_clear_bit(__RTDS_delayed_runq_add, &svc-
>>>>>>> flags) &&
>>>>>>             likely(vcpu_runnable(vc)) )
>>>>>>        {
>>>>>> +        /* only insert the pre-empted vcpu back*/
>>>>>>            __runq_insert(ops, svc);
>>>>>> -        __repl_update(ops, NOW());
>>>>>> -
>>>>>> -        ASSERT(!list_empty(&prv->sdom));
>>>>>> -        sdom = list_entry(prv->sdom.next, struct rt_dom,
>>>>>> sdom_elem);
>>>>>> -        online = cpupool_scheduler_cpumask(sdom->dom-
>>>>>>> cpupool);
>>>>>> -        snext = __runq_pick(ops, online); /* pick snext from
>>>>>> ALL
>>>>>> cpus */
>>>>>> -
>>>>>> -        runq_tickle(ops, snext);
>>>>>>        }
>>>>>
>>>>> Mmm... I'll think about this more and let you know... But out of
>>>>> the
>>>>> top of my head, I think the tickling has to stay? You preempted a
>>>>> vcpu
>>>>> from the pcpu where it was running, maybe some other pcpu is
>>>>> either
>>>>> idle or running a vcpu with a later deadline, and should come and
>>>>> pick
>>>>> this one up?
>>>
>>> If that's the case, why should we preempt this VCPU? We should use
>>> the
>>> top VCPu in the runq to preempt the lowest priority VCPU, right?
>>>
>> Yeah, in theory, and as far as things goes by "just" looking at
>> runq_tickle() in sched_rt.c, you're right.
>>
>> Tickling is (like everything in life :-P) not perfect, though. At least
>> it's not "instantaneously" that a tickled pcpu come and pick up work...
>> Something may happen that perturbates the scenario one depicted in his
>> head when thinking at what will happen after tickling (e.g., the pcpu
>> being tickled may be doing something which can't be interrupted for a
>> while).
>>
>> The scheduler should be robust, and don't explode or don't exhibit
>> wrong behavior in case, and I think the current code is ok in that
>> sense.
>>
>> My idea was that adding one more "tickling point" would make it more
>> robust, exactly in that regard, i.e., in tolerating anomalies due to
>> tickling resulting in the pcpu that picked up the work was a different
>> one from what we expected.
>>
>> But this indeed introduces more overhead... So, I agree, let's not do
>> that and see if we encounter problems. We'll come back here if we do.
>> :-)
>
> I see the point. OK. We can do some test to see when the extra
> tickling point should be used by adding some temporary warning print
> in the code and see if it's called and if we can avoid it. If it's not
> called too frequently, it may be ok, (which I'm not so sure), that we
> use extra tickle for it. (Maybe have some fast path in the tickle for
> the common cases.)
>
>>
>>>> gEDF allows this but there is overhead and may not be worth it. I
>>>> have no
>>>> stats to support this but there are some papers on restricting what
>>>> tasks
>>>> can migrate. We can discuss more if we need extra logic here.
>>>
>>> As to gEDF, the scheduling policy does not restrict what tasks can
>>> migrate, except for the VCPU's hard affinity set by users.  So
>>> migrating is an option. but we should avoid the unnecessary
>>> preemption/migration.
>>>
>> Agreed.
>>
>>>>>> +        if( min_repl> svc->cur_deadline )
>>>>>> +        {
>>>>>> +            min_repl = svc->cur_deadline;
>>>>>> +        }
>>>>>> +        /* reinsert the vcpu if its deadline is updated */
>>>>>> +        __q_remove(svc);
>>>>>> +        __runq_insert(ops, svc);
>>>>>>
>>>>> One more proof of what I was trying to say. Is it really this
>>>>> handler's
>>>>> job to --basically-- re-sort the runqueue? I don't think so.
>>>>>
>>>>> What is the specific situation that you are trying to handle like
>>>>> this?
>>>>>
>>>> Right, if we want to count deadline misses, it could be done when a
>>>> vcpu is
>>>> picked. However, when selecting the most imminent "inter-release
>>>> time" of
>>>> all runnable vcpu, the head of the runq could be missing its
>>>> deadline and
>>>> the cur-deadline could be in the past. How do we handle this
>>>> situation? We
>>>> still need to scan the runq right?
>>>
>>> I think Dario's point is that:
>>> All VCPUs on runq should still have remaining budget, so they should
>>> not have come into the situation of replenishing their budget. So in
>>> the replenishment handler, runq should not be called.
>>>
>> Exactly.
>>
>>> I agree the runq
>>> should not be called to replenish the budget. But the top VCPU in the
>>> runq may be the next earliest one that should get its budget
>>> replenished.
>>>
>> With "the top VCPU in the runq" you mean the one that is running on a
>> pCPU? No, I don't think you refer to that one since, as you said,
>> running vCPUs are not in the runqueues.
>>
>> And then why it is only the first vCPU in the runqueue  you seem to
>> care about. I appreciate it has the shortest deadline, but I can't tell
>> if that's enough to assume we only need to care about it. I probably do
>> not recall with 100% accuracy the details of the DS algorithm that you
>> want to implement... In particular, whether or not a replenishment need
>> to be programmed when the task/vcpu becomes running, so do feel free to
>> summarize it here for me. :-)
>
> Ah, you are right. I forgot that a VCPU with a large deadlien may have
> an earliest replenish time in the future. My fault. :-( The replenish
> time of the vcpu should be decided once the VCPU got running or once
> the VCPU is waked up or once the VCPU got its current budget in the
> current period. So the top VCPU in the runq actually should have its
> replenish time decided when/before it's added into the runq, since it
> has  to have some budget to stay in the runq.
>
> I think you are correct and I like the design you describe later that
> uses a separate "queue" to record the replenish time.
>
>>
>> In any case, whatever the algorithm says, what I'm proposing is
>> something completely different and general enough that would,
>> hopefully, make it easier to reason on things.
>>
>> Can we have, together with the timer, a list of replenishment events?
>
> Sure! this is a good idea and it will be easier for us to include the
> RM scheduling later, if needed. Reusing the runq and depeletedq will
> make the RM scheduling policy cause many changes to the RTDS
> scheduling framework.
>
>> I'm talking about an actual list of entries, where each entry contains
>> the following information:
>>   - time at which the replenishment should happen
>>   - amount of replenishment
>>   - vcpu to be replenished
>>
>> The list will be kept in order of replenishment time, and the timer
>> will always be programmed to fire at the most imminent replenishment
>> time instant (which will correspond to the first entry in the list).
>>
>> When the timer fires, it picks up the first entry, takes it out of the
>> list, does the replenishment and takes care of the effects of the
>> replenishment itself (deadline update, preemption, runq re-insertion,
>> moving from depletedq to runq), depending on what the status of the
>> vcpu being replenished was.
>>
>> After doing all this, the timer reprograms itself to fire at the
>> replenishment time of the next (which just became the new first) entry
>> in the list.
>>
>> At least, this is the idea. Now, for the implementation:
>>
>>   1. instead of only checking the first entry, it's wise that the timer
>>      handler start to walk the list, and, considering the current time
>>      (what NOW() gives you) takes care of all entries whose
>>      replenishment time has passed. This is to deal with cases where
>>      the handler may fire a little bit later than expected, and more
>>      than just one replenishment event should have occurred already.
>>      Since the list is in order, it is ok to stop as soon as the first
>>      entry with a replenishment time which is actually in the future is
>>      found;
>>
>>   2. embedded lists give us the opportunity to place one data structure
>>      in more than one list/queue. So, instead of actually dynamically
>>      allocating and using a dedicated data structure for each
>>      replenishment event, I think we can:
>>       - just add another embedded list element in rt_vcpu (just like
>>         q_elem),
>>       - use that to queue the rt_vcpu-sin the replenishment events list
>>       - add the fields necessary to handle replenishment directly in
>>         rt_vcpu (assuming we need any... If next replenishment time is
>>         the next absolute deadline and amount is the budget, everything
>>         we need should already be there)
>>
>> So.. What do you think?
>
> It's nice. Thank you so much for drafting this. :-)
> What do you think, Tianyang?
>
Dario and Meng,

This indeed makes it very clear and it looks like the timer handler 
doesn't even need to know where a vcpu is at all. Just plain 
replenishment when it needs it. It also gets rid off extra code in 
context_saved().

I'm assuming a vcpu should be added to the replq when it wakes up and 
removed when it's not runnable (checks in timer handler).
>>
>>>> This discussion was before I figured out things about idle_vcpu[]
>>>> and
>>>> tasklet. A vcpu could be preempted and put back to either runq or
>>>> depletedq
>>>> if a tasklet is scheduled. It could also end up in a depletedq in
>>>> other
>>>> situations. I guess Meng's point is this vcpu should be running
>>>> constantly
>>>> without being taken off if there is no tasklet, in an effort to
>>>> follow EDF.
>>> Hi Tianyang,
>>>
>>> I think Dario made a good point. In order to avoid vcpu being taken
>>> off from the core, we can handle it in the rt_schedule(), the budget
>>> enforcement function. This could probably make the code cleaner.
>>>
>> Exactly. Budget enforcement is perfectly fine being done in
>> rt_schedule().
>>
>>>>>>
>>>>>> +
>>>>>> +    /* if timer was triggered but none of the vcpus
>>>>>> +     * need to be refilled, set the timer to be the
>>>>>> +     * default period + now
>>>>>> +     */
>>>>>> +    if(min_repl == LONG_MAX)
>>>>>> +    {
>>>>>> +        set_timer(&repl_timer, now + RTDS_DEFAULT_PERIOD);
>>>>>>
>>>>> I agree with Meng's point in this thread: this should not be
>>>>> necessary.
>>>>> If it is, it's most likely because of a bug or to something else.
>>>>>
>>>>> Let's figure out what it is, and fix it properly. (I see that in
>>>>> v3
>>>>> this logic is gone, so hopefully you found and fixed the issue
>>>>> already.)
>>>>>
>>>> Yeah. Like I said the timer is originally programmed to fire when
>>>> the first
>>>> vcpu is inserted but all vcpus are not runnable at the beginning of
>>>> boot
>>>> process. If the timer is triggered before any vcpu wakes up, there
>>>> is
>>>> nothing on queue at all. This should be fixed with wake() in V3.
>>>
>>> Great! I'm wondering if we should look at the v3 version, which
>>> should
>>> have fixed many issues? We can decide if the runningq is needed in
>>> v3.
>>>
>> The fact that v3 added runningq is the reason why I'm commenting on
>> this version: I don't think it's necessary at all, and it was easier to
>> focus on other issues with that out of the way.
>>
>> What I've seen in v3 seems ok to me. I can take another look, but I
>> guess the best thing to do would be to put together a v4 with the fixes
>> to the issue v2 had taken from v3, the runningq taken away, and the
>> replenished queue implemented as I described (if you agree with it).
>
> I see the point. Tianyang, what do you think? Maybe we can just put
> together a v4 quickly and let Dario comment on v4?
>
>>
>>> Dario, What do you think? Right now, I'm kind of lost how we should
>>> proceed next step? Should we modify based on this version or the
>>> latest posted version v3?
>>>
>> As you wish, but I'd base the new version on this version, and
>> "backport" good stuff from v3 to here (again, as it doesn't have the
>> runningq in the way)
>
> I see. Tianyang, let's do what Dario suggests. Actually, the code
> changed from v2 to v3 will be a useful knowledge when we are working
> on the v4. What do you think? or we can talk in person today. :-)
>
> Thank you very much for your time on this, Dario!
Yeah definitely. I am still trying to figure out why there is an 
assertion failure at free_pdata() when I remove a pcpu from a pool... It 
that takes too long I will just send v4 out for discussion first.

Thanks for your time Dario.

Tianyang
diff mbox

Patch

diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
index 4372486..d522272 100644
--- a/xen/common/sched_rt.c
+++ b/xen/common/sched_rt.c
@@ -16,6 +16,7 @@ 
 #include <xen/delay.h>
 #include <xen/event.h>
 #include <xen/time.h>
+#include <xen/timer.h>
 #include <xen/perfc.h>
 #include <xen/sched-if.h>
 #include <xen/softirq.h>
@@ -147,6 +148,16 @@  static unsigned int nr_rt_ops;
  * Global lock is referenced by schedule_data.schedule_lock from all
  * physical cpus. It can be grabbed via vcpu_schedule_lock_irq()
  */
+
+/* dedicated timer for replenishment */
+static struct timer repl_timer;
+
+/* controls when to first start the timer*/
+static int timer_started;
+
+/* handler for the replenishment timer */
+static void repl_handler(void *data); 
+
 struct rt_private {
     spinlock_t lock;            /* the global coarse grand lock */
     struct list_head sdom;      /* list of availalbe domains, used for dump */
@@ -426,6 +437,7 @@  __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
 static int
 rt_init(struct scheduler *ops)
 {
+    const int cpu = smp_processor_id(); 
     struct rt_private *prv = xzalloc(struct rt_private);
 
     printk("Initializing RTDS scheduler\n"
@@ -454,6 +466,8 @@  rt_init(struct scheduler *ops)
 
     ops->sched_data = prv;
 
+    init_timer(&repl_timer, repl_handler, ops, 0);
+
     return 0;
 
  no_mem:
@@ -473,6 +487,9 @@  rt_deinit(const struct scheduler *ops)
         xfree(_cpumask_scratch);
         _cpumask_scratch = NULL;
     }
+
+    kill_timer(&repl_timer);
+
     xfree(prv);
 }
 
@@ -635,6 +652,13 @@  rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
 
     /* add rt_vcpu svc to scheduler-specific vcpu list of the dom */
     list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu);
+
+    if(!timer_started)
+    {   
+        /* the first vcpu starts the timer for the first time*/
+        timer_started = 1;
+        set_timer(&repl_timer,svc->cur_deadline);
+    }
 }
 
 /*
@@ -792,44 +816,6 @@  __runq_pick(const struct scheduler *ops, const cpumask_t *mask)
 }
 
 /*
- * Update vcpu's budget and
- * sort runq by insert the modifed vcpu back to runq
- * lock is grabbed before calling this function
- */
-static void
-__repl_update(const struct scheduler *ops, s_time_t now)
-{
-    struct list_head *runq = rt_runq(ops);
-    struct list_head *depletedq = rt_depletedq(ops);
-    struct list_head *iter;
-    struct list_head *tmp;
-    struct rt_vcpu *svc = NULL;
-
-    list_for_each_safe(iter, tmp, runq)
-    {
-        svc = __q_elem(iter);
-        if ( now < svc->cur_deadline )
-            break;
-
-        rt_update_deadline(now, svc);
-        /* reinsert the vcpu if its deadline is updated */
-        __q_remove(svc);
-        __runq_insert(ops, svc);
-    }
-
-    list_for_each_safe(iter, tmp, depletedq)
-    {
-        svc = __q_elem(iter);
-        if ( now >= svc->cur_deadline )
-        {
-            rt_update_deadline(now, svc);
-            __q_remove(svc); /* remove from depleted queue */
-            __runq_insert(ops, svc); /* add to runq */
-        }
-    }
-}
-
-/*
  * schedule function for rt scheduler.
  * The lock is already grabbed in schedule.c, no need to lock here
  */
@@ -848,7 +834,6 @@  rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
     /* burn_budget would return for IDLE VCPU */
     burn_budget(ops, scurr, now);
 
-    __repl_update(ops, now);
 
     if ( tasklet_work_scheduled )
     {
@@ -889,7 +874,7 @@  rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched
         }
     }
 
-    ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */
+    ret.time = snext->budget; /* invoke the scheduler next time */
     ret.task = snext->vcpu;
 
     /* TRACE */
@@ -1033,10 +1018,6 @@  rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
 {
     struct rt_vcpu * const svc = rt_vcpu(vc);
     s_time_t now = NOW();
-    struct rt_private *prv = rt_priv(ops);
-    struct rt_vcpu *snext = NULL; /* highest priority on RunQ */
-    struct rt_dom *sdom = NULL;
-    cpumask_t *online;
 
     BUG_ON( is_idle_vcpu(vc) );
 
@@ -1074,14 +1055,7 @@  rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
     /* insert svc to runq/depletedq because svc is not in queue now */
     __runq_insert(ops, svc);
 
-    __repl_update(ops, now);
-
-    ASSERT(!list_empty(&prv->sdom));
-    sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
-    online = cpupool_scheduler_cpumask(sdom->dom->cpupool);
-    snext = __runq_pick(ops, online); /* pick snext from ALL valid cpus */
-
-    runq_tickle(ops, snext);
+    runq_tickle(ops, svc);
 
     return;
 }
@@ -1094,10 +1068,6 @@  static void
 rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
 {
     struct rt_vcpu *svc = rt_vcpu(vc);
-    struct rt_vcpu *snext = NULL;
-    struct rt_dom *sdom = NULL;
-    struct rt_private *prv = rt_priv(ops);
-    cpumask_t *online;
     spinlock_t *lock = vcpu_schedule_lock_irq(vc);
 
     clear_bit(__RTDS_scheduled, &svc->flags);
@@ -1108,15 +1078,8 @@  rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
     if ( test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) &&
          likely(vcpu_runnable(vc)) )
     {
+        /* only insert the pre-empted vcpu back*/
         __runq_insert(ops, svc);
-        __repl_update(ops, NOW());
-
-        ASSERT(!list_empty(&prv->sdom));
-        sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem);
-        online = cpupool_scheduler_cpumask(sdom->dom->cpupool);
-        snext = __runq_pick(ops, online); /* pick snext from ALL cpus */
-
-        runq_tickle(ops, snext);
     }
 out:
     vcpu_schedule_unlock_irq(lock, vc);
@@ -1167,6 +1130,74 @@  rt_dom_cntl(
     return rc;
 }
 
+static void repl_handler(void *data){
+    unsigned long flags;
+    s_time_t now = NOW();
+    s_time_t min_repl = LONG_MAX; /* max time used in comparison*/
+    struct scheduler *ops = data; 
+    struct rt_private *prv = rt_priv(ops);
+    struct list_head *runq = rt_runq(ops);
+    struct list_head *depletedq = rt_depletedq(ops);
+    struct list_head *iter;
+    struct list_head *tmp;
+    struct rt_vcpu *svc = NULL;
+
+    spin_lock_irqsave(&prv->lock,flags);
+
+    stop_timer(&repl_timer);
+
+    list_for_each_safe(iter, tmp, runq)
+    {
+        svc = __q_elem(iter);
+        if ( now < svc->cur_deadline )
+            break;
+        rt_update_deadline(now, svc);
+        /* scan the runq to find the min release time 
+         * this happens when vcpus on runq miss deadline
+         */
+        if( min_repl> svc->cur_deadline )
+        {
+            min_repl = svc->cur_deadline;
+        }
+        /* reinsert the vcpu if its deadline is updated */
+        __q_remove(svc);
+        __runq_insert(ops, svc);
+    }
+
+    list_for_each_safe(iter, tmp, depletedq)
+    {
+        svc = __q_elem(iter);
+        if ( now >= svc->cur_deadline )
+        {
+            rt_update_deadline(now, svc);
+
+            /* scan the delp_q to find the minimal release time */
+            if(min_repl > svc->cur_deadline)
+            {    
+                min_repl = svc->cur_deadline;
+            }
+            __q_remove(svc);
+            __runq_insert(ops, svc);
+            runq_tickle(ops, svc);
+        }
+    }
+
+    /* if timer was triggered but none of the vcpus
+     * need to be refilled, set the timer to be the   
+     * default period + now
+     */
+    if(min_repl == LONG_MAX)
+    {
+        set_timer(&repl_timer, now + RTDS_DEFAULT_PERIOD);
+    }
+    else
+    {
+        /* reprogram the timer using the most imminent release time*/
+        set_timer(&repl_timer, min_repl);
+    }
+    spin_unlock_irqrestore(&prv->lock,flags);
+}
+
 static struct rt_private _rt_priv;
 
 const struct scheduler sched_rtds_def = {