diff mbox

[1/4] xen: credit2: implement utilization cap

Message ID 149692372627.9605.8252407697848997058.stgit@Solace.fritz.box (mailing list archive)
State New, archived
Headers show

Commit Message

Dario Faggioli June 8, 2017, 12:08 p.m. UTC
This commit implements the Xen part of the cap mechanism for
Credit2.

A cap is how much, in terms of % of physical CPU time, a domain
can execute at most.

For instance, a domain that must not use more than 1/4 of one
physical CPU, must have a cap of 25%; one that must not use more
than 1+1/2 of physical CPU time, must be given a cap of 150%.

Caps are per domain, so it is all a domain's vCPUs, cumulatively,
that will be forced to execute no more than the decided amount.

This is implemented by giving each domain a 'budget', and using
a (per-domain again) periodic timer. Values of budget and 'period'
are chosen so that budget/period is equal to the cap itself.

Budget is burned by the domain's vCPUs, in a similar way to how
credits are.

When a domain runs out of budget, its vCPUs can't run any longer.
They can run again when the budget is replenished by the timer, an
event which happens once every period.

Blocking the vCPUs because of lack of budget happens by
means of a new (_VPF_parked) pause flag, so that, e.g.,
vcpu_runnable() still works. This is similar to what is
done in sched_rtds.c, as opposed to what happens in
sched_credit.c, where vcpu_pause() and vcpu_unpause() are used
(which means, among other things, more overhead).

Note that xenalyze and tools/xentrace/formats are also modified,
to keep them updated with one modified event.

Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
---
Cc: George Dunlap <george.dunlap@eu.citrix.com>
Cc: Anshul Makkar <anshul.makkar@citrix.com>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Ian Jackson <ian.jackson@eu.citrix.com>
Cc: Wei Liu <wei.liu2@citrix.com>
---
 tools/xentrace/formats     |    2 
 tools/xentrace/xenalyze.c  |   10 +
 xen/common/sched_credit2.c |  470 +++++++++++++++++++++++++++++++++++++++++---
 xen/include/xen/sched.h    |    3 
 4 files changed, 445 insertions(+), 40 deletions(-)

Comments

Anshul Makkar June 12, 2017, 11:16 a.m. UTC | #1
On 08/06/2017 13:08, Dario Faggioli wrote:
> This commit implements the Xen part of the cap mechanism for
> Credit2.
>
> A cap is how much, in terms of % of physical CPU time, a domain
> can execute at most.
>
> For instance, a domain that must not use more than 1/4 of one
> physical CPU, must have a cap of 25%; one that must not use more
> than 1+1/2 of physical CPU time, must be given a cap of 150%.
>
> Caps are per domain, so it is all a domain's vCPUs, cumulatively,
> that will be forced to execute no more than the decided amount.
>
> This is implemented by giving each domain a 'budget', and using
> a (per-domain again) periodic timer. Values of budget and 'period'
> are chosen so that budget/period is equal to the cap itself.
>
> Budget is burned by the domain's vCPUs, in a similar way to how
> credits are.
>
> When a domain runs out of budget, its vCPUs can't run any longer.
If the vcpus of a domain have credit but the budget has run out, the
vcpus won't be scheduled?
> They can gain, when the budget is replenishment by the timer, which
> event happens once every period.
>
> Blocking the vCPUs because of lack of budget happens by
> means of a new (_VPF_parked) pause flag, so that, e.g.,
> vcpu_runnable() still works. This is similar to what is
> done in sched_rtds.c, as opposed to what happens in
> sched_credit.c, where vcpu_pause() and vcpu_unpause()
> (which means, among other things, more overhead).
>
> Note that xenalyze and tools/xentrace/format are also modified,
> to keep them updated with one modified event.
>
> Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
> ---
> Cc: George Dunlap <george.dunlap@eu.citrix.com>
> Cc: Anshul Makkar <anshul.makkar@citrix.com>
> Cc: Andrew Cooper <andrew.cooper3@citrix.com>
> Cc: Jan Beulich <jbeulich@suse.com>
> Cc: Ian Jackson <ian.jackson@eu.citrix.com>
> Cc: Wei Liu <wei.liu2@citrix.com>
> ---
>  tools/xentrace/formats     |    2
>  tools/xentrace/xenalyze.c  |   10 +
>  xen/common/sched_credit2.c |  470 +++++++++++++++++++++++++++++++++++++++++---
>  xen/include/xen/sched.h    |    3
>  4 files changed, 445 insertions(+), 40 deletions(-)
>
> diff --git a/tools/xentrace/formats b/tools/xentrace/formats
> index 8b31780..142b0cf 100644
> --- a/tools/xentrace/formats
> +++ b/tools/xentrace/formats
> @@ -51,7 +51,7 @@
>
>  0x00022201  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  csched2:tick
>  0x00022202  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  csched2:runq_pos       [ dom:vcpu = 0x%(1)08x, pos = %(2)d]
> -0x00022203  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  csched2:credit burn    [ dom:vcpu = 0x%(1)08x, credit = %(2)d, delta = %(3)d ]
> +0x00022203  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  csched2:credit burn    [ dom:vcpu = 0x%(1)08x, credit = %(2)d, budget = %(3)d, delta = %(4)d ]
>  0x00022204  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  csched2:credit_add
>  0x00022205  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  csched2:tickle_check   [ dom:vcpu = 0x%(1)08x, credit = %(2)d ]
>  0x00022206  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  csched2:tickle         [ cpu = %(1)d ]
> diff --git a/tools/xentrace/xenalyze.c b/tools/xentrace/xenalyze.c
> index fa608ad..c16c02d 100644
> --- a/tools/xentrace/xenalyze.c
> +++ b/tools/xentrace/xenalyze.c
> @@ -7680,12 +7680,14 @@ void sched_process(struct pcpu_info *p)
>              if(opt.dump_all) {
>                  struct {
>                      unsigned int vcpuid:16, domid:16;
> -                    int credit, delta;
> +                    int credit, budget, delta;
>                  } *r = (typeof(r))ri->d;
>
> -                printf(" %s csched2:burn_credits d%uv%u, credit = %d, delta = %d\n",
> -                       ri->dump_header, r->domid, r->vcpuid,
> -                       r->credit, r->delta);
> +                printf(" %s csched2:burn_credits d%uv%u, credit = %d, ",
> +                       ri->dump_header, r->domid, r->vcpuid, r->credit);
> +                if ( r->budget != INT_MIN )
> +                    printf("budget = %d, ", r->budget);
> +                printf("delta = %d\n", r->delta);
>              }
>              break;
>          case TRC_SCHED_CLASS_EVT(CSCHED2, 5): /* TICKLE_CHECK      */
> diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c
> index 126417c..ba4bf4b 100644
> --- a/xen/common/sched_credit2.c
> +++ b/xen/common/sched_credit2.c
> @@ -92,6 +92,82 @@
>   */
>
>  /*
> + * Utilization cap:
> + *
> + * Setting an pCPU utilization cap for a domain means the following:
> + *
> + * - a domain can have a cap, expressed in terms of % of physical CPU time.
> + *   A domain that must not use more than 1/4 of _one_ physical CPU, will
> + *   be given a cap of 25%; a domain that must not use more than 1+1/2 of
> + *   physical CPU time, will be given a cap of 150%;
> + *
> + * - caps are per-domain (not per-vCPU). If a domain has only 1 vCPU, and
> + *   a 40% cap, that one vCPU will use 40% of one pCPU. If a somain has 4
> + *   vCPUs, and a 200% cap, all its 4 vCPUs are allowed to run for (the
> + *   equivalent of) 100% time on 2 pCPUs. How much each of the various 4
> + *   vCPUs will get, is unspecified (will depend on various aspects: workload,
> + *   system load, etc.).
or the ratio can vary, e.g. the 4 vcpus are allowed to run for 50% of the
time each, if on 4 pcpus?
> + *
> + * For implementing this, we use the following approach:
> + *
> + * - each domain is given a 'budget', an each domain has a timer, which
> + *   replenishes the domain's budget periodically. The budget is the amount
> + *   of time the vCPUs of the domain can use every 'period';
> + *
> + * - the period is CSCHED2_BDGT_REPL_PERIOD, and is the same for all domains
> + *   (but each domain has its own timer; so the all are periodic by the same
> + *   period, but replenishment of the budgets of the various domains, at
> + *   periods boundaries, are not synchronous);
> + *
> + * - when vCPUs run, they consume budget. When they don't run, they don't
> + *   consume budget. If there is no budget left for the domain, no vCPU of
> + *   that domain can run. If a vCPU tries to run and finds that there is no
> + *   budget, it blocks.
> + *   Budget never expires, so at whatever time a vCPU wants to run, it can
> + *   check the domain's budget, and if there is some, it can use it.
> + *
> + * - budget is replenished to the top of the capacity for the domain once
> + *   per period. Even if there was some leftover budget from previous period,
> + *   though, the budget after a replenishment will always be at most equal
> + *   to the total capacify of the domain ('tot_budget');
> + *
What if the budget is replenished but credits are not available?
What if the budget is finished but the vcpu has not reached the rate limit boundary?

> + * - when a budget replenishment occurs, if there are vCPUs that had been
> + *   blocked because of lack of budget, they'll be unblocked, and they will
> + *   (potentially) be able to run again.
> + *
> + * Finally, some even more implementation related detail:
> + *
> + * - budget is stored in a domain-wide pool. vCPUs of the domain that want
> + *   to run go to such pool, and grub some. When they do so, the amount
> + *   they grabbed is _immediately_ removed from the pool. This happens in
> + *   vcpu_try_to_get_budget();
> + *
> + * - when vCPUs stop running, if they've not consumed all the budget they
> + *   took, the leftover is put back in the pool. This happens in
> + *   vcpu_give_budget_back();
200% budget, 4 vcpus to run on 4 pcpus, each allowed only 50% of the budget.
This is a static allocation. For e.g. 2 vcpus running on 2 pcpus at 20%
budgeted time, if vcpu3 wants to execute some cpu intensive task, then
it won't be allowed to use more than 50% of the pcpus.

I checked the implementation below and I believe we can allow for this
type of dynamic budget_quota allocation per vcpu. Not for the initial
version, but certainly we can consider it for future versions.
> + *
> + * - the above means that a vCPU can find out that there is no budget and
> + *   block, not only if the cap has actually been reached (for this period),
> + *   but also if some other vCPUs, in order to run, have grabbed a certain
> + *   quota of budget, no matter whether they've already used it all or not.
> + *   A vCPU blocking because (any form of) lack of budget is said to be
> + *   "parked", and such blocking happens in park_vcpu();
> + *
> + * - when a vCPU stops running, and puts back some budget in the domain pool,
> + *   we need to check whether there is someone which has been parked and that
> + *   can be unparked. This happens in unpark_parked_vcpus(), called from
> + *   csched2_context_saved();
> + *
> + * - of course, unparking happens also as a consequene of the domain's budget
> + *   being replenished by the periodic timer. This also occurs by means of
> + *   calling csched2_context_saved() (but from repl_sdom_budget());
> + *
> + * - parked vCPUs of a domain are kept in a (per-domain) list, called
> + *   'parked_vcpus'). Manipulation of the list and of the domain-wide budget
> + *   pool, must occur only when holding the 'budget_lock'.
> + */
> +
> +/*
>   * Locking:
>   *
>   * - runqueue lock
> @@ -112,18 +188,29 @@
>   *     runqueue each cpu is;
>   *  + serializes the operation of changing the weights of domains;
>   *
> + * - Budget lock
> + *  + it is per-domain;
> + *  + protects, in domains that have an utilization cap;
> + *   * manipulation of the total budget of the domain (as it is shared
> + *     among all vCPUs of the domain),
> + *   * manipulation of the list of vCPUs that are blocked waiting for
> + *     some budget to be available.
> + *
>   * - Type:
>   *  + runqueue locks are 'regular' spinlocks;
>   *  + the private scheduler lock can be an rwlock. In fact, data
>   *    it protects is modified only during initialization, cpupool
>   *    manipulation and when changing weights, and read in all
> - *    other cases (e.g., during load balancing).
> + *    other cases (e.g., during load balancing);
> + *  + budget locks are 'regular' spinlocks.
>   *
>   * Ordering:
>   *  + tylock must be used when wanting to take a runqueue lock,
>   *    if we already hold another one;
>   *  + if taking both a runqueue lock and the private scheduler
> - *    lock is, the latter must always be taken for first.
> + *    lock is, the latter must always be taken for first;
> + *  + if taking both a runqueue lock and a budget lock, the former
> + *    must always be taken for first.
>   */
>
>  /*
> @@ -164,6 +251,8 @@
>  #define CSCHED2_CREDIT_RESET         0
>  /* Max timer: Maximum time a guest can be run for. */
>  #define CSCHED2_MAX_TIMER            CSCHED2_CREDIT_INIT
> +/* Period of the cap replenishment timer. */
> +#define CSCHED2_BDGT_REPL_PERIOD     ((opt_cap_period)*MILLISECS(1))
>
>  /*
>   * Flags
> @@ -293,6 +382,14 @@ static int __read_mostly opt_underload_balance_tolerance = 0;
>  integer_param("credit2_balance_under", opt_underload_balance_tolerance);
>  static int __read_mostly opt_overload_balance_tolerance = -3;
>  integer_param("credit2_balance_over", opt_overload_balance_tolerance);
> +/*
> + * Domains subject to a cap, receive a replenishment of their runtime budget
> + * once every opt_cap_period interval. Default is 10 ms. The amount of budget
> + * they receive depends on their cap. For instance, a domain with a 50% cap
> + * will receive 50% of 10 ms, so 5 ms.
> + */
> +static unsigned int __read_mostly opt_cap_period = 10;    /* ms */
> +integer_param("credit2_cap_period_ms", opt_cap_period);
>
>  /*
>   * Runqueue organization.
> @@ -408,6 +505,10 @@ struct csched2_vcpu {
>      unsigned int residual;
>
>      int credit;
> +
> +    s_time_t budget;
It's confusing; can we please have different member names for the budget in
the domain and vcpu structures.
> +    struct list_head parked_elem;      /* On the parked_vcpus list   */
> +
>      s_time_t start_time; /* When we were scheduled (used for credit) */
>      unsigned flags;      /* 16 bits doesn't seem to play well with clear_bit() */
>      int tickled_cpu;     /* cpu tickled for picking us up (-1 if none) */
> @@ -425,7 +526,15 @@ struct csched2_vcpu {
>  struct csched2_dom {
>      struct list_head sdom_elem;
>      struct domain *dom;
> +
> +    spinlock_t budget_lock;
> +    struct timer repl_timer;
> +    s_time_t next_repl;
> +    s_time_t budget, tot_budget;
> +    struct list_head parked_vcpus;
> +
>      uint16_t weight;
> +    uint16_t cap;
>      uint16_t nr_vcpus;
>  };
>
> @@ -460,6 +569,12 @@ static inline struct csched2_runqueue_data *c2rqd(const struct scheduler *ops,
>      return &csched2_priv(ops)->rqd[c2r(ops, cpu)];
>  }
>
> +/* Does the domain of this vCPU have a cap? */
> +static inline bool has_cap(const struct csched2_vcpu *svc)
> +{
> +    return svc->budget != STIME_MAX;
> +}
> +
>  /*
>   * Hyperthreading (SMT) support.
>   *
> @@ -1354,7 +1469,16 @@ static void reset_credit(const struct scheduler *ops, int cpu, s_time_t now,
>           * that the credit it has spent so far get accounted.
>           */
>          if ( svc->vcpu == curr_on_cpu(svc_cpu) )
> +        {
>              burn_credits(rqd, svc, now);
> +            /*
> +             * And, similarly, in case it has run out of budget, as a
> +             * consequence of this round of accounting, we also must inform
> +             * its pCPU that it's time to park it, and pick up someone else.
> +             */
> +            if ( unlikely(svc->budget <= 0) )
use of unlikely here is not saving many cpu cycles.
> +                tickle_cpu(svc_cpu, rqd);
> +        }
>
>          start_credit = svc->credit;
>
> @@ -1410,27 +1534,35 @@ void burn_credits(struct csched2_runqueue_data *rqd,
>
>      delta = now - svc->start_time;
>
> -    if ( likely(delta > 0) )
> -    {
> -        SCHED_STAT_CRANK(burn_credits_t2c);
> -        t2c_update(rqd, delta, svc);
> -        svc->start_time = now;
> -    }
> -    else if ( delta < 0 )
> +    if ( unlikely(delta <= 0) )
>      {
> -        d2printk("WARNING: %s: Time went backwards? now %"PRI_stime" start_time %"PRI_stime"\n",
> -                 __func__, now, svc->start_time);
> +        if ( unlikely(delta < 0) )
> +            d2printk("WARNING: %s: Time went backwards? now %"PRI_stime
> +                     " start_time %"PRI_stime"\n", __func__, now,
> +                     svc->start_time);
> +        goto out;
>      }
>
> +    SCHED_STAT_CRANK(burn_credits_t2c);
> +    t2c_update(rqd, delta, svc);
> +
> +    if ( unlikely(svc->budget != STIME_MAX) )
not clear, what is this check about ?
> +        svc->budget -= delta;
> +
> +    svc->start_time = now;
> +
> + out:
>      if ( unlikely(tb_init_done) )
>      {
>          struct {
>              unsigned vcpu:16, dom:16;
> -            int credit, delta;
> +            int credit, budget;
> +            int delta;
>          } d;
>          d.dom = svc->vcpu->domain->domain_id;
>          d.vcpu = svc->vcpu->vcpu_id;
>          d.credit = svc->credit;
> +        d.budget = has_cap(svc) ?  svc->budget : INT_MIN;
>          d.delta = delta;
>          __trace_var(TRC_CSCHED2_CREDIT_BURN, 1,
>                      sizeof(d),
> @@ -1438,6 +1570,217 @@ void burn_credits(struct csched2_runqueue_data *rqd,
>      }
>  }
>
> +/*
> + * Budget-related code.
> + */
> +
> +static void park_vcpu(struct csched2_vcpu *svc)
> +{
> +    struct vcpu *v = svc->vcpu;
> +
> +    ASSERT(spin_is_locked(&svc->sdom->budget_lock));
> +
> +    /*
> +     * It was impossible to find budget for this vCPU, so it has to be
> +     * "parked". This implies it is not runnable, so we mark it as such in
> +     * its pause_flags. If the vCPU is currently scheduled (which means we
> +     * are here after being called from within csched_schedule()), flagging
> +     * is enough, as we'll choose someone else, and then context_saved()
> +     * will take care of updating the load properly.
> +     *
> +     * If, OTOH, the vCPU is sitting in the runqueue (which means we are here
> +     * after being called from within runq_candidate()), we must go all the
> +     * way down to taking it out of there, and updating the load accordingly.
> +     *
> +     * In both cases, we also add it to the list of parked vCPUs of the domain.
> +     */
> +    __set_bit(_VPF_parked, &v->pause_flags);
> +    if ( vcpu_on_runq(svc) )
> +    {
> +        runq_remove(svc);
> +        update_load(svc->sdom->dom->cpupool->sched, svc->rqd, svc, -1, NOW());
> +    }
> +    list_add(&svc->parked_elem, &svc->sdom->parked_vcpus);
> +}
> +
> +static bool vcpu_try_to_get_budget(struct csched2_vcpu *svc)
> +{
> +    struct csched2_dom *sdom = svc->sdom;
> +    unsigned int cpu = svc->vcpu->processor;
> +
> +    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));
> +
> +    if ( svc->budget > 0 )
> +        return true;
> +
> +    /* budget_lock nests inside runqueue lock. */
> +    spin_lock(&sdom->budget_lock);
> +
> +    /*
> +     * Here, svc->budget is <= 0 (as, if it was > 0, we'd have taken the if
> +     * above!). That basically means the vCPU has overrun a bit --because of
> +     * various reasons-- and we want to take that into account. With the +=,
> +     * we are actually subtracting the amount of budget the vCPU has
> +     * overconsumed, from the total domain budget.
> +     */
> +    sdom->budget += svc->budget;
> +
> +    if ( sdom->budget > 0 )
> +    {
> +        svc->budget = sdom->budget;
why are you assigning the remaining sdom->budget to only this svc? svc
should be assigned a proportionate budget.
Each vcpu is assigned a %age of the domain budget based on the cap and
number of vcpus.
There is a difference between the code that's here and the code in branch
git://xenbits.xen.org/people/dariof/xen.git (fetch)
rel/sched/credti2-caps branch. Logic in the branch code looks fine where
you are taking svc->budget_quota into consideration.
> +        sdom->budget = 0;
> +    }
> +    else
> +    {
> +        svc->budget = 0;
> +        park_vcpu(svc);
> +    }
> +
> +    spin_unlock(&sdom->budget_lock);
> +
> +    return svc->budget > 0;
> +}
> +
> +static void
> +vcpu_give_budget_back(struct csched2_vcpu *svc, struct list_head *parked)
> +{
> +    struct csched2_dom *sdom = svc->sdom;
> +    unsigned int cpu = svc->vcpu->processor;
> +
> +    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));
> +    ASSERT(list_empty(parked));
> +
> +    /* budget_lock nests inside runqueue lock. */
> +    spin_lock(&sdom->budget_lock);
> +
> +    /*
> +     * The vCPU is stopping running (e.g., because it's blocking, or it has
> +     * been preempted). If it hasn't consumed all the budget it got when,
> +     * starting to run, put that remaining amount back in the domain's budget
> +     * pool.
> +     */
> +    sdom->budget += svc->budget;
> +    svc->budget = 0;
> +
> +    /*
> +     * Making budget available again to the domain means that parked vCPUs
> +     * may be unparked and run. They are, if any, in the domain's parked_vcpus
> +     * list, so we want to go through that and unpark them (so they can try
> +     * to get some budget).
> +     *
> +     * Touching the list requires the budget_lock, which we hold. Let's
> +     * therefore put everyone in that list in another, temporary list, which
> +     * then the caller will traverse, unparking the vCPUs it finds there.
> +     *
> +     * In fact, we can't do the actual unparking here, because that requires
> +     * taking the runqueue lock of the vCPUs being unparked, and we can't
> +     * take any runqueue locks while we hold a budget_lock.
> +     */
> +    if ( sdom->budget > 0 )
> +        list_splice_init(&sdom->parked_vcpus, parked);
> +
> +    spin_unlock(&sdom->budget_lock);
> +}
> +
> +static void
> +unpark_parked_vcpus(const struct scheduler *ops, struct list_head *vcpus)
> +{
> +    struct csched2_vcpu *svc, *tmp;
> +    spinlock_t *lock;
> +
> +    list_for_each_entry_safe(svc, tmp, vcpus, parked_elem)
> +    {
> +        unsigned long flags;
> +        s_time_t now;
> +
> +        lock = vcpu_schedule_lock_irqsave(svc->vcpu, &flags);
> +
> +        __clear_bit(_VPF_parked, &svc->vcpu->pause_flags);
> +        if ( unlikely(svc->flags & CSFLAG_scheduled) )
> +        {
> +            /*
> +             * We end here if a budget replenishment arrived between
> +             * csched2_schedule() (and, in particular, after a call to
> +             * vcpu_try_to_get_budget() that returned false), and
> +             * context_saved(). By setting __CSFLAG_delayed_runq_add,
> +             * we tell context_saved() to put the vCPU back in the
> +             * runqueue, from where it will compete with the others
> +             * for the newly replenished budget.
> +             */
> +            ASSERT( svc->rqd != NULL );
> +            ASSERT( c2rqd(ops, svc->vcpu->processor) == svc->rqd );
> +            __set_bit(__CSFLAG_delayed_runq_add, &svc->flags);
> +        }
> +        else if ( vcpu_runnable(svc->vcpu) )
> +        {
> +            /*
> +             * The vCPU should go back to the runqueue, and compete for
> +             * the newly replenished budget, but only if it is actually
> +             * runnable (and was therefore offline only because of the
> +             * lack of budget).
> +             */
> +            now = NOW();
> +            update_load(ops, svc->rqd, svc, 1, now);
> +            runq_insert(ops, svc);
> +            runq_tickle(ops, svc, now);
> +        }
> +        list_del_init(&svc->parked_elem);
> +
> +        vcpu_schedule_unlock_irqrestore(lock, flags, svc->vcpu);
> +    }
> +}
> +
> +static void repl_sdom_budget(void* data)
> +{
> +    struct csched2_dom *sdom = data;
> +    unsigned long flags;
> +    s_time_t now;
> +    LIST_HEAD(parked);
> +
> +    spin_lock_irqsave(&sdom->budget_lock, flags);
> +
> +    /*
> +     * It is possible that the domain overrun, and that the budget hence went
> +     * below 0 (reasons may be system overbooking, issues in or too coarse
> +     * runtime accounting, etc.). In particular, if we overrun by more than
> +     * tot_budget, then budget+tot_budget would still be < 0, which in turn
> +     * means that, despite replenishment, there's still no budget for unarking
> +     * and running vCPUs.
> +     *
> +     * It is also possible that we are handling the replenishment much later
> +     * than expected (reasons may again be overbooking, or issues with timers).
> +     * If we are more than CSCHED2_BDGT_REPL_PERIOD late, this means we have
> +     * basically skipped (at least) one replenishment.
> +     *
> +     * We deal with both the issues here, by, basically, doing more than just
> +     * one replenishment. Note, however, that every time we add tot_budget
> +     * to the budget, we also move next_repl away by CSCHED2_BDGT_REPL_PERIOD.
> +     * This guarantees we always respect the cap.
> +     */
> +    now = NOW();
> +    do
> +    {
> +        sdom->next_repl += CSCHED2_BDGT_REPL_PERIOD;
> +        sdom->budget += sdom->tot_budget;
> +    }
> +    while ( sdom->next_repl <= now || sdom->budget <= 0 );
> +    /* We may have done more replenishments: make sure we didn't overshot. */
> +    sdom->budget = min(sdom->budget, sdom->tot_budget);
> +
> +    /*
> +     * As above, let's prepare the temporary list, out of the domain's
> +     * parked_vcpus list, now that we hold the budget_lock. Then, drop such
> +     * lock, and pass the list to the unparking function.
> +     */
> +    list_splice_init(&sdom->parked_vcpus, &parked);
> +
> +    spin_unlock_irqrestore(&sdom->budget_lock, flags);
> +
> +    unpark_parked_vcpus(sdom->dom->cpupool->sched, &parked);
> +
> +    set_timer(&sdom->repl_timer, sdom->next_repl);
> +}
> +
>  #ifndef NDEBUG
>  static inline void
>  csched2_vcpu_check(struct vcpu *vc)
> @@ -1497,6 +1840,9 @@ csched2_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd)
>      }
>      svc->tickled_cpu = -1;
>
> +    svc->budget = STIME_MAX;
> +    INIT_LIST_HEAD(&svc->parked_elem);
> +
>      SCHED_STAT_CRANK(vcpu_alloc);
>
>      return svc;
> @@ -1593,6 +1939,7 @@ csched2_context_saved(const struct scheduler *ops, struct vcpu *vc)
>      struct csched2_vcpu * const svc = csched2_vcpu(vc);
>      spinlock_t *lock = vcpu_schedule_lock_irq(vc);
>      s_time_t now = NOW();
> +    LIST_HEAD(were_parked);
>
>      BUG_ON( !is_idle_vcpu(vc) && svc->rqd != c2rqd(ops, vc->processor));
>      ASSERT(is_idle_vcpu(vc) || svc->rqd == c2rqd(ops, vc->processor));
> @@ -1600,6 +1947,9 @@ csched2_context_saved(const struct scheduler *ops, struct vcpu *vc)
>      /* This vcpu is now eligible to be put on the runqueue again */
>      __clear_bit(__CSFLAG_scheduled, &svc->flags);
>
> +    if ( unlikely(has_cap(svc) && svc->budget > 0) )
> +        vcpu_give_budget_back(svc, &were_parked);
> +
>      /* If someone wants it on the runqueue, put it there. */
>      /*
>       * NB: We can get rid of CSFLAG_scheduled by checking for
> @@ -1620,6 +1970,8 @@ csched2_context_saved(const struct scheduler *ops, struct vcpu *vc)
>          update_load(ops, svc->rqd, svc, -1, now);
>
>      vcpu_schedule_unlock_irq(lock, vc);
> +
> +    unpark_parked_vcpus(ops, &were_parked);
>  }
>
>  #define MAX_LOAD (STIME_MAX);
> @@ -2243,12 +2595,18 @@ csched2_alloc_domdata(const struct scheduler *ops, struct domain *dom)
>      if ( sdom == NULL )
>          return NULL;
>
> -    /* Initialize credit and weight */
> +    /* Initialize credit, cap and weight */
>      INIT_LIST_HEAD(&sdom->sdom_elem);
>      sdom->dom = dom;
>      sdom->weight = CSCHED2_DEFAULT_WEIGHT;
> +    sdom->cap = 0U;
>      sdom->nr_vcpus = 0;
>
> +    init_timer(&sdom->repl_timer, repl_sdom_budget, (void*) sdom,
> +               cpumask_any(cpupool_domain_cpumask(dom)));
> +    spin_lock_init(&sdom->budget_lock);
> +    INIT_LIST_HEAD(&sdom->parked_vcpus);
> +
>      write_lock_irqsave(&prv->lock, flags);
>
>      list_add_tail(&sdom->sdom_elem, &csched2_priv(ops)->sdom);
> @@ -2284,6 +2642,7 @@ csched2_free_domdata(const struct scheduler *ops, void *data)
>
>      write_lock_irqsave(&prv->lock, flags);
>
> +    kill_timer(&sdom->repl_timer);
>      list_del_init(&sdom->sdom_elem);
>
>      write_unlock_irqrestore(&prv->lock, flags);
> @@ -2378,11 +2737,12 @@ csched2_runtime(const struct scheduler *ops, int cpu,
>          return -1;
>
>      /* General algorithm:
> -     * 1) Run until snext's credit will be 0
> +     * 1) Run until snext's credit will be 0.
>       * 2) But if someone is waiting, run until snext's credit is equal
> -     * to his
> -     * 3) But never run longer than MAX_TIMER or shorter than MIN_TIMER or
> -     * the ratelimit time.
> +     *    to his.
> +     * 3) But, if we are capped, never run more than our budget.
> +     * 4) But never run longer than MAX_TIMER or shorter than MIN_TIMER or
> +     *    the ratelimit time.
>       */
>
>      /* Calculate mintime */
> @@ -2397,11 +2757,13 @@ csched2_runtime(const struct scheduler *ops, int cpu,
>              min_time = ratelimit_min;
>      }
>
> -    /* 1) Basic time: Run until credit is 0. */
> +    /* 1) Run until snext's credit will be 0. */
>      rt_credit = snext->credit;
>
> -    /* 2) If there's someone waiting whose credit is positive,
> -     * run until your credit ~= his */
> +    /*
> +     * 2) If there's someone waiting whose credit is positive,
> +     *    run until your credit ~= his.
> +     */
>      if ( ! list_empty(runq) )
>      {
>          struct csched2_vcpu *swait = runq_elem(runq->next);
> @@ -2423,14 +2785,22 @@ csched2_runtime(const struct scheduler *ops, int cpu,
>       * credit values of MIN,MAX per vcpu, since each vcpu burns credit
>       * at a different rate.
>       */
> -    if (rt_credit > 0)
> +    if ( rt_credit > 0 )
>          time = c2t(rqd, rt_credit, snext);
>      else
>          time = 0;
>
> -    /* 3) But never run longer than MAX_TIMER or less than MIN_TIMER or
> -     * the rate_limit time. */
> -    if ( time < min_time)
> +    /*
> +     * 3) But, if capped, never run more than our budget.
> +     */
> +    if ( unlikely(has_cap(snext)) )
> +        time = snext->budget < time ? snext->budget : time;
> +
does the budget take precedence over rate and credit?
Please replace snext->budget with something which is less confusing, e.g.
snext->budget_allocated.
> +    /*
> +     * 4) But never run longer than MAX_TIMER or less than MIN_TIMER or
> +     *    the rate_limit time.
> +     */
> +    if ( time < min_time )
>      {
>          time = min_time;
>          SCHED_STAT_CRANK(runtime_min_timer);
> @@ -2447,13 +2817,13 @@ csched2_runtime(const struct scheduler *ops, int cpu,
>  /*
>   * Find a candidate.
>   */
> -static struct csched2_vcpu *
> +static noinline struct csched2_vcpu *
>  runq_candidate(struct csched2_runqueue_data *rqd,
>                 struct csched2_vcpu *scurr,
>                 int cpu, s_time_t now,
>                 unsigned int *skipped)
>  {
> -    struct list_head *iter;
> +    struct list_head *iter, *temp;
>      struct csched2_vcpu *snext = NULL;
>      struct csched2_private *prv = csched2_priv(per_cpu(scheduler, cpu));
>      bool yield = __test_and_clear_bit(__CSFLAG_vcpu_yield, &scurr->flags);
> @@ -2496,7 +2866,7 @@ runq_candidate(struct csched2_runqueue_data *rqd,
>      else
>          snext = csched2_vcpu(idle_vcpu[cpu]);
>
> -    list_for_each( iter, &rqd->runq )
> +    list_for_each_safe( iter, temp, &rqd->runq )
>      {
>          struct csched2_vcpu * svc = list_entry(iter, struct csched2_vcpu, runq_elem);
>
> @@ -2544,11 +2914,13 @@ runq_candidate(struct csched2_runqueue_data *rqd,
>          }
>
In runq_candidate() we have this code block:
/*
  * Return the current vcpu if it has executed for less than ratelimit.
  * Adjuststment for the selected vcpu's credit and decision
  * for how long it will run will be taken in csched2_runtime.
  *
  * Note that, if scurr is yielding, we don't let rate limiting kick in.
  * In fact, it may be the case that scurr is about to spin, and there's
  * no point forcing it to do so until rate limiting expires.
  */
  if ( !yield && prv->ratelimit_us && !is_idle_vcpu(scurr->vcpu) &&
       vcpu_runnable(scurr->vcpu) &&
      (now - scurr->vcpu->runstate.state_entry_time) <
        MICROSECS(prv->ratelimit_us) )
In this code block we return scurr, but there is no check of vcpu->budget.
Even if the scurr vcpu has executed for less than the rate limit and scurr
is not yielding, we need to check its budget before returning scurr.


>          /*
> -         * If the next one on the list has more credit than current
> -         * (or idle, if current is not runnable), or if current is
> -         * yielding, choose it.
> +         * If the one in the runqueue has more credit than current (or idle,
> +         * if current is not runnable), or if current is yielding, and also
> +         * if the one in runqueue either is not capped, or is capped but has
> +         * some budget, then choose it.
>           */
> -        if ( yield || svc->credit > snext->credit )
> +        if ( (yield || svc->credit > snext->credit) &&
> +             (!has_cap(svc) || vcpu_try_to_get_budget(svc)) )
>              snext = svc;
>
>          /* In any case, if we got this far, break. */
> @@ -2575,6 +2947,13 @@ runq_candidate(struct csched2_runqueue_data *rqd,
>      if ( unlikely(snext->tickled_cpu != -1 && snext->tickled_cpu != cpu) )
>          SCHED_STAT_CRANK(tickled_cpu_overridden);
>
> +    /*
> +     * If snext is from a capped domain, it must have budget (or it
> +     * wouldn't have been in the runq). If it is not, it'd be STIME_MAX,
> +     * which still is >= 0.
> +     */
> +    ASSERT(snext->budget >= 0);
> +
>      return snext;
>  }
>
> @@ -2632,10 +3011,18 @@ csched2_schedule(
>                      (unsigned char *)&d);
>      }
>
> -    /* Update credits */
> +    /* Update credits (and budget, if necessary). */
>      burn_credits(rqd, scurr, now);
>
>      /*
> +     *  Below 0, means that we are capped and we have overrun our  budget.
> +     *  Let's try to get some more but, if we fail (e.g., because of the
> +     *  other running vcpus), we will be parked.
> +     */
> +    if ( unlikely(scurr->budget <= 0) )
> +        vcpu_try_to_get_budget(scurr);
> +
> +    /*
>       * Select next runnable local VCPU (ie top of local runq).
>       *
>       * If the current vcpu is runnable, and has higher credit than
> @@ -2769,6 +3156,9 @@ csched2_dump_vcpu(struct csched2_private *prv, struct csched2_vcpu *svc)
>
>      printk(" credit=%" PRIi32" [w=%u]", svc->credit, svc->weight);
>
> +    if ( has_cap(svc) )
> +        printk(" budget=%"PRI_stime, svc->budget);
> +
>      printk(" load=%"PRI_stime" (~%"PRI_stime"%%)", svc->avgload,
>             (svc->avgload * 100) >> prv->load_precision_shift);
>
> @@ -2856,9 +3246,10 @@ csched2_dump(const struct scheduler *ops)
>
>          sdom = list_entry(iter_sdom, struct csched2_dom, sdom_elem);
>
> -        printk("\tDomain: %d w %d v %d\n",
> +        printk("\tDomain: %d w %d c %u v %d\n",
>                 sdom->dom->domain_id,
>                 sdom->weight,
> +               sdom->cap,
>                 sdom->nr_vcpus);
>
>          for_each_vcpu( sdom->dom, v )
> @@ -3076,12 +3467,14 @@ csched2_init(struct scheduler *ops)
>             XENLOG_INFO " load_window_shift: %d\n"
>             XENLOG_INFO " underload_balance_tolerance: %d\n"
>             XENLOG_INFO " overload_balance_tolerance: %d\n"
> -           XENLOG_INFO " runqueues arrangement: %s\n",
> +           XENLOG_INFO " runqueues arrangement: %s\n"
> +           XENLOG_INFO " cap enforcement granularity: %dms\n",
>             opt_load_precision_shift,
>             opt_load_window_shift,
>             opt_underload_balance_tolerance,
>             opt_overload_balance_tolerance,
> -           opt_runqueue_str[opt_runqueue]);
> +           opt_runqueue_str[opt_runqueue],
> +           opt_cap_period);
>
>      if ( opt_load_precision_shift < LOADAVG_PRECISION_SHIFT_MIN )
>      {
> @@ -3099,6 +3492,13 @@ csched2_init(struct scheduler *ops)
>      printk(XENLOG_INFO "load tracking window length %llu ns\n",
>             1ULL << opt_load_window_shift);
>
> +    if ( CSCHED2_BDGT_REPL_PERIOD < CSCHED2_MIN_TIMER )
> +    {
> +        printk("WARNING: %s: opt_cap_period %d too small, resetting\n",
> +               __func__, opt_cap_period);
> +        opt_cap_period = 10; /* ms */
> +    }
> +
>      /* Basically no CPU information is available at this point; just
>       * set up basic structures, and a callback when the CPU info is
>       * available. */
> diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
> index 1127ca9..2c7f9cc 100644
> --- a/xen/include/xen/sched.h
> +++ b/xen/include/xen/sched.h
> @@ -787,6 +787,9 @@ static inline struct domain *next_domain_in_cpupool(
>   /* VCPU is being reset. */
>  #define _VPF_in_reset        7
>  #define VPF_in_reset         (1UL<<_VPF_in_reset)
> +/* VCPU is parked. */
> +#define _VPF_parked          8
> +#define VPF_parked           (1UL<<_VPF_parked)
>
>  static inline int vcpu_runnable(struct vcpu *v)
>  {
>
Dario Faggioli June 12, 2017, 1:19 p.m. UTC | #2
Hey,

Thanks for looking at the patch! :-)

On Mon, 2017-06-12 at 12:16 +0100, Anshul Makkar wrote:
> On 08/06/2017 13:08, Dario Faggioli wrote:
> > This commit implements the Xen part of the cap mechanism for
> > Credit2.
> > 
> > A cap is how much, in terms of % of physical CPU time, a domain
> > can execute at most.
> > 
> > For instance, a domain that must not use more than 1/4 of one
> > physical CPU, must have a cap of 25%; one that must not use more
> > than 1+1/2 of physical CPU time, must be given a cap of 150%.
> > 
> > Caps are per domain, so it is all a domain's vCPUs, cumulatively,
> > that will be forced to execute no more than the decided amount.
> > 
> > This is implemented by giving each domain a 'budget', and using
> > a (per-domain again) periodic timer. Values of budget and 'period'
> > are chosen so that budget/period is equal to the cap itself.
> > 
> > Budget is burned by the domain's vCPUs, in a similar way to how
> > credits are.
> > 
> > When a domain runs out of budget, its vCPUs can't run any longer.
> 
> if the vcpus of a domain have credit and if budget has run out, will
> the 
> vcpus won't be scheduled.
>
Is this a question? Assuming it is, what do you mean by "domain have
credit"? Domains always have credits, and they never run out of them.
There's no such thing as a domain not being runnable because it does
not have credits.

About budget, a domain with <= 0 budget means all its vcpus are not
runnable, and hence won't be scheduled, no matter their credits.

> > @@ -92,6 +92,82 @@
> >   */
> > 
> >  /*
> > + * Utilization cap:
> > + *
> > + * Setting an pCPU utilization cap for a domain means the
> > following:
> > + *
> > + * - a domain can have a cap, expressed in terms of % of physical
> > CPU time.
> > + *   A domain that must not use more than 1/4 of _one_ physical
> > CPU, will
> > + *   be given a cap of 25%; a domain that must not use more than
> > 1+1/2 of
> > + *   physical CPU time, will be given a cap of 150%;
> > + *
> > + * - caps are per-domain (not per-vCPU). If a domain has only 1
> > vCPU, and
> > + *   a 40% cap, that one vCPU will use 40% of one pCPU. If a
> > somain has 4
> > + *   vCPUs, and a 200% cap, all its 4 vCPUs are allowed to run for
> > (the
> > + *   equivalent of) 100% time on 2 pCPUs. How much each of the
> > various 4
> > + *   vCPUs will get, is unspecified (will depend on various
> > aspects: workload,
> > + *   system load, etc.).
> 
> or the ratio can vary eg. 4 vcpus are allowed to run got 50% of the
> time 
> if on 4 pcpus ?
>
Yes, this is just an example. From the cap mechanism point of view,
running for 4x50% is exactly the same as running for 2x100%, and
there is no way to control what the actual distribution of runtime will
be, on a multi-vcpu guest.

I can try to make the wording a bit clearer... I can certainly add more
examples (but I have no chance being exhaustive, as the combinations
are virtually infinite!  :-P).

> > + *
> > + * For implementing this, we use the following approach:
> > + *
> > + * - each domain is given a 'budget', an each domain has a timer,
> > which
> > + *   replenishes the domain's budget periodically. The budget is
> > the amount
> > + *   of time the vCPUs of the domain can use every 'period';
> > + *
> > + * - the period is CSCHED2_BDGT_REPL_PERIOD, and is the same for
> > all domains
> > + *   (but each domain has its own timer; so the all are periodic
> > by the same
> > + *   period, but replenishment of the budgets of the various
> > domains, at
> > + *   periods boundaries, are not synchronous);
> > + *
> > + * - when vCPUs run, they consume budget. When they don't run,
> > they don't
> > + *   consume budget. If there is no budget left for the domain, no
> > vCPU of
> > + *   that domain can run. If a vCPU tries to run and finds that
> > there is no
> > + *   budget, it blocks.
> > + *   Budget never expires, so at whatever time a vCPU wants to
> > run, it can
> > + *   check the domain's budget, and if there is some, it can use
> > it.
> > + *
> > + * - budget is replenished to the top of the capacity for the
> > domain once
> > + *   per period. Even if there was some leftover budget from
> > previous period,
> > + *   though, the budget after a replenishment will always be at
> > most equal
> > + *   to the total capacify of the domain ('tot_budget');
> > + *
> 
> budget is replenished but credits not available ?
>
Still not getting this.

> budget is finished but not vcpu has not reached the rate limit
> boundary ?
> 
Budget takes precedence over ratelimiting. This is important to keep
cap working "regularly", rather than in some kind of permanent "trying-
to-keep-up-with-overruns-in-previous-periods" state.

And, ideally, a vcpu cap and ratelimiting should be set in such a way
that they don't step on each other's toes (or do that only rarely). I can
see about trying to print a warning when I detect potentially tricky
values (but it's not easy, considering budget is per-domain, so I can't
be sure about how much each vcpu will actually get, and whether or not
that will turn out to be significantly less than the ratelimit most of
the time).

> > + * - when a budget replenishment occurs, if there are vCPUs that
> > had been
> > + *   blocked because of lack of budget, they'll be unblocked, and
> > they will
> > + *   (potentially) be able to run again.
> > + *
> > + * Finally, some even more implementation related detail:
> > + *
> > + * - budget is stored in a domain-wide pool. vCPUs of the domain
> > that want
> > + *   to run go to such pool, and grub some. When they do so, the
> > amount
> > + *   they grabbed is _immediately_ removed from the pool. This
> > happens in
> > + *   vcpu_try_to_get_budget();
> > + *
> > + * - when vCPUs stop running, if they've not consumed all the
> > budget they
> > + *   took, the leftover is put back in the pool. This happens in
> > + *   vcpu_give_budget_back();
> 
> 200% budget, 4 vcpus to run on 4 pcpus each allowed only 50% of
> budget. 
> This is a static allocation .
>
Err... again, are you telling or asking?

>  for eg. 2 vcpus running on 2 pvpus at 20% 
> budgeted time, if vcpu3 wants to execute some cpu intensive task,
> then 
> it won't be allowed to allowed to use more than 50% of the pcpus.
> 
With what parameters? You mean with these ones you cite here (i.e., a
200% budget)? If the VM has 200%, and vcpu1 and vcpu2 consumes
20%+20%=40%, there's 160% free for vcpu3 and vcpu4.

> I checked the implenation below and I believe we can allow for these 
> type of dynamic budget_quota allocation per vcpu. Not for initial 
> version, but certainly we can consider it for future versions.
>
But... it's already totally dynamic.

> > @@ -408,6 +505,10 @@ struct csched2_vcpu {
> >      unsigned int residual;
> > 
> >      int credit;
> > +
> > +    s_time_t budget;
> 
> it's confusing, please can we have different member names for budget
> in 
> domain and vcpu structure.
>
Mmm.. I don't think it is. It's "how much budget this _thing_ has",
where "thing" can be the domain or a vcpu, and you can tell that by
looking at the containing structure. Most of the time, that's rather
explicit, the former being sdom->budget, the latter being svc->budget.

What different names did you have in mind?

The only alternative that I can come up with would be something like:

struct csched2_dom {
 ...
 dom_budget;
 ...
};
struct csched2_vcpu {
 ...
 vcpu_budget;
 ...
};

Which I don't like (because of the repetition).

> > @@ -1354,7 +1469,16 @@ static void reset_credit(const struct
> > scheduler *ops, int cpu, s_time_t now,
> >           * that the credit it has spent so far get accounted.
> >           */
> >          if ( svc->vcpu == curr_on_cpu(svc_cpu) )
> > +        {
> >              burn_credits(rqd, svc, now);
> > +            /*
> > +             * And, similarly, in case it has run out of budget,
> > as a
> > +             * consequence of this round of accounting, we also
> > must inform
> > +             * its pCPU that it's time to park it, and pick up
> > someone else.
> > +             */
> > +            if ( unlikely(svc->budget <= 0) )
> 
> use of unlikely here is not saving much of cpu cycles.
>
Well, considering that not all domains will have a cap, and that we
don't expect, even for the domains with caps, all their vcpus to
exhaust their budget at every reset event, I think annotating this as
an unlikely event makes sense.

It's not a super big deal, though, and I can get rid of it, if people
don't like/are not convinced about it. :-)

> > @@ -1410,27 +1534,35 @@ void burn_credits(struct
> > csched2_runqueue_data *rqd,
> > 
> >      delta = now - svc->start_time;
> > 
> > -    if ( likely(delta > 0) )
> > -    {
> > -        SCHED_STAT_CRANK(burn_credits_t2c);
> > -        t2c_update(rqd, delta, svc);
> > -        svc->start_time = now;
> > -    }
> > -    else if ( delta < 0 )
> > +    if ( unlikely(delta <= 0) )
> >      {
> > -        d2printk("WARNING: %s: Time went backwards? now
> > %"PRI_stime" start_time %"PRI_stime"\n",
> > -                 __func__, now, svc->start_time);
> > +        if ( unlikely(delta < 0) )
> > +            d2printk("WARNING: %s: Time went backwards? now
> > %"PRI_stime
> > +                     " start_time %"PRI_stime"\n", __func__, now,
> > +                     svc->start_time);
> > +        goto out;
> >      }
> > 
> > +    SCHED_STAT_CRANK(burn_credits_t2c);
> > +    t2c_update(rqd, delta, svc);
> > +
> > +    if ( unlikely(svc->budget != STIME_MAX) )
> 
> not clear, what is this check about ?
>
Ah, good catch. This should have been has_cap()! I'll fix it.

> > @@ -1438,6 +1570,217 @@ void burn_credits(struct
> > 
> > +static bool vcpu_try_to_get_budget(struct csched2_vcpu *svc)
> > +{
> > +    struct csched2_dom *sdom = svc->sdom;
> > +    unsigned int cpu = svc->vcpu->processor;
> > +
> > +    ASSERT(spin_is_locked(per_cpu(schedule_data,
> > cpu).schedule_lock));
> > +
> > +    if ( svc->budget > 0 )
> > +        return true;
> > +
> > +    /* budget_lock nests inside runqueue lock. */
> > +    spin_lock(&sdom->budget_lock);
> > +
> > +    /*
> > +     * Here, svc->budget is <= 0 (as, if it was > 0, we'd have
> > taken the if
> > +     * above!). That basically means the vCPU has overrun a bit --
> > because of
> > +     * various reasons-- and we want to take that into account.
> > With the +=,
> > +     * we are actually subtracting the amount of budget the vCPU
> > has
> > +     * overconsumed, from the total domain budget.
> > +     */
> > +    sdom->budget += svc->budget;
> > +
> > +    if ( sdom->budget > 0 )
> > +    {
> > +        svc->budget = sdom->budget;
> 
> why are you assigning the remaining sdom->budge to only this svc.
> svc 
> should be assigned a proportionate budget.
> Each vcpu is assigned a %age of the domain budget based on the cap
> and 
> number of vcpus.
> There is difference in the code that's here and the code in branch 
> git://xenbits.xen.org/people/dariof/xen.git (fetch) 
> rel/sched/credti2-caps branch. Logic in the branch code looks fine
> where 
> you are taking svc->budget_quota into considration..
>
Yeah... maybe look at patch 3/4. :-P

> > @@ -2423,14 +2785,22 @@ csched2_runtime(const struct scheduler
> > *ops, int cpu,
> >       * credit values of MIN,MAX per vcpu, since each vcpu burns
> > credit
> >       * at a different rate.
> >       */
> > -    if (rt_credit > 0)
> > +    if ( rt_credit > 0 )
> >          time = c2t(rqd, rt_credit, snext);
> >      else
> >          time = 0;
> > 
> > -    /* 3) But never run longer than MAX_TIMER or less than
> > MIN_TIMER or
> > -     * the rate_limit time. */
> > -    if ( time < min_time)
> > +    /*
> > +     * 3) But, if capped, never run more than our budget.
> > +     */
> > +    if ( unlikely(has_cap(snext)) )
> > +        time = snext->budget < time ? snext->budget : time;
> > +
> 
> does the budget takes precedence over rate and credit ?
>
It does take precedence over ratelimiting, yes. There's no precedence
relationship between budget and credits, and, anyway, the code here has
nothing to do with that.

In fact, all this code is saying is that, if this vcpu has 748us of
budget left, I want csched2_schedule() to be called again no farther
than 748us from now.

> please replace snext->budget with something which is less confusing
> eg 
> snext->budget_allocated..
>
How would budget_allocated be less confusing?

> > @@ -2544,11 +2914,13 @@ runq_candidate(struct csched2_runqueue_data
> > *rqd,
> >          }
> > 
> 
> In runq candidate we have a code base
> /*
>   * Return the current vcpu if it has executed for less than
> ratelimit.
>   * Adjuststment for the selected vcpu's credit and decision
>   * for how long it will run will be taken in csched2_runtime.
>   *
>   * Note that, if scurr is yielding, we don't let rate limiting kick
> in.
>   * In fact, it may be the case that scurr is about to spin, and
> there's
>   * no point forcing it to do so until rate limiting expires.
>   */
>   if ( !yield && prv->ratelimit_us && !is_idle_vcpu(scurr->vcpu) &&
>        vcpu_runnable(scurr->vcpu) &&
>       (now - scurr->vcpu->runstate.state_entry_time) <
>         MICROSECS(prv->ratelimit_us) )
> In this codeblock we return scurr. Here there is no check for vcpu-
> >budget.
> Even if the scurr vcpu has executed for less than rate limit and
> scurr 
> is not yielding, we need to check for its budget before returning
> scurr.
> 
But we check vcpu_runnable(scurr). And we've already called, in
csched2_schedule(), vcpu_try_to_get_budget(scurr). And if scurr could
not get any budget, we called park_vcpu(scurr), which sets scurr up in
such a way that vcpu_runnable(scurr) is false.

Thanks and Regards,
Dario
Anshul Makkar June 13, 2017, 4:07 p.m. UTC | #3
On 12/06/2017 14:19, Dario Faggioli wrote:
> Hey,

>>> Budget is burned by the domain's vCPUs, in a similar way to how
>>> credits are.
>>>
>>> When a domain runs out of budget, its vCPUs can't run any longer.
>>
>> if the vcpus of a domain have credit and if budget has run out, will
>> the
>> vcpus won't be scheduled.
>>
> Is this a question? Assuming it is, what do you mean with "domain have
> credit"? Domains always have credits, and they never run out of them.
> There's no such thing as a domain not being runnable because it does
> not have credits.
>
"domain have credit"? The statement was whether the "vcpus of the domain
have credit".

> About budget, a domain with <= 0 budget means all its vcpus are not
> runnable, and hence won't be scheduler, no matter their credits.
You answered the question here.
>
>>> @@ -92,6 +92,82 @@
>>>   */
>>>
>>>  /*
>>> + * Utilization cap:
>>> + *
>>> + * Setting an pCPU utilization cap for a domain means the
>>> following:
>>> + *
>>> + * - a domain can have a cap, expressed in terms of % of physical
>>> + * For implementing this, we use the following approach:
>>> + *
>>> + * - each domain is given a 'budget', an each domain has a timer,
>>> which
>>> + *   replenishes the domain's budget periodically. The budget is
>>> the amount
>>> + *   of time the vCPUs of the domain can use every 'period';
>>> + *
>>> + * - the period is CSCHED2_BDGT_REPL_PERIOD, and is the same for
>>> all domains
>>> + *   (but each domain has its own timer; so the all are periodic
>>> by the same
>>> + *   period, but replenishment of the budgets of the various
>>> domains, at
>>> + *   periods boundaries, are not synchronous);
>>> + *
>>> + * - when vCPUs run, they consume budget. When they don't run,
>>> they don't
>>> + *   consume budget. If there is no budget left for the domain, no
>>> vCPU of
>>> + *   that domain can run. If a vCPU tries to run and finds that
>>> there is no
>>> + *   budget, it blocks.
>>> + *   Budget never expires, so at whatever time a vCPU wants to
>>> run, it can
>>> + *   check the domain's budget, and if there is some, it can use
>>> it.
>>> + *
>>> + * - budget is replenished to the top of the capacity for the
>>> domain once
>>> + *   per period. Even if there was some leftover budget from
>>> previous period,
>>> + *   though, the budget after a replenishment will always be at
>>> most equal
>>> + *   to the total capacify of the domain ('tot_budget');
>>> + *
>>
>> budget is replenished but credits not available ?
>>
> Still not getting this.
What I want to ask is: if the budget of the domain is replenished,
but credit for the vcpus of that domain is not available, then what
happens?
I believe the vcpus won't be scheduled (even if they have budget_quota)
until they get their credit replenished.
>
>> budget is finished but not vcpu has not reached the rate limit
>> boundary ?
>>
> Budget takes precedence over ratelimiting. This is important to keep
> cap working "regularly", rather then in some kind of permanent "trying-
> to-keep-up-with-overruns-in-previous-periods" state.
>
> And, ideally, a vcpu cap and ratelimiting should be set in such a way
> that they don't step on each other toe (or do that only rarely). I can
> see about trying to print a warning when I detect potential tricky
> values (but it's not easy, considering budget is per-domain, so I can't
> be sure about how much each vcpu will actually get, and whether or not
Why can't you be sure? The scheduler knows the domain budget and the
number of vcpus per domain, so we can calculate the budget_quota and
translate it into a cpu slot duration.
Similarly, the value of the rate limit is also known. We can compare and
give a warning to the user if the budget_quota is less than the rate limit.

This is very important for the user to know; if wrongly chosen, it can
adversely affect the system's performance with frequent context
switches (a problem we are aware of).

> that will reveal to be significantly less than ratelimiting the most of
> the times).
>
>>> + * - when a budget replenishment occurs, if there are vCPUs that
>>> had been
>>> + *   blocked because of lack of budget, they'll be unblocked, and
>>> they will
>>> + *   (potentially) be able to run again.
>>> + *
>>> + * Finally, some even more implementation related detail:
>>> + *
>>> + * - budget is stored in a domain-wide pool. vCPUs of the domain
>>> that want
>>> + *   to run go to such pool, and grub some. When they do so, the
>>> amount
>>> + *   they grabbed is _immediately_ removed from the pool. This
>>> happens in
>>> + *   vcpu_try_to_get_budget();
>>> + *
>>> + * - when vCPUs stop running, if they've not consumed all the
>>> budget they
>>> + *   took, the leftover is put back in the pool. This happens in
>>> + *   vcpu_give_budget_back();
>>
>> 200% budget, 4 vcpus to run on 4 pcpus each allowed only 50% of
>> budget.
>> This is a static allocation .
>>
> Err... again, are you telling or asking?
Giving an example to show it's a static allocation.
>
>>  for eg. 2 vcpus running on 2 pvpus at 20%
>> budgeted time, if vcpu3 wants to execute some cpu intensive task,
>> then
>> it won't be allowed to allowed to use more than 50% of the pcpus.
>>
> With what parameters? You mean with these ones you cite here (i.e., a
> 200% budget)? If the VM has 200%, and vcpu1 and vcpu2 consumes
> 20%+20%=40%, there's 160% free for vcpu3 and vcpu4.
>
>> I checked the implenation below and I believe we can allow for these
>> type of dynamic budget_quota allocation per vcpu. Not for initial
>> version, but certainly we can consider it for future versions.
>>
> But... it's already totally dynamic.
csched2_dom_cntl()
{
svc->budget_quota = max(sdom->tot_budget / sdom->nr_vcpus,
                                         CSCHED2_MIN_TIMER);
}
If domain->tot_budget = 200 and
nr_vcpus is 4, then each vcpu gets 50%.
How is this dynamic allocation? We are not considering the utilization
of the other vcpus of the domain before allocating the budget_quota for a vcpu.

Let me know if my understanding is wrong.
>
>>> @@ -408,6 +505,10 @@ struct csched2_vcpu {
>>>      unsigned int residual;
>>>
>>>      int credit;
>>> +
>>> +    s_time_t budget;
>>
>> it's confusing, please can we have different member names for budget
>> in
>> domain and vcpu structure.
>>
> Mmm.. I don't think it is. It's "how much budget this _thing_ have",
> where "thing" can be the domain or a vcpu, and you can tell that by
> looking at the containing structure. Most of the time, that's rather
> explicit, the former being sdom->budget, the latter being svc->budget.
>
> What different names did you have in mind?
>
> The only alternative that I can come up with would be something like:
>
> struct csched2_dom {
>  ...
>  dom_budget;
>  ...
> };
> struct csched2_vcpu {
>  ...
>  vcpu_budget;
>  ...
> };
>
> Which I don't like (because of the repetition).
>
Just shared a thought as I experienced the confusion while I was reading 
the code for the first time. If you don't agree, its fine.
>>> @@ -1354,7 +1469,16 @@ static void reset_credit(const struct
>>> scheduler *ops, int cpu, s_time_t now,
>>>           * that the credit it has spent so far get accounted.
>>>           */
>>>          if ( svc->vcpu == curr_on_cpu(svc_cpu) )
>>> +        {
>>>              burn_credits(rqd, svc, now);
>>> +            /*
>>> +             * And, similarly, in case it has run out of budget,
>>> as a
>>> +             * consequence of this round of accounting, we also
>>> must inform
>>> +             * its pCPU that it's time to park it, and pick up
>>> someone else.
>>> +             */
>>> +            if ( unlikely(svc->budget <= 0) )
>>
>> use of unlikely here is not saving much of cpu cycles.
>>
> Well, considering that not all domains will have a cap, and that we
> don't expect, even for the domains with caps, all their vcpus to
> exhaust their budget at every reset event, I think annotating this as
> an unlikely event makes sense.
 From what I understand, I considered it to be a very likely event.

>
> It's not a super big deal, though, and I can get rid of it, if people
> don't like/are not convinced about it. :-)
yes, its fine, we can leave it for now.

>
>>> @@ -1410,27 +1534,35 @@ void burn_credits(struct
>>> +    sdom->budget += svc->budget;
>>> +
>>> +    if ( sdom->budget > 0 )
>>> +    {
>>> +        svc->budget = sdom->budget;
>>
>> why are you assigning the remaining sdom->budge to only this svc.
>> svc
>> should be assigned a proportionate budget.
>> Each vcpu is assigned a %age of the domain budget based on the cap
>> and
>> number of vcpus.
>> There is difference in the code that's here and the code in branch
>> git://xenbits.xen.org/people/dariof/xen.git (fetch)
>> rel/sched/credti2-caps branch. Logic in the branch code looks fine
>> where
>> you are taking svc->budget_quota into considration..
>>
> Yeah... maybe look at patch 3/4. :-P
Yeah, got it in third patch. :)
>
>> In runq candidate we have a code base
>> /*
>>   * Return the current vcpu if it has executed for less than
>> ratelimit.
>>   * Adjuststment for the selected vcpu's credit and decision
>>   * for how long it will run will be taken in csched2_runtime.
>>   *
>>   * Note that, if scurr is yielding, we don't let rate limiting kick
>> in.
>>   * In fact, it may be the case that scurr is about to spin, and
>> there's
>>   * no point forcing it to do so until rate limiting expires.
>>   */
>>   if ( !yield && prv->ratelimit_us && !is_idle_vcpu(scurr->vcpu) &&
>>        vcpu_runnable(scurr->vcpu) &&
>>       (now - scurr->vcpu->runstate.state_entry_time) <
>>         MICROSECS(prv->ratelimit_us) )
>> In this codeblock we return scurr. Here there is no check for vcpu-
>>> budget.
>> Even if the scurr vcpu has executed for less than rate limit and
>> scurr
>> is not yielding, we need to check for its budget before returning
>> scurr.
>>
> But we check vcpu_runnable(scurr). And we've already called, in
> csched2_schedule(), vcpu_try_to_get_budget(scurr). And if scurr could
> not get any budget, we called park_vcpu(scurr), which sets scurr up in
> such a way that vcpu_runnable(scurr) is false.
Yes, got your point, but then the call to vcpu_try_to_get_budget should
move to the code block in runq_candidate that returns scurr; otherwise
the condition looks incomplete and makes the logic ambiguous.

We call runq_candidate to find the next runnable candidate. If we want
to return scurr as the current runnable candidate, then it should have
gone through all the checks, including budget_quota, and all these checks
should be in one place.
>
> Thanks and Regards,
> Dario
>

Anshul
Dario Faggioli June 13, 2017, 9:13 p.m. UTC | #4
On Tue, 2017-06-13 at 17:07 +0100, Anshul Makkar wrote:
> On 12/06/2017 14:19, Dario Faggioli wrote:
> > > > @@ -92,6 +92,82 @@
> > > >   */
> > > > 
> > > >  /*
> > > > + * Utilization cap:
> > > > + *
> > > > + * Setting an pCPU utilization cap for a domain means the
> > > > following:
> > > > + *
> > > > + * - a domain can have a cap, expressed in terms of % of
> > > > physical
> > > > + * For implementing this, we use the following approach:
> > > > + *
> > > > + * - each domain is given a 'budget', an each domain has a
> > > > timer,
> > > > which
> > > > + *   replenishes the domain's budget periodically. The budget
> > > > is
> > > > the amount
> > > > + *   of time the vCPUs of the domain can use every 'period';
> > > > + *
> > > > + * - the period is CSCHED2_BDGT_REPL_PERIOD, and is the same
> > > > for
> > > > all domains
> > > > + *   (but each domain has its own timer; so the all are
> > > > periodic
> > > > by the same
> > > > + *   period, but replenishment of the budgets of the various
> > > > domains, at
> > > > + *   periods boundaries, are not synchronous);
> > > > + *
> > > > + * - when vCPUs run, they consume budget. When they don't run,
> > > > they don't
> > > > + *   consume budget. If there is no budget left for the
> > > > domain, no
> > > > vCPU of
> > > > + *   that domain can run. If a vCPU tries to run and finds
> > > > that
> > > > there is no
> > > > + *   budget, it blocks.
> > > > + *   Budget never expires, so at whatever time a vCPU wants to
> > > > run, it can
> > > > + *   check the domain's budget, and if there is some, it can
> > > > use
> > > > it.
> > > > + *
> > > > + * - budget is replenished to the top of the capacity for the
> > > > domain once
> > > > + *   per period. Even if there was some leftover budget from
> > > > previous period,
> > > > + *   though, the budget after a replenishment will always be
> > > > at
> > > > most equal
> > > > + *   to the total capacify of the domain ('tot_budget');
> > > > + *
> > > 
> > > budget is replenished but credits not available ?
> > > 
> > 
> > Still not getting this.
> 
> what I want to ask is that if the budget of the domain is
> replenished, 
> but credit for the vcpus of that domain is not available, then what 
> happens.
>
Yes, but the point is that budget can be available or not, while
credits are always available. There's no such thing as credit not being
available at all.

The amount of credits each vcpu has decides which vcpu will run, in the
sense that it will be the one that has the highest amount of credits.
The others will indeed wait, but because they've got less credit than
the one that runs, not because they don't have credits available.

> I believe, vcpus won't be scheduled (even if they have budget_quota) 
> till they get their credit replenished.
>
Credits are not exhausted or replenished.

If you want to know what happens when there are two vcpus, but with
budget, and a different amount of credits (and only 1 pcpu where to run
them), that is: the one with more credits runs.

> > 
> > > budget is finished but not vcpu has not reached the rate limit
> > > boundary ?
> > > 
> > 
> > Budget takes precedence over ratelimiting. This is important to
> > keep
> > cap working "regularly", rather then in some kind of permanent
> > "trying-
> > to-keep-up-with-overruns-in-previous-periods" state.
> > 
> > And, ideally, a vcpu cap and ratelimiting should be set in such a
> > way
> > that they don't step on each other toe (or do that only rarely). I
> > can
> > see about trying to print a warning when I detect potential tricky
> > values (but it's not easy, considering budget is per-domain, so I
> > can't
> > be sure about how much each vcpu will actually get, and whether or
> > not
> 
> why you can't be sure. Scheduler know the domain budget, number of
> vcpus 
> per domain and we can calculate the budget_quota and translate it
> into 
> cpu slot duration.
>
Sure. So, let's say you give a domain 200%, which means 200ms of budget
every 100ms. It has 4 vcpus, which means each vcpu will get 50ms.

At time t, vcpu1 starts running, executes for 10ms, and then stops.
Still at time t, all the other three vcpus (vcpu2, vcpu3 and vcpu4)
starts running; they run for 50ms, which means they exhaust the quota
you assigned to them, but they would like to continue to run.
What do you do?
There's still 40ms worth of budget available, for this period, in the
domain.

If you don't let (any of) them run, and use that budget, then you're
limiting the domain to 160%.

If you do let (maybe some of) them run, then they are using more than
the quota you calculated for each of them, which is fine, from the cap
point of view (and, in fact, it's what happens with this series), but
means that you can't assume to know for sure what quota of budget each
vcpu will actually use, and hence you can't...

> Similarly , the value of rate limit is also known. We can compare
> and 
> give a warning to the user if the budget_quota is less than rate
> limit.
> 
...compare that with the ratelimit value (or at least, you can sort of
guess and try to come up with a sensible warning, but you can't be
sure).

> This is very important for the user to know, if wrongly chosen, it
> can 
> adversely affect the system's performance with frequent context 
> switches. (the problem we are aware of).
> 
I know. I'll think at how to better prevent (or warn if seeing) too
small values, but there's no such thing as crystal balls or magic wands
:-(

> > > I checked the implenation below and I believe we can allow for
> > > these
> > > type of dynamic budget_quota allocation per vcpu. Not for initial
> > > version, but certainly we can consider it for future versions.
> > > 
> > 
> > But... it's already totally dynamic.
> 
> csched2_dom_cntl()
> {
> svc->budget_quota = max(sdom->tot_budget / sdom->nr_vcpus,
>                                          CSCHED2_MIN_TIMER);
> }
> If domain->tot_budge = 200
> nr_cpus is 4, then each cpu gets 50%.
> How this is dynamic allocation ? We are not considering vcpu
> utilization 
> of other vcpus of domain before allocating budget_quota for some
> vcpu.
> 
Right. Well, what this means is that each vcpu will get budget in
chunks of tot_budget/nr_vcpus. But then, how much budget each vcpu will
actually be able to get and consume in each period, it's impossible to
know in advance, as it will depend on overall system load, and the
behavior of the various vcpus of the domain.

> > > In runq candidate we have a code base
> > > /*
> > >   * Return the current vcpu if it has executed for less than
> > > ratelimit.
> > >   * Adjuststment for the selected vcpu's credit and decision
> > >   * for how long it will run will be taken in csched2_runtime.
> > >   *
> > >   * Note that, if scurr is yielding, we don't let rate limiting
> > > kick
> > > in.
> > >   * In fact, it may be the case that scurr is about to spin, and
> > > there's
> > >   * no point forcing it to do so until rate limiting expires.
> > >   */
> > >   if ( !yield && prv->ratelimit_us && !is_idle_vcpu(scurr->vcpu)
> > > &&
> > >        vcpu_runnable(scurr->vcpu) &&
> > >       (now - scurr->vcpu->runstate.state_entry_time) <
> > >         MICROSECS(prv->ratelimit_us) )
> > > In this codeblock we return scurr. Here there is no check for
> > > vcpu-
> > > > budget.
> > > 
> > > Even if the scurr vcpu has executed for less than rate limit and
> > > scurr
> > > is not yielding, we need to check for its budget before returning
> > > scurr.
> > > 
> > 
> > But we check vcpu_runnable(scurr). And we've already called, in
> > csched2_schedule(), vcpu_try_to_get_budget(scurr). And if scurr
> > could
> > not get any budget, we called park_vcpu(scurr), which sets scurr up
> > in
> > such a way that vcpu_runnable(scurr) is false.
> 
> Yes, got your point, but then the call for vcpu_try_to_get_budet
> should 
> move to the code block in runq_candidate that return scurr other
> wise 
> the condition looks incomplete and makes the logic ambiguous.
> 
I don't think so. I've used a new pause flag for parking vcpus
_exactly_ for taking advantage of the fact that vcpu_runnable() will
then do the right thing automatically, and I wouldn't have to spread
budget checks all around the code.

For instance, something similar happens in context_saved(). There it's
the opposite, i.e., if a vcpu had been parked, but a replenishment
arrived, clearing the _VPF_parked flag, then the vcpu_runnable() check
already present in context_save() will do the right thing and add the
vcpu back in the runqueue.

It's a distinctive characteristic of this implementation, as opposed,
for instance, to the Credit1 one, which uses vcpu_pause() and vcpu_unpause()
for the same purpose (which is something I totally dislike), and I
don't see why not take advantage of it.

> We call runq_candidate to find the next runnable candidate. If we
> want 
> to return scurr as the current runnable candidate then it should
> have 
> gone through all the checks including budget_quota and all these
> checks 
> should be at one place.
>
Exactly! And in fact, they all are exactly there, being taken care of
by vcpu_runnable() (in the same exact way as it takes care of checking
whether the vcpu has blocked on some I/O, or has been explicitly
paused, or ...).

Regards,
Dario
Anshul Makkar June 15, 2017, 4:16 p.m. UTC | #5
On 13/06/2017 22:13, Dario Faggioli wrote:
> On Tue, 2017-06-13 at 17:07 +0100, Anshul Makkar wrote:
>> On 12/06/2017 14:19, Dario Faggioli wrote:
>>>>> @@ -92,6 +92,82 @@
>>>>>   */
>>
>> what I want to ask is that if the budget of the domain is
>> replenished,
>> but credit for the vcpus of that domain is not available, then what
>> happens.
>>
> Yes, but the point is that budget can be available or not, while
> credits are always available. There's no such thing as credit not being
> available at all.
>
> The amount of credits each vcpu has decides which vcpu will run, in the
> sense that it will be the one that has the highest amount of credits.
> The others will indeed wait, but because they've got less credit than
> the one that runs, not because they don't have credits available.
>
Ok, as discussed, credits are reset if they reach 0. In that sense 
they're being considered "always available".

>> I believe, vcpus won't be scheduled (even if they have budget_quota)
>> till they get their credit replenished.
>>
> Credits are not exhausted or replenished.
>>>
>>> But... it's already totally dynamic.
>>
>> csched2_dom_cntl()
>> {
>> svc->budget_quota = max(sdom->tot_budget / sdom->nr_vcpus,
>>                                          CSCHED2_MIN_TIMER);
>> }
>> If domain->tot_budge = 200
>> nr_cpus is 4, then each cpu gets 50%.
>> How this is dynamic allocation ? We are not considering vcpu
>> utilization
>> of other vcpus of domain before allocating budget_quota for some
>> vcpu.
>>
> Right. Well, what this means is that each vcpu will get budget in
> chunks of tot_budget/nr_vcpus. But then, how much budget each vcpu will
> actually be able to get and consume in each period, it's impossible to
> know in advance, as it will depend on overall system load, and the
> behavior of the various vcpus of the domain.
Yeah, the current implementation is dynamic in the sense that a vcpu can 
execute in total more than its quota across multiple budget cycles, but 
it's static in the sense that a vcpu can't execute more than its budget 
quota in a single budget cycle.
>
>>>> In runq candidate we have a code base
>
> Regards,
> Dario
>
Reviewed-by: Anshul Makkar<anshul.makkar@citrix.com>
Anshul
George Dunlap June 22, 2017, 4:55 p.m. UTC | #6
On 08/06/17 13:08, Dario Faggioli wrote:
> This commit implements the Xen part of the cap mechanism for
> Credit2.
> 
> A cap is how much, in terms of % of physical CPU time, a domain
> can execute at most.
> 
> For instance, a domain that must not use more than 1/4 of one
> physical CPU, must have a cap of 25%; one that must not use more
> than 1+1/2 of physical CPU time, must be given a cap of 150%.
> 
> Caps are per domain, so it is all a domain's vCPUs, cumulatively,
> that will be forced to execute no more than the decided amount.
> 
> This is implemented by giving each domain a 'budget', and using
> a (per-domain again) periodic timer. Values of budget and 'period'
> are chosen so that budget/period is equal to the cap itself.
> 
> Budget is burned by the domain's vCPUs, in a similar way to how
> credits are.
> 
> When a domain runs out of budget, its vCPUs can't run any longer.
> They can gain, when the budget is replenishment by the timer, which
> event happens once every period.
> 
> Blocking the vCPUs because of lack of budget happens by
> means of a new (_VPF_parked) pause flag, so that, e.g.,
> vcpu_runnable() still works. This is similar to what is
> done in sched_rtds.c, as opposed to what happens in
> sched_credit.c, where vcpu_pause() and vcpu_unpause()
> (which means, among other things, more overhead).
> 
> Note that xenalyze and tools/xentrace/format are also modified,
> to keep them updated with one modified event.
> 
> Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>

Looks really good overall, Dario!  Just a few relatively minor comments.

> diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c
> index 126417c..ba4bf4b 100644
> --- a/xen/common/sched_credit2.c
> +++ b/xen/common/sched_credit2.c
> @@ -92,6 +92,82 @@
>   */
>  
>  /*
> + * Utilization cap:
> + *
> + * Setting an pCPU utilization cap for a domain means the following:
> + *
> + * - a domain can have a cap, expressed in terms of % of physical CPU time.
> + *   A domain that must not use more than 1/4 of _one_ physical CPU, will
> + *   be given a cap of 25%; a domain that must not use more than 1+1/2 of
> + *   physical CPU time, will be given a cap of 150%;
> + *
> + * - caps are per-domain (not per-vCPU). If a domain has only 1 vCPU, and
> + *   a 40% cap, that one vCPU will use 40% of one pCPU. If a somain has 4
> + *   vCPUs, and a 200% cap, all its 4 vCPUs are allowed to run for (the
> + *   equivalent of) 100% time on 2 pCPUs. How much each of the various 4
> + *   vCPUs will get, is unspecified (will depend on various aspects: workload,
> + *   system load, etc.).
> + *
> + * For implementing this, we use the following approach:
> + *
> + * - each domain is given a 'budget', an each domain has a timer, which
> + *   replenishes the domain's budget periodically. The budget is the amount
> + *   of time the vCPUs of the domain can use every 'period';
> + *
> + * - the period is CSCHED2_BDGT_REPL_PERIOD, and is the same for all domains
> + *   (but each domain has its own timer; so the all are periodic by the same
> + *   period, but replenishment of the budgets of the various domains, at
> + *   periods boundaries, are not synchronous);
> + *
> + * - when vCPUs run, they consume budget. When they don't run, they don't
> + *   consume budget. If there is no budget left for the domain, no vCPU of
> + *   that domain can run. If a vCPU tries to run and finds that there is no
> + *   budget, it blocks.
> + *   Budget never expires, so at whatever time a vCPU wants to run, it can
> + *   check the domain's budget, and if there is some, it can use it.

I'm not sure what this paragraph is trying to say. Saying budget "never
expires" makes it sound like you continue to accumulate it, such that if
you don't run at all for several periods, you could "save it up" and run
at 100% for one full period.

But that's contradicted by...

> + * - budget is replenished to the top of the capacity for the domain once
> + *   per period. Even if there was some leftover budget from previous period,
> + *   though, the budget after a replenishment will always be at most equal
> + *   to the total capacify of the domain ('tot_budget');

...this paragraph.

> + * - when a budget replenishment occurs, if there are vCPUs that had been
> + *   blocked because of lack of budget, they'll be unblocked, and they will
> + *   (potentially) be able to run again.
> + *
> + * Finally, some even more implementation related detail:
> + *
> + * - budget is stored in a domain-wide pool. vCPUs of the domain that want
> + *   to run go to such pool, and grub some. When they do so, the amount
> + *   they grabbed is _immediately_ removed from the pool. This happens in
> + *   vcpu_try_to_get_budget();

This sounds like a good solution to the "greedy vcpu" problem. :-)

> + * - when vCPUs stop running, if they've not consumed all the budget they
> + *   took, the leftover is put back in the pool. This happens in
> + *   vcpu_give_budget_back();
> + *
> + * - the above means that a vCPU can find out that there is no budget and
> + *   block, not only if the cap has actually been reached (for this period),
> + *   but also if some other vCPUs, in order to run, have grabbed a certain
> + *   quota of budget, no matter whether they've already used it all or not.
> + *   A vCPU blocking because (any form of) lack of budget is said to be
> + *   "parked", and such blocking happens in park_vcpu();
> + *
> + * - when a vCPU stops running, and puts back some budget in the domain pool,
> + *   we need to check whether there is someone which has been parked and that
> + *   can be unparked. This happens in unpark_parked_vcpus(), called from
> + *   csched2_context_saved();
> + *
> + * - of course, unparking happens also as a consequene of the domain's budget

*consequence

> @@ -293,6 +382,14 @@ static int __read_mostly opt_underload_balance_tolerance = 0;
>  integer_param("credit2_balance_under", opt_underload_balance_tolerance);
>  static int __read_mostly opt_overload_balance_tolerance = -3;
>  integer_param("credit2_balance_over", opt_overload_balance_tolerance);
> +/*
> + * Domains subject to a cap, receive a replenishment of their runtime budget

Unnecessary comma.


> @@ -1438,6 +1570,217 @@ void burn_credits(struct csched2_runqueue_data *rqd,
>      }
>  }
>  
> +/*
> + * Budget-related code.
> + */
> +
> +static void park_vcpu(struct csched2_vcpu *svc)
> +{
> +    struct vcpu *v = svc->vcpu;
> +
> +    ASSERT(spin_is_locked(&svc->sdom->budget_lock));
> +
> +    /*
> +     * It was impossible to find budget for this vCPU, so it has to be
> +     * "parked". This implies it is not runnable, so we mark it as such in
> +     * its pause_flags. If the vCPU is currently scheduled (which means we
> +     * are here after being called from within csched_schedule()), flagging
> +     * is enough, as we'll choose someone else, and then context_saved()
> +     * will take care of updating the load properly.
> +     *
> +     * If, OTOH, the vCPU is sitting in the runqueue (which means we are here
> +     * after being called from within runq_candidate()), we must go all the
> +     * way down to taking it out of there, and updating the load accordingly.
> +     *
> +     * In both cases, we also add it to the list of parked vCPUs of the domain.
> +     */
> +    __set_bit(_VPF_parked, &v->pause_flags);
> +    if ( vcpu_on_runq(svc) )
> +    {
> +        runq_remove(svc);
> +        update_load(svc->sdom->dom->cpupool->sched, svc->rqd, svc, -1, NOW());
> +    }
> +    list_add(&svc->parked_elem, &svc->sdom->parked_vcpus);
> +}
> +
> +static bool vcpu_try_to_get_budget(struct csched2_vcpu *svc)

This name is OK, but it's a bit long.  What about "vcpu_grab_budget()"?

> +{
> +    struct csched2_dom *sdom = svc->sdom;
> +    unsigned int cpu = svc->vcpu->processor;
> +
> +    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));
> +
> +    if ( svc->budget > 0 )
> +        return true;
> +
> +    /* budget_lock nests inside runqueue lock. */
> +    spin_lock(&sdom->budget_lock);
> +
> +    /*
> +     * Here, svc->budget is <= 0 (as, if it was > 0, we'd have taken the if
> +     * above!). That basically means the vCPU has overrun a bit --because of
> +     * various reasons-- and we want to take that into account. With the +=,
> +     * we are actually subtracting the amount of budget the vCPU has
> +     * overconsumed, from the total domain budget.
> +     */
> +    sdom->budget += svc->budget;
> +
> +    if ( sdom->budget > 0 )
> +    {
> +        svc->budget = sdom->budget;
> +        sdom->budget = 0;

Just a minor comment here -- I didn't see anything in this description,
the cover letter, or this patch indicating that you were planning on
changing this in a follow-up patch.  A heads-up not to worry about
apparently allowing only one vcpu to run at a time might have been nice. :-)

> +    }
> +    else
> +    {
> +        svc->budget = 0;
> +        park_vcpu(svc);
> +    }
> +
> +    spin_unlock(&sdom->budget_lock);
> +
> +    return svc->budget > 0;
> +}
> +
> +static void
> +vcpu_give_budget_back(struct csched2_vcpu *svc, struct list_head *parked)

"vcpu_return_budget"?

> +{
> +    struct csched2_dom *sdom = svc->sdom;
> +    unsigned int cpu = svc->vcpu->processor;
> +
> +    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));
> +    ASSERT(list_empty(parked));
> +
> +    /* budget_lock nests inside runqueue lock. */
> +    spin_lock(&sdom->budget_lock);
> +
> +    /*
> +     * The vCPU is stopping running (e.g., because it's blocking, or it has
> +     * been preempted). If it hasn't consumed all the budget it got when,
> +     * starting to run, put that remaining amount back in the domain's budget
> +     * pool.
> +     */
> +    sdom->budget += svc->budget;
> +    svc->budget = 0;
> +
> +    /*
> +     * Making budget available again to the domain means that parked vCPUs
> +     * may be unparked and run. They are, if any, in the domain's parked_vcpus
> +     * list, so we want to go through that and unpark them (so they can try
> +     * to get some budget).
> +     *
> +     * Touching the list requires the budget_lock, which we hold. Let's
> +     * therefore put everyone in that list in another, temporary list, which
> +     * then the caller will traverse, unparking the vCPUs it finds there.
> +     *
> +     * In fact, we can't do the actual unparking here, because that requires
> +     * taking the runqueue lock of the vCPUs being unparked, and we can't
> +     * take any runqueue locks while we hold a budget_lock.
> +     */
> +    if ( sdom->budget > 0 )
> +        list_splice_init(&sdom->parked_vcpus, parked);
> +
> +    spin_unlock(&sdom->budget_lock);
> +}
> +
> +static void
> +unpark_parked_vcpus(const struct scheduler *ops, struct list_head *vcpus)
> +{
> +    struct csched2_vcpu *svc, *tmp;
> +    spinlock_t *lock;
> +
> +    list_for_each_entry_safe(svc, tmp, vcpus, parked_elem)
> +    {
> +        unsigned long flags;
> +        s_time_t now;
> +
> +        lock = vcpu_schedule_lock_irqsave(svc->vcpu, &flags);
> +
> +        __clear_bit(_VPF_parked, &svc->vcpu->pause_flags);
> +        if ( unlikely(svc->flags & CSFLAG_scheduled) )
> +        {
> +            /*
> +             * We end here if a budget replenishment arrived between
> +             * csched2_schedule() (and, in particular, after a call to
> +             * vcpu_try_to_get_budget() that returned false), and
> +             * context_saved(). By setting __CSFLAG_delayed_runq_add,
> +             * we tell context_saved() to put the vCPU back in the
> +             * runqueue, from where it will compete with the others
> +             * for the newly replenished budget.
> +             */
> +            ASSERT( svc->rqd != NULL );
> +            ASSERT( c2rqd(ops, svc->vcpu->processor) == svc->rqd );
> +            __set_bit(__CSFLAG_delayed_runq_add, &svc->flags);
> +        }
> +        else if ( vcpu_runnable(svc->vcpu) )
> +        {
> +            /*
> +             * The vCPU should go back to the runqueue, and compete for
> +             * the newly replenished budget, but only if it is actually
> +             * runnable (and was therefore offline only because of the
> +             * lack of budget).
> +             */
> +            now = NOW();
> +            update_load(ops, svc->rqd, svc, 1, now);
> +            runq_insert(ops, svc);
> +            runq_tickle(ops, svc, now);
> +        }
> +        list_del_init(&svc->parked_elem);
> +
> +        vcpu_schedule_unlock_irqrestore(lock, flags, svc->vcpu);
> +    }
> +}
> +
> +static void repl_sdom_budget(void* data)

Why not just "replenish_budget"?

> +{
> +    struct csched2_dom *sdom = data;
> +    unsigned long flags;
> +    s_time_t now;
> +    LIST_HEAD(parked);
> +
> +    spin_lock_irqsave(&sdom->budget_lock, flags);
> +
> +    /*
> +     * It is possible that the domain overrun, and that the budget hence went
> +     * below 0 (reasons may be system overbooking, issues in or too coarse
> +     * runtime accounting, etc.). In particular, if we overrun by more than
> +     * tot_budget, then budget+tot_budget would still be < 0, which in turn
> +     * means that, despite replenishment, there's still no budget for unarking
> +     * and running vCPUs.
> +     *
> +     * It is also possible that we are handling the replenishment much later
> +     * than expected (reasons may again be overbooking, or issues with timers).
> +     * If we are more than CSCHED2_BDGT_REPL_PERIOD late, this means we have
> +     * basically skipped (at least) one replenishment.
> +     *
> +     * We deal with both the issues here, by, basically, doing more than just
> +     * one replenishment. Note, however, that every time we add tot_budget
> +     * to the budget, we also move next_repl away by CSCHED2_BDGT_REPL_PERIOD.
> +     * This guarantees we always respect the cap.
> +     */
> +    now = NOW();
> +    do
> +    {
> +        sdom->next_repl += CSCHED2_BDGT_REPL_PERIOD;
> +        sdom->budget += sdom->tot_budget;
> +    }
> +    while ( sdom->next_repl <= now || sdom->budget <= 0 );

The first clause ("oops, accidentally missed a replenishment period") I
agree with; but I'm going back and forth a bit on the second one.  It
means essentially that the scheduler made a mistake and allowed the VM
to run for one full budget *more* than its allocated time (perhaps
accumulated over several periods).

On the one hand, it's the scheduler that made a mistake, so we shouldn't
"punish" a domain by giving it a full period with no budget.

On the other hand, if there's a reliable way to trick the scheduler into
allowing a guest to over-run its budget, this allows a rogue guest to
essentially work around the cap.

Know what I mean?

> @@ -2423,14 +2785,22 @@ csched2_runtime(const struct scheduler *ops, int cpu,
>       * credit values of MIN,MAX per vcpu, since each vcpu burns credit
>       * at a different rate.
>       */
> -    if (rt_credit > 0)
> +    if ( rt_credit > 0 )
>          time = c2t(rqd, rt_credit, snext);
>      else
>          time = 0;
>  
> -    /* 3) But never run longer than MAX_TIMER or less than MIN_TIMER or
> -     * the rate_limit time. */
> -    if ( time < min_time)
> +    /*
> +     * 3) But, if capped, never run more than our budget.
> +     */
> +    if ( unlikely(has_cap(snext)) )
> +        time = snext->budget < time ? snext->budget : time;
> +
> +    /*
> +     * 4) But never run longer than MAX_TIMER or less than MIN_TIMER or

Two "buts" in a row doesn't work.  I'd change this to "and", which sort
of follows on from the last "but".

Thanks,
 -George
Dario Faggioli June 23, 2017, 4:19 p.m. UTC | #7
On Thu, 2017-06-22 at 17:55 +0100, George Dunlap wrote:
> On 08/06/17 13:08, Dario Faggioli wrote:
> > This commit implements the Xen part of the cap mechanism for
> > Credit2.
> > 
> > A cap is how much, in terms of % of physical CPU time, a domain
> > can execute at most.
> > 
> > For instance, a domain that must not use more than 1/4 of one
> > physical CPU, must have a cap of 25%; one that must not use more
> > than 1+1/2 of physical CPU time, must be given a cap of 150%.
> > 
> > Caps are per domain, so it is all a domain's vCPUs, cumulatively,
> > that will be forced to execute no more than the decided amount.
> > 
> > This is implemented by giving each domain a 'budget', and using
> > a (per-domain again) periodic timer. Values of budget and 'period'
> > are chosen so that budget/period is equal to the cap itself.
> > 
> > Budget is burned by the domain's vCPUs, in a similar way to how
> > credits are.
> > 
> > When a domain runs out of budget, its vCPUs can't run any longer.
> > They can gain, when the budget is replenishment by the timer, which
> > event happens once every period.
> > 
> > Blocking the vCPUs because of lack of budget happens by
> > means of a new (_VPF_parked) pause flag, so that, e.g.,
> > vcpu_runnable() still works. This is similar to what is
> > done in sched_rtds.c, as opposed to what happens in
> > sched_credit.c, where vcpu_pause() and vcpu_unpause()
> > (which means, among other things, more overhead).
> > 
> > Note that xenalyze and tools/xentrace/format are also modified,
> > to keep them updated with one modified event.
> > 
> > Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
> 
> Looks really good overall, Dario!  Just a few relatively minor
> comments.
> 
Cool! Thanks for having a look so quickly. :-)

> > --- a/xen/common/sched_credit2.c
> > +++ b/xen/common/sched_credit2.c
> > @@ -92,6 +92,82 @@
> > + *   Budget never expires, so at whatever time a vCPU wants to
> > run, it can
> > + *   check the domain's budget, and if there is some, it can use
> > it.
> 
> I'm not sure what this paragraph is trying to say. Saying budget
> "never
> expires" makes it sound like you continue to accumulate it, such that
> if
> you don't run at all for several periods, you could "save it up" and
> run
> at 100% for one full period.
> 
Yes, I see what you mean, and I agree that it's unclear. The point is
that there exist algorithms where the budget of a replenishment has an
expiry time, before the next replenishment, i.e., it has to be used
right away after the replenishment, or (usually), shortly after, or it
will be thrown away, and we'll have to wait next replenishment.

But again, yes, saying "never expires", here, and in the way I did it,
and without any reference and comparison to those other algorithms,
makes people think what you just said.

I'll rephrase (or cut the paragraph entirely, it's probably not super
informative/useful anyway). :-)

> > + * - when a budget replenishment occurs, if there are vCPUs that
> > had been
> > + *   blocked because of lack of budget, they'll be unblocked, and
> > they will
> > + *   (potentially) be able to run again.
> > + *
> > + * Finally, some even more implementation related detail:
> > + *
> > + * - budget is stored in a domain-wide pool. vCPUs of the domain
> > that want
> > + *   to run go to such pool, and grub some. When they do so, the
> > amount
> > + *   they grabbed is _immediately_ removed from the pool. This
> > happens in
> > + *   vcpu_try_to_get_budget();
> 
> This sounds like a good solution to the "greedy vcpu" problem. :-)
> 
I like it too. It works because it's coupled with the fact that
runqueue is ordered by credits. E.g., doing the same in Credit1, would
probably still not work, because of the round-robin queues (within same
priority), and because of the boosting on wakeup.

> > @@ -1438,6 +1570,217 @@ void burn_credits(struct
> > +static bool vcpu_try_to_get_budget(struct csched2_vcpu *svc)
> 
> This name is OK, but it's a bit long.  What about
> "vcpu_grab_budget()"?
> 
Ok (and the other renaming suggestions too).

> > +{
> > +    struct csched2_dom *sdom = svc->sdom;
> > +    unsigned int cpu = svc->vcpu->processor;
> > +
> > +    ASSERT(spin_is_locked(per_cpu(schedule_data,
> > cpu).schedule_lock));
> > +
> > +    if ( svc->budget > 0 )
> > +        return true;
> > +
> > +    /* budget_lock nests inside runqueue lock. */
> > +    spin_lock(&sdom->budget_lock);
> > +
> > +    /*
> > +     * Here, svc->budget is <= 0 (as, if it was > 0, we'd have
> > taken the if
> > +     * above!). That basically means the vCPU has overrun a bit --
> > because of
> > +     * various reasons-- and we want to take that into account.
> > With the +=,
> > +     * we are actually subtracting the amount of budget the vCPU
> > has
> > +     * overconsumed, from the total domain budget.
> > +     */
> > +    sdom->budget += svc->budget;
> > +
> > +    if ( sdom->budget > 0 )
> > +    {
> > +        svc->budget = sdom->budget;
> > +        sdom->budget = 0;
> 
> Just a minor comment here -- I didn't see anything in this
> description,
> the cover letter, or this patch indicating that you were planning on
> changing this in a follow-up patch.  A heads-up not to worry about
> apparently allowing only one vcpu to run at a time might have been
> nice. :-)
> 
Right. I'll explain that in the cover, and in a comment here (which
will then go away).

> > +{
> > +    struct csched2_dom *sdom = data;
> > +    unsigned long flags;
> > +    s_time_t now;
> > +    LIST_HEAD(parked);
> > +
> > +    spin_lock_irqsave(&sdom->budget_lock, flags);
> > +
> > +    /*
> > +     * It is possible that the domain overrun, and that the budget
> > hence went
> > +     * below 0 (reasons may be system overbooking, issues in or
> > too coarse
> > +     * runtime accounting, etc.). In particular, if we overrun by
> > more than
> > +     * tot_budget, then budget+tot_budget would still be < 0,
> > which in turn
> > +     * means that, despite replenishment, there's still no budget
> > for unarking
> > +     * and running vCPUs.
> > +     *
> > +     * It is also possible that we are handling the replenishment
> > much later
> > +     * than expected (reasons may again be overbooking, or issues
> > with timers).
> > +     * If we are more than CSCHED2_BDGT_REPL_PERIOD late, this
> > means we have
> > +     * basically skipped (at least) one replenishment.
> > +     *
> > +     * We deal with both the issues here, by, basically, doing
> > more than just
> > +     * one replenishment. Note, however, that every time we add
> > tot_budget
> > +     * to the budget, we also move next_repl away by
> > CSCHED2_BDGT_REPL_PERIOD.
> > +     * This guarantees we always respect the cap.
> > +     */
> > +    now = NOW();
> > +    do
> > +    {
> > +        sdom->next_repl += CSCHED2_BDGT_REPL_PERIOD;
> > +        sdom->budget += sdom->tot_budget;
> > +    }
> > +    while ( sdom->next_repl <= now || sdom->budget <= 0 );
> 
> The first clause ("oops, accidentally missed a replenishment period")
> I
> agree with; 
>
Ok.

> but I'm going back and forth a bit on the second one.  It
> means essentially that the scheduler made a mistake and allowed the
> VM
> to run for one full budget *more* than its allocated time (perhaps
> accumulated over several periods).
> 
No, the budget does not accumulate. Or at least, it does, but only up
to the original tot_budget.

So, basically, the reason why budget may still be <0, after a
replenishment of tot_budget, is that something went wrong, and we let
the vcpu overrun for more than tot_budget.

It really should never happen (I may actually add a WARN()), unless the
accounting is very coarse, or the budget is really small (i.e., the
budget is small compared to the resolution we can achieve for the
accounting).

> On the one hand, it's the scheduler that made a mistake, so we
> shouldn't
> "punish" a domain by giving it a full period with no budget.
> 
Yes, I think I understand what you mean. However, I would not
necessarily call this "punishing" the domain. We're just making sure
that cap is enforced, even during (hopefully sporadic and only
transient) tricky circumstances where the scheduler got something
wrong, and for those domains that have (perhaps not maliciously, but
still) already taken advantage of such mistake.

In fact, assume you have a domain that wants to execute W amount of
work every T time, but has a cap that results in it having a budget of
C<<W every T. Under normal circumstances, it executes for C between 0
and T, for C between T and 2T, for C between 2T and 3T, etc., until it
reaches W. So, after 3T, it will have executed for 3C.

In presence of an accounting/enforcing error, it may happen that it
executes for C between 0 and T, for 2C between T and 2T, for 0 between
2T and 3T, etc. So, after 3T, it will also have executed for 3C, as
above.

What I'm trying to say is that, yes, we're making it skip a period, but
that's only because we gave it more before, and it actually used this
additional capacity, we granted by mistake. So, it's not really a
punishment, it's... well.. recognising that we gave it too big of a
reward. And if it had spent that already, we don't go blame it, we just
skip the next one, to make things even.

> On the other hand, if there's a reliable way to trick the scheduler
> into
> allowing a guest to over-run its budget, this allows a rogue guest to
> essentially work around the cap.
> 
Exactly. There really should not be a way of triggering this behavior
but, if someone finds it, this is a safety/robustness measure for
always having cap enforced.

And remember that this only applies if the vcpu has actually overrun.
As in, if the scheduler gets confused and come to do the budget
enforcement late, during the T-2T period, but the vcpu has only
executed for C, it (the vcpu) does not have to pay any price for the
scheduler's mistake (that, indeed, would be unfair).

> > @@ -2423,14 +2785,22 @@ csched2_runtime(const struct scheduler
> > *ops, int cpu,
> >       * credit values of MIN,MAX per vcpu, since each vcpu burns
> > credit
> >       * at a different rate.
> >       */
> > -    if (rt_credit > 0)
> > +    if ( rt_credit > 0 )
> >          time = c2t(rqd, rt_credit, snext);
> >      else
> >          time = 0;
> >  
> > -    /* 3) But never run longer than MAX_TIMER or less than
> > MIN_TIMER or
> > -     * the rate_limit time. */
> > -    if ( time < min_time)
> > +    /*
> > +     * 3) But, if capped, never run more than our budget.
> > +     */
> > +    if ( unlikely(has_cap(snext)) )
> > +        time = snext->budget < time ? snext->budget : time;
> > +
> > +    /*
> > +     * 4) But never run longer than MAX_TIMER or less than
> > MIN_TIMER or
> 
> Two "buts" in a row doesn't work.  I'd change this to "and", which
> sort
> of follows on from the last "but".
> 
Ok, will do.

Thanks again and Regards,
Dario
George Dunlap June 28, 2017, 2:28 p.m. UTC | #8
On Fri, Jun 23, 2017 at 5:19 PM, Dario Faggioli
<dario.faggioli@citrix.com> wrote:
>> > +{
>> > +    struct csched2_dom *sdom = data;
>> > +    unsigned long flags;
>> > +    s_time_t now;
>> > +    LIST_HEAD(parked);
>> > +
>> > +    spin_lock_irqsave(&sdom->budget_lock, flags);
>> > +
>> > +    /*
>> > +     * It is possible that the domain overrun, and that the budget
>> > hence went
>> > +     * below 0 (reasons may be system overbooking, issues in or
>> > too coarse
>> > +     * runtime accounting, etc.). In particular, if we overrun by
>> > more than
>> > +     * tot_budget, then budget+tot_budget would still be < 0,
>> > which in turn
>> > +     * means that, despite replenishment, there's still no budget
>> > for unparking
>> > +     * and running vCPUs.
>> > +     *
>> > +     * It is also possible that we are handling the replenishment
>> > much later
>> > +     * than expected (reasons may again be overbooking, or issues
>> > with timers).
>> > +     * If we are more than CSCHED2_BDGT_REPL_PERIOD late, this
>> > means we have
>> > +     * basically skipped (at least) one replenishment.
>> > +     *
>> > +     * We deal with both the issues here, by, basically, doing
>> > more than just
>> > +     * one replenishment. Note, however, that every time we add
>> > tot_budget
>> > +     * to the budget, we also move next_repl away by
>> > CSCHED2_BDGT_REPL_PERIOD.
>> > +     * This guarantees we always respect the cap.
>> > +     */
>> > +    now = NOW();
>> > +    do
>> > +    {
>> > +        sdom->next_repl += CSCHED2_BDGT_REPL_PERIOD;
>> > +        sdom->budget += sdom->tot_budget;
>> > +    }
>> > +    while ( sdom->next_repl <= now || sdom->budget <= 0 );
>>
>> The first clause ("oops, accidentally missed a replenishment period")
>> I
>> agree with;
>>
> Ok.
>
>> but I'm going back and forth a bit on the second one.  It
>> means essentially that the scheduler made a mistake and allowed the
>> VM
>> to run for one full budget *more* than its allocated time (perhaps
>> accumulated over several periods).
>>
> No, the budget does not accumulate. Or at least, it does, but only up
> to the original tot_budget.
>
> So, basically, the reason why budget may still be <0, after a
> replenishment of tot_budget, is that something went wrong, and we let
> the vcpu overrun for more than tot_budget.
>
> It really should never happen (I may actually add a WARN()), unless the
> accounting is very coarse, or the budget is really small (i.e., the
> budget is small compared to the resolution we can achieve for the
> accounting).
>
>> On the one hand, it's the scheduler that made a mistake, so we
>> shouldn't
>> "punish" a domain by giving it a full period with no budget.
>>
> Yes, I think I understand what you mean. However, I would not
> necessarily call this "punishing" the domain. We're just making sure
> that cap is enforced, even during (hopefully sporadic and only
> transient) tricky circumstances where the scheduler got something
> wrong, and for those domains that have (perhaps not maliciously, but
> still) already taken advantage of such mistake.
>
> In fact, assume you have a domain that wants to execute W amount of
> work every T time, but has a cap that results in it having a budget of
> C<<W every T. Under normal circumstances, it executes for C between 0
> and T, for C between T and 2T, for C between 2T and 3T, etc., until it
> reaches W. So, after 3T, it will have executed for 3C.
>
> In presence of an accounting/enforcing error, it may happen that it
> executes for C between 0 and T, for 2C between T and 2T, for 0 between
> 2T and 3T, etc. So, after 3T, it will also have executed for 3C, as
> above.

Right, but is that what the loop actually does?

It looks like now if it executes for 2C between T and 2T, then when
the replenishment happens at 2T, the budget will be -C.  So the first
round of the loop will bring the budget to 0; since budget <= 0, then
it will add *another* C to the budget.  So then it will be allowed to
execute for C again between 2T and 3T, giving it a total of 4C
executed over 3T, in violation of the cap.

Am I reading that wrong?

I thought you had intentionally decided to allow that to happen, to
avoid making the capped domain have to sit out for an entire budget
period.

 -George
Dario Faggioli June 28, 2017, 2:56 p.m. UTC | #9
On Wed, 2017-06-28 at 15:28 +0100, George Dunlap wrote:
> On Fri, Jun 23, 2017 at 5:19 PM, Dario Faggioli
> <dario.faggioli@citrix.com> wrote:
> > > > +{
> > > > +    struct csched2_dom *sdom = data;
> > > > +    unsigned long flags;
> > > > +    s_time_t now;
> > > > +    LIST_HEAD(parked);
> > > > +
> > > > +    spin_lock_irqsave(&sdom->budget_lock, flags);
> > > > +
> > > > +    /*
> > > > +     * It is possible that the domain overrun, and that the
> > > > budget
> > > > hence went
> > > > +     * below 0 (reasons may be system overbooking, issues in
> > > > or
> > > > too coarse
> > > > +     * runtime accounting, etc.). In particular, if we overrun
> > > > by
> > > > more than
> > > > +     * tot_budget, then budget+tot_budget would still be < 0,
> > > > which in turn
> > > > +     * means that, despite replenishment, there's still no
> > > > budget
> > > > for unparking
> > > > +     * and running vCPUs.
> > > > +     *
> > > > +     * It is also possible that we are handling the
> > > > replenishment
> > > > much later
> > > > +     * than expected (reasons may again be overbooking, or
> > > > issues
> > > > with timers).
> > > > +     * If we are more than CSCHED2_BDGT_REPL_PERIOD late, this
> > > > means we have
> > > > +     * basically skipped (at least) one replenishment.
> > > > +     *
> > > > +     * We deal with both the issues here, by, basically, doing
> > > > more than just
> > > > +     * one replenishment. Note, however, that every time we
> > > > add
> > > > tot_budget
> > > > +     * to the budget, we also move next_repl away by
> > > > CSCHED2_BDGT_REPL_PERIOD.
> > > > +     * This guarantees we always respect the cap.
> > > > +     */
> > > > +    now = NOW();
> > > > +    do
> > > > +    {
> > > > +        sdom->next_repl += CSCHED2_BDGT_REPL_PERIOD;
> > > > +        sdom->budget += sdom->tot_budget;
> > > > +    }
> > > > +    while ( sdom->next_repl <= now || sdom->budget <= 0 );
>
> > In presence of an accounting/enforcing error, it may happen that it
> > executes for C between 0 and T, for 2C between T and 2T, for 0
> > between
> > 2T and 3T, etc. So, after 3T, it will also have executed for 3C, as
> > above.
> 
> Right, but is that what the loop actually does?
> 
It should be. Well, actually, it does not really do that, but it does
something that I think is equivalent.

I probably should have described it better... Sorry!

> It looks like now if it executes for 2C between T and 2T, then when
> the replenishment happens at 2T, the budget will be -C.  So the first
> round of the loop will bring the budget to 0; since budget <= 0, then
> it will add *another* C to the budget.  So then it will be allowed to
> execute for C again between 2T and 3T, giving it a total of 4C
> executed over 3T, in violation of the cap.
> 
> Am I reading that wrong?
> 
So, the loop body indeed adds C, but it also moves the next
replenishment time ahead by T.

In the case you describe, at 2T, with budget -C, the first round of the
loop will make the budget 0, and set the next replenishment to 3T. As
you say, since budget is 0, and 0 is <= than 0, we stay in the loop for
another round, which sets the budget to C, and the next replenishment
to 4T.

So, in summary, the domain have executed for 3C between 0 and 2T, and
will be given only another C, until 4T, summing up to a total of 4C/4T,
which should be fine.

Or at least, that's what I had in mind... if the code does actually do
something different, then it's a bug. ;-P

> I thought you had intentionally decided to allow that to happen, to
> avoid making the capped domain have to sit out for an entire budget
> period.
> 
Ah, so I completely misunderstood your point, when replying. I thought
*you* were arguing in favour of avoiding having the capped domain sit
out for an entire budget period! :-D

Thanks and Regards,
Dario
George Dunlap June 28, 2017, 7:05 p.m. UTC | #10
On Wed, Jun 28, 2017 at 3:56 PM, Dario Faggioli
<dario.faggioli@citrix.com> wrote:
> On Wed, 2017-06-28 at 15:28 +0100, George Dunlap wrote:
>> On Fri, Jun 23, 2017 at 5:19 PM, Dario Faggioli
>> <dario.faggioli@citrix.com> wrote:
>> > > > +{
>> > > > +    struct csched2_dom *sdom = data;
>> > > > +    unsigned long flags;
>> > > > +    s_time_t now;
>> > > > +    LIST_HEAD(parked);
>> > > > +
>> > > > +    spin_lock_irqsave(&sdom->budget_lock, flags);
>> > > > +
>> > > > +    /*
>> > > > +     * It is possible that the domain overrun, and that the
>> > > > budget
>> > > > hence went
>> > > > +     * below 0 (reasons may be system overbooking, issues in
>> > > > or
>> > > > too coarse
>> > > > +     * runtime accounting, etc.). In particular, if we overrun
>> > > > by
>> > > > more than
>> > > > +     * tot_budget, then budget+tot_budget would still be < 0,
>> > > > which in turn
>> > > > +     * means that, despite replenishment, there's still no
>> > > > budget
>> > > > for unarking
>> > > > +     * and running vCPUs.
>> > > > +     *
>> > > > +     * It is also possible that we are handling the
>> > > > replenishment
>> > > > much later
>> > > > +     * than expected (reasons may again be overbooking, or
>> > > > issues
>> > > > with timers).
>> > > > +     * If we are more than CSCHED2_BDGT_REPL_PERIOD late, this
>> > > > means we have
>> > > > +     * basically skipped (at least) one replenishment.
>> > > > +     *
>> > > > +     * We deal with both the issues here, by, basically, doing
>> > > > more than just
>> > > > +     * one replenishment. Note, however, that every time we
>> > > > add
>> > > > tot_budget
>> > > > +     * to the budget, we also move next_repl away by
>> > > > CSCHED2_BDGT_REPL_PERIOD.
>> > > > +     * This guarantees we always respect the cap.
>> > > > +     */
>> > > > +    now = NOW();
>> > > > +    do
>> > > > +    {
>> > > > +        sdom->next_repl += CSCHED2_BDGT_REPL_PERIOD;
>> > > > +        sdom->budget += sdom->tot_budget;
>> > > > +    }
>> > > > +    while ( sdom->next_repl <= now || sdom->budget <= 0 );
>>
>> > In presence of an accounting/enforcing error, it may happen that it
>> > executes for C between 0 and T, for 2C between T and 2T, for 0
>> > between
>> > 2T and 3T, etc. So, after 3T, it will also have executed for 3C, as
>> > above.
>>
>> Right, but is that what the loop actually does?
>>
> It should be. Well, actually, it does not really do that, but it does
> something that I think is equivalent.
>
> I probably should have described it better... Sorry!
>
>> It looks like now if it executes for 2C between T and 2T, then when
>> the replenishment happens at 2T, the budget will be -C.  So the first
>> round of the loop will bring the budget to 0; since budget <= 0, then
>> it will add *another* C to the budget.  So then it will be allowed to
>> execute for C again between 2T and 3T, giving it a total of 4C
>> executed over 3T, in violation of the cap.
>>
>> Am I reading that wrong?
>>
> So, the loop body indeed adds C, but it also moves the next
> replenishment time ahead by T.
>
> In the case you describe, at 2T, with budget -C, the first round of the
> loop will make the budget 0, and set the next replenishment to 3T. As
> you say, since budget is 0, and 0 is <= than 0, we stay in the loop for
> another round, which sets the budget to C, and the next replenishment
> to 4T.

Ah, right -- I did notice that next_repl was being moved forward each
iteration too, but didn't connect that it would mean you'd end up
going for 2T without calling repl_sdom_budget() again.

So I'm convinced that this behavior won't break the credit system, but
I'm not sure why it's an advantage over just having the domain "sit
out" this time period.

Did you actually see this happen in practice on a regular basis during
your tests, or is this mostly a hypothetical situation we're talking
about here?

>> I thought you had intentionally decided to allow that to happen, to
>> avoid making the capped domain have to sit out for an entire budget
>> period.
>>
> Ah, so I completely misunderstood your point, when replying. I thought
> *you* were arguing in favour of avoiding having the capped domain sit
> out for an entire budget period! :-D

Right, I think that's a natural interpretation given the difference in
understanding. :-)

Peace,
 -George
Dario Faggioli June 29, 2017, 10:09 a.m. UTC | #11
On Wed, 2017-06-28 at 20:05 +0100, George Dunlap wrote:
> On Wed, Jun 28, 2017 at 3:56 PM, Dario Faggioli
> <dario.faggioli@citrix.com> wrote:
> > 
> > In the case you describe, at 2T, with budget -C, the first round of
> > the
> > loop will make the budget 0, and set the next replenishment to 3T.
> > As
> > you say, since budget is 0, and 0 is <= than 0, we stay in the loop
> > for
> > another round, which sets the budget to C, and the next
> > replenishment
> > to 4T.
> 
> Ah, right -- I did notice that next_repl was being moved forward each
> iteration too, but didn't connect that it would mean you'd end up
> going for 2T without calling repl_sdom_budget() again.
> 
> So I'm convinced that this behavior won't break the credit system,
> but
> I'm not sure why it's an advantage over just having the domain "sit
> out" this time period.
> 
I think they're just two different possible implementations, and it's
hard to tell, in general, which one is better and which one is worse.

Depending on very specific characteristics of the workload, in
combination with what actually caused the specific occurrence of such a
big overrun, and maybe even other factors, either one may behave better
or worse.

E.g., if the vcpu is "greedy", and always tries to run, as soon as it
has some budget, the difference between the two solution is _where_ we
put the "sitting period".

> Did you actually see this happen in practice on a regular basis
> during
> your tests, or is this mostly a hypothetical situation we're talking
> about here?
> 
It's a safety catch. It really should not happen except from bugs or
unless something is really wrong (e.g., in HW). It may happen more
regularly if the user manages to set a budget that is really small,
compared to the actual resolution of the budget enforcing logic (which
is whatever time granularity we use for timers, in that specific host).

There are measures already in place to try to prevent that from happening,
but I probably can add some more... including a WARN() in this very
case.

Regards,
Dario
George Dunlap July 25, 2017, 2:34 p.m. UTC | #12
On 06/29/2017 11:09 AM, Dario Faggioli wrote:
> On Wed, 2017-06-28 at 20:05 +0100, George Dunlap wrote:
>> On Wed, Jun 28, 2017 at 3:56 PM, Dario Faggioli
>> <dario.faggioli@citrix.com> wrote:
>>>
>>> In the case you describe, at 2T, with budget -C, the first round of
>>> the
>>> loop will make the budget 0, and set the next replenishment to 3T.
>>> As
>>> you say, since budget is 0, and 0 is <= than 0, we stay in the loop
>>> for
>>> another round, which sets the budget to C, and the next
>>> replenishment
>>> to 4T.
>>
>> Ah, right -- I did notice that next_repl was being moved forward each
>> iteration too, but didn't connect that it would mean you'd end up
>> going for 2T without calling repl_sdom_budget() again.
>>
>> So I'm convinced that this behavior won't break the credit system,
>> but
>> I'm not sure why it's an advantage over just having the domain "sit
>> out" this time period.
>>
> I think they're just two different possible implementations, and it's
> hard to tell, in general, which one is better and which one is worse
> 
> Depending on very specific characteristics of the workload, in
> combination with what actually caused the specific occurrence of such a
> big overrun, and maybe even other factors, either one may behave better
> or worse.
> 
> E.g., if the vcpu is "greedy", and always tries to run, as soon as it
> has some budget, the difference between the two solution is _where_ we
> put the "sitting period".

Sorry, dropped this thread to prepare for XenSummit.

Well I think there's a principle a bit like Ockham's Razor: In general,
if there's not an obvious advantage between two implementations, the
simpler / easier to understand implementation is better.

My sense is also that in general, as long as context switching overhead
doesn't become an issue, allowing a guest to run for shorter bursts more
often is better than allowing it to run for longer bursts less often.  A
cpu-bound task will perform about as well in both cases, but a
latency-sensitive task will perform much worse if it's allowed to burn
up its timeslice and forced to wait for a big chunk of time.

On the other hand, my intuitions about how to improve AFL fuzzing these
last few weeks have turned out to be consistently wrong, so at the
moment I'm not inclined to be overly confident in my intuition without
some testing. :-)

I guess in summary: my *advice* would continue to be to not loop until
there's credit, but to allow it to "sit out" a budget period sooner
rather than later.  But if you continue to be convinced that the loop is
better, I'll give my Ack.

 -George
George Dunlap July 25, 2017, 3:08 p.m. UTC | #13
On 06/08/2017 01:08 PM, Dario Faggioli wrote:
> This commit implements the Xen part of the cap mechanism for
> Credit2.
> 
> A cap is how much, in terms of % of physical CPU time, a domain
> can execute at most.
> 
> For instance, a domain that must not use more than 1/4 of one
> physical CPU, must have a cap of 25%; one that must not use more
> than 1+1/2 of physical CPU time, must be given a cap of 150%.
> 
> Caps are per domain, so it is all a domain's vCPUs, cumulatively,
> that will be forced to execute no more than the decided amount.
> 
> This is implemented by giving each domain a 'budget', and using
> a (per-domain again) periodic timer. Values of budget and 'period'
> are chosen so that budget/period is equal to the cap itself.
> 
> Budget is burned by the domain's vCPUs, in a similar way to how
> credits are.
> 
> When a domain runs out of budget, its vCPUs can't run any longer.
> They can run again, when the budget is replenished by the timer, which
> event happens once every period.
> 
> Blocking the vCPUs because of lack of budget happens by
> means of a new (_VPF_parked) pause flag, so that, e.g.,
> vcpu_runnable() still works. This is similar to what is
> done in sched_rtds.c, as opposed to what happens in
> sched_credit.c, where vcpu_pause() and vcpu_unpause()
> (which means, among other things, more overhead).
> 
> Note that xenalyze and tools/xentrace/format are also modified,
> to keep them updated with one modified event.
> 
> Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
> ---
> Cc: George Dunlap <george.dunlap@eu.citrix.com>
> Cc: Anshul Makkar <anshul.makkar@citrix.com>
> Cc: Andrew Cooper <andrew.cooper3@citrix.com>
> Cc: Jan Beulich <jbeulich@suse.com>
> Cc: Ian Jackson <ian.jackson@eu.citrix.com>
> Cc: Wei Liu <wei.liu2@citrix.com>
> ---
>  tools/xentrace/formats     |    2 
>  tools/xentrace/xenalyze.c  |   10 +
>  xen/common/sched_credit2.c |  470 +++++++++++++++++++++++++++++++++++++++++---
>  xen/include/xen/sched.h    |    3 
>  4 files changed, 445 insertions(+), 40 deletions(-)
> 
> diff --git a/tools/xentrace/formats b/tools/xentrace/formats
> index 8b31780..142b0cf 100644
> --- a/tools/xentrace/formats
> +++ b/tools/xentrace/formats
> @@ -51,7 +51,7 @@
>  
>  0x00022201  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  csched2:tick
>  0x00022202  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  csched2:runq_pos       [ dom:vcpu = 0x%(1)08x, pos = %(2)d]
> -0x00022203  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  csched2:credit burn    [ dom:vcpu = 0x%(1)08x, credit = %(2)d, delta = %(3)d ]
> +0x00022203  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  csched2:credit burn    [ dom:vcpu = 0x%(1)08x, credit = %(2)d, budget = %(3)d, delta = %(4)d ]
>  0x00022204  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  csched2:credit_add
>  0x00022205  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  csched2:tickle_check   [ dom:vcpu = 0x%(1)08x, credit = %(2)d ]
>  0x00022206  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  csched2:tickle         [ cpu = %(1)d ]
> diff --git a/tools/xentrace/xenalyze.c b/tools/xentrace/xenalyze.c
> index fa608ad..c16c02d 100644
> --- a/tools/xentrace/xenalyze.c
> +++ b/tools/xentrace/xenalyze.c
> @@ -7680,12 +7680,14 @@ void sched_process(struct pcpu_info *p)
>              if(opt.dump_all) {
>                  struct {
>                      unsigned int vcpuid:16, domid:16;
> -                    int credit, delta;
> +                    int credit, budget, delta;
>                  } *r = (typeof(r))ri->d;
>  
> -                printf(" %s csched2:burn_credits d%uv%u, credit = %d, delta = %d\n",
> -                       ri->dump_header, r->domid, r->vcpuid,
> -                       r->credit, r->delta);
> +                printf(" %s csched2:burn_credits d%uv%u, credit = %d, ",
> +                       ri->dump_header, r->domid, r->vcpuid, r->credit);
> +                if ( r->budget != INT_MIN )
> +                    printf("budget = %d, ", r->budget);
> +                printf("delta = %d\n", r->delta);
>              }
>              break;
>          case TRC_SCHED_CLASS_EVT(CSCHED2, 5): /* TICKLE_CHECK      */
> diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c
> index 126417c..ba4bf4b 100644
> --- a/xen/common/sched_credit2.c
> +++ b/xen/common/sched_credit2.c
> @@ -92,6 +92,82 @@
>   */
>  
>  /*
> + * Utilization cap:
> + *
> > + * Setting a pCPU utilization cap for a domain means the following:
> + *
> + * - a domain can have a cap, expressed in terms of % of physical CPU time.
> + *   A domain that must not use more than 1/4 of _one_ physical CPU, will
> + *   be given a cap of 25%; a domain that must not use more than 1+1/2 of
> + *   physical CPU time, will be given a cap of 150%;
> + *
> + * - caps are per-domain (not per-vCPU). If a domain has only 1 vCPU, and
> > + *   a 40% cap, that one vCPU will use 40% of one pCPU. If a domain has 4
> + *   vCPUs, and a 200% cap, all its 4 vCPUs are allowed to run for (the
> + *   equivalent of) 100% time on 2 pCPUs. How much each of the various 4
> + *   vCPUs will get, is unspecified (will depend on various aspects: workload,
> + *   system load, etc.).
> + *
> + * For implementing this, we use the following approach:
> + *
> + * - each domain is given a 'budget', an each domain has a timer, which
> + *   replenishes the domain's budget periodically. The budget is the amount
> + *   of time the vCPUs of the domain can use every 'period';
> + *
> + * - the period is CSCHED2_BDGT_REPL_PERIOD, and is the same for all domains
> > + *   (but each domain has its own timer; so they all are periodic by the same
> + *   period, but replenishment of the budgets of the various domains, at
> + *   periods boundaries, are not synchronous);
> + *
> + * - when vCPUs run, they consume budget. When they don't run, they don't
> + *   consume budget. If there is no budget left for the domain, no vCPU of
> + *   that domain can run. If a vCPU tries to run and finds that there is no
> + *   budget, it blocks.
> + *   Budget never expires, so at whatever time a vCPU wants to run, it can
> + *   check the domain's budget, and if there is some, it can use it.
> + *
> + * - budget is replenished to the top of the capacity for the domain once
> + *   per period. Even if there was some leftover budget from previous period,
> + *   though, the budget after a replenishment will always be at most equal
> > + *   to the total capacity of the domain ('tot_budget');
> + *
> + * - when a budget replenishment occurs, if there are vCPUs that had been
> + *   blocked because of lack of budget, they'll be unblocked, and they will
> + *   (potentially) be able to run again.
> + *
> + * Finally, some even more implementation related detail:
> + *
> + * - budget is stored in a domain-wide pool. vCPUs of the domain that want
> > + *   to run go to such pool, and grab some. When they do so, the amount
> + *   they grabbed is _immediately_ removed from the pool. This happens in
> + *   vcpu_try_to_get_budget();
> + *
> + * - when vCPUs stop running, if they've not consumed all the budget they
> + *   took, the leftover is put back in the pool. This happens in
> + *   vcpu_give_budget_back();
> + *
> + * - the above means that a vCPU can find out that there is no budget and
> + *   block, not only if the cap has actually been reached (for this period),
> + *   but also if some other vCPUs, in order to run, have grabbed a certain
> + *   quota of budget, no matter whether they've already used it all or not.
> + *   A vCPU blocking because (any form of) lack of budget is said to be
> + *   "parked", and such blocking happens in park_vcpu();
> + *
> + * - when a vCPU stops running, and puts back some budget in the domain pool,
> + *   we need to check whether there is someone which has been parked and that
> + *   can be unparked. This happens in unpark_parked_vcpus(), called from
> + *   csched2_context_saved();
> + *
> > + * - of course, unparking happens also as a consequence of the domain's budget
> + *   being replenished by the periodic timer. This also occurs by means of
> + *   calling csched2_context_saved() (but from repl_sdom_budget());
> + *
> + * - parked vCPUs of a domain are kept in a (per-domain) list, called
> + *   'parked_vcpus'). Manipulation of the list and of the domain-wide budget
> + *   pool, must occur only when holding the 'budget_lock'.
> + */
> +
> +/*
>   * Locking:
>   *
>   * - runqueue lock
> @@ -112,18 +188,29 @@
>   *     runqueue each cpu is;
>   *  + serializes the operation of changing the weights of domains;
>   *
> + * - Budget lock
> + *  + it is per-domain;
> + *  + protects, in domains that have an utilization cap;
> + *   * manipulation of the total budget of the domain (as it is shared
> + *     among all vCPUs of the domain),
> + *   * manipulation of the list of vCPUs that are blocked waiting for
> + *     some budget to be available.
> + *
>   * - Type:
>   *  + runqueue locks are 'regular' spinlocks;
>   *  + the private scheduler lock can be an rwlock. In fact, data
>   *    it protects is modified only during initialization, cpupool
>   *    manipulation and when changing weights, and read in all
> - *    other cases (e.g., during load balancing).
> + *    other cases (e.g., during load balancing);
> + *  + budget locks are 'regular' spinlocks.
>   *
>   * Ordering:
>   *  + tylock must be used when wanting to take a runqueue lock,
>   *    if we already hold another one;
>   *  + if taking both a runqueue lock and the private scheduler
> - *    lock is, the latter must always be taken for first.
> + *    lock is, the latter must always be taken for first;
> + *  + if taking both a runqueue lock and a budget lock, the former
> + *    must always be taken for first.
>   */
>  
>  /*
> @@ -164,6 +251,8 @@
>  #define CSCHED2_CREDIT_RESET         0
>  /* Max timer: Maximum time a guest can be run for. */
>  #define CSCHED2_MAX_TIMER            CSCHED2_CREDIT_INIT
> +/* Period of the cap replenishment timer. */
> +#define CSCHED2_BDGT_REPL_PERIOD     ((opt_cap_period)*MILLISECS(1))
>  
>  /*
>   * Flags
> @@ -293,6 +382,14 @@ static int __read_mostly opt_underload_balance_tolerance = 0;
>  integer_param("credit2_balance_under", opt_underload_balance_tolerance);
>  static int __read_mostly opt_overload_balance_tolerance = -3;
>  integer_param("credit2_balance_over", opt_overload_balance_tolerance);
> +/*
> + * Domains subject to a cap, receive a replenishment of their runtime budget
> + * once every opt_cap_period interval. Default is 10 ms. The amount of budget
> + * they receive depends on their cap. For instance, a domain with a 50% cap
> + * will receive 50% of 10 ms, so 5 ms.
> + */
> +static unsigned int __read_mostly opt_cap_period = 10;    /* ms */
> +integer_param("credit2_cap_period_ms", opt_cap_period);
>  
>  /*
>   * Runqueue organization.
> @@ -408,6 +505,10 @@ struct csched2_vcpu {
>      unsigned int residual;
>  
>      int credit;
> +
> +    s_time_t budget;
> +    struct list_head parked_elem;      /* On the parked_vcpus list   */
> +
>      s_time_t start_time; /* When we were scheduled (used for credit) */
>      unsigned flags;      /* 16 bits doesn't seem to play well with clear_bit() */
>      int tickled_cpu;     /* cpu tickled for picking us up (-1 if none) */
> @@ -425,7 +526,15 @@ struct csched2_vcpu {
>  struct csched2_dom {
>      struct list_head sdom_elem;
>      struct domain *dom;
> +
> +    spinlock_t budget_lock;
> +    struct timer repl_timer;
> +    s_time_t next_repl;
> +    s_time_t budget, tot_budget;
> +    struct list_head parked_vcpus;
> +
>      uint16_t weight;
> +    uint16_t cap;

Hmm, this needs to be rebased on the structure layout patches I checked
in last week. :-)

 -George
Dario Faggioli July 25, 2017, 4:05 p.m. UTC | #14
On Tue, 2017-07-25 at 16:08 +0100, George Dunlap wrote:
> On 06/08/2017 01:08 PM, Dario Faggioli wrote:
> > 
> Hmm, this needs to be rebased on the structure layout patches I
> checked
> in last week. :-)
> 
Indeed it does. And of course, I'm up for it. :-)

The soft-affinity series is likely to generate less conflicts... It's
probably easier if we merge that first (after thaking care of patches 3
and 6, of course), isn't it?

Regards,
Dario
Dario Faggioli July 25, 2017, 5:29 p.m. UTC | #15
On Tue, 2017-07-25 at 15:34 +0100, George Dunlap wrote:
> On 06/29/2017 11:09 AM, Dario Faggioli wrote:
> > E.g., if the vcpu is "greedy", and always tries to run, as soon as
> > it
> > has some budget, the difference between the two solution is _where_
> > we
> > put the "sitting period".
> 
> Sorry, dropped this thread to prepare for XenSummit.
> 
NP.

> Well I think there's a principle a bit like Ockham's Razor: In
> general,
> if there's not an obvious advantage between two implementations, the
> simpler / easier to understand implementation is better.
> 
> My sense is also that in general, as long as context switching
> overhead
> doesn't become an issue, allowing a guest to run for shorter bursts
> more
> often is better than allowing it to run for longer bursts less
> often.  A
> cpu-bound task will perform about as well in both cases, but a
> latency-sensitive task will perform much worse if it's allowed to
> burn
> up its timeslice and forced to wait for a big chunk of time.
> 
That's actually a fair point.

> On the other hand, my intuitions about how to improve AFL fuzzing
> these
> last few weeks have turned out to be consistently wrong, so at the
> moment I'm not inclined to be overly confident in my intuition
> without
> some testing. :-)
> 
Well, this is a similar situation to the one in the other thread. It's
an edge case that, likely, will never or only very rarely occur. So, we
don't need to think too much about it, and we well even afford to
follow your intuition! :-D

Mmm... Maybe this came out a bit less of a "cheering you up", as I
wanted it to be, but hey, I tried! :-P :-P :-P

No, jokes apart, I like your argument about the effect this may have on
latency sensitive workload, if they happen to suffer from one of this
overrun, so I will change the code the way you suggest. :-)

Thanks and Regards,
Dario
diff mbox

Patch

diff --git a/tools/xentrace/formats b/tools/xentrace/formats
index 8b31780..142b0cf 100644
--- a/tools/xentrace/formats
+++ b/tools/xentrace/formats
@@ -51,7 +51,7 @@ 
 
 0x00022201  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  csched2:tick
 0x00022202  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  csched2:runq_pos       [ dom:vcpu = 0x%(1)08x, pos = %(2)d]
-0x00022203  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  csched2:credit burn    [ dom:vcpu = 0x%(1)08x, credit = %(2)d, delta = %(3)d ]
+0x00022203  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  csched2:credit burn    [ dom:vcpu = 0x%(1)08x, credit = %(2)d, budget = %(3)d, delta = %(4)d ]
 0x00022204  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  csched2:credit_add
 0x00022205  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  csched2:tickle_check   [ dom:vcpu = 0x%(1)08x, credit = %(2)d ]
 0x00022206  CPU%(cpu)d  %(tsc)d (+%(reltsc)8d)  csched2:tickle         [ cpu = %(1)d ]
diff --git a/tools/xentrace/xenalyze.c b/tools/xentrace/xenalyze.c
index fa608ad..c16c02d 100644
--- a/tools/xentrace/xenalyze.c
+++ b/tools/xentrace/xenalyze.c
@@ -7680,12 +7680,14 @@  void sched_process(struct pcpu_info *p)
             if(opt.dump_all) {
                 struct {
                     unsigned int vcpuid:16, domid:16;
-                    int credit, delta;
+                    int credit, budget, delta;
                 } *r = (typeof(r))ri->d;
 
-                printf(" %s csched2:burn_credits d%uv%u, credit = %d, delta = %d\n",
-                       ri->dump_header, r->domid, r->vcpuid,
-                       r->credit, r->delta);
+                printf(" %s csched2:burn_credits d%uv%u, credit = %d, ",
+                       ri->dump_header, r->domid, r->vcpuid, r->credit);
+                if ( r->budget != INT_MIN )
+                    printf("budget = %d, ", r->budget);
+                printf("delta = %d\n", r->delta);
             }
             break;
         case TRC_SCHED_CLASS_EVT(CSCHED2, 5): /* TICKLE_CHECK      */
diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c
index 126417c..ba4bf4b 100644
--- a/xen/common/sched_credit2.c
+++ b/xen/common/sched_credit2.c
@@ -92,6 +92,82 @@ 
  */
 
 /*
+ * Utilization cap:
+ *
+ * Setting a pCPU utilization cap for a domain means the following:
+ *
+ * - a domain can have a cap, expressed in terms of % of physical CPU time.
+ *   A domain that must not use more than 1/4 of _one_ physical CPU, will
+ *   be given a cap of 25%; a domain that must not use more than 1+1/2 of
+ *   physical CPU time, will be given a cap of 150%;
+ *
+ * - caps are per-domain (not per-vCPU). If a domain has only 1 vCPU, and
+ *   a 40% cap, that one vCPU will use 40% of one pCPU. If a domain has 4
+ *   vCPUs, and a 200% cap, all its 4 vCPUs are allowed to run for (the
+ *   equivalent of) 100% time on 2 pCPUs. How much each of the various 4
+ *   vCPUs will get, is unspecified (will depend on various aspects: workload,
+ *   system load, etc.).
+ *
+ * For implementing this, we use the following approach:
+ *
+ * - each domain is given a 'budget', and each domain has a timer, which
+ *   replenishes the domain's budget periodically. The budget is the amount
+ *   of time the vCPUs of the domain can use every 'period';
+ *
+ * - the period is CSCHED2_BDGT_REPL_PERIOD, and is the same for all domains
+ *   (but each domain has its own timer; so the all are periodic by the same
+ *   period, but replenishment of the budgets of the various domains, at
+ *   periods boundaries, are not synchronous);
+ *
+ * - when vCPUs run, they consume budget. When they don't run, they don't
+ *   consume budget. If there is no budget left for the domain, no vCPU of
+ *   that domain can run. If a vCPU tries to run and finds that there is no
+ *   budget, it blocks.
+ *   Budget never expires, so at whatever time a vCPU wants to run, it can
+ *   check the domain's budget, and if there is some, it can use it.
+ *
+ * - budget is replenished to the top of the capacity for the domain once
+ *   per period. Even if there was some leftover budget from previous period,
+ *   though, the budget after a replenishment will always be at most equal
+ *   to the total capacity of the domain ('tot_budget');
+ *
+ * - when a budget replenishment occurs, if there are vCPUs that had been
+ *   blocked because of lack of budget, they'll be unblocked, and they will
+ *   (potentially) be able to run again.
+ *
+ * Finally, some even more implementation related detail:
+ *
+ * - budget is stored in a domain-wide pool. vCPUs of the domain that want
+ *   to run go to such pool, and grab some. When they do so, the amount
+ *   they grabbed is _immediately_ removed from the pool. This happens in
+ *   vcpu_try_to_get_budget();
+ *
+ * - when vCPUs stop running, if they've not consumed all the budget they
+ *   took, the leftover is put back in the pool. This happens in
+ *   vcpu_give_budget_back();
+ *
+ * - the above means that a vCPU can find out that there is no budget and
+ *   block, not only if the cap has actually been reached (for this period),
+ *   but also if some other vCPUs, in order to run, have grabbed a certain
+ *   quota of budget, no matter whether they've already used it all or not.
+ *   A vCPU blocking because (any form of) lack of budget is said to be
+ *   "parked", and such blocking happens in park_vcpu();
+ *
+ * - when a vCPU stops running, and puts back some budget in the domain pool,
+ *   we need to check whether there is someone which has been parked and that
+ *   can be unparked. This happens in unpark_parked_vcpus(), called from
+ *   csched2_context_saved();
+ *
+ * - of course, unparking happens also as a consequence of the domain's budget
+ *   being replenished by the periodic timer. This also occurs by means of
+ *   calling csched2_context_saved() (but from repl_sdom_budget());
+ *
+ * - parked vCPUs of a domain are kept in a (per-domain) list, called
+ *   'parked_vcpus'). Manipulation of the list and of the domain-wide budget
+ *   pool, must occur only when holding the 'budget_lock'.
+ */
+
+/*
  * Locking:
  *
  * - runqueue lock
@@ -112,18 +188,29 @@ 
  *     runqueue each cpu is;
  *  + serializes the operation of changing the weights of domains;
  *
+ * - Budget lock
+ *  + it is per-domain;
+ *  + protects, in domains that have a utilization cap;
+ *   * manipulation of the total budget of the domain (as it is shared
+ *     among all vCPUs of the domain),
+ *   * manipulation of the list of vCPUs that are blocked waiting for
+ *     some budget to be available.
+ *
  * - Type:
  *  + runqueue locks are 'regular' spinlocks;
  *  + the private scheduler lock can be an rwlock. In fact, data
  *    it protects is modified only during initialization, cpupool
  *    manipulation and when changing weights, and read in all
- *    other cases (e.g., during load balancing).
+ *    other cases (e.g., during load balancing);
+ *  + budget locks are 'regular' spinlocks.
  *
  * Ordering:
  *  + tylock must be used when wanting to take a runqueue lock,
  *    if we already hold another one;
  *  + if taking both a runqueue lock and the private scheduler
- *    lock is, the latter must always be taken for first.
+ *    lock is, the latter must always be taken for first;
+ *  + if taking both a runqueue lock and a budget lock, the former
+ *    must always be taken for first.
  */
 
 /*
@@ -164,6 +251,8 @@ 
 #define CSCHED2_CREDIT_RESET         0
 /* Max timer: Maximum time a guest can be run for. */
 #define CSCHED2_MAX_TIMER            CSCHED2_CREDIT_INIT
+/* Period of the cap replenishment timer. */
+#define CSCHED2_BDGT_REPL_PERIOD     ((opt_cap_period)*MILLISECS(1))
 
 /*
  * Flags
@@ -293,6 +382,14 @@  static int __read_mostly opt_underload_balance_tolerance = 0;
 integer_param("credit2_balance_under", opt_underload_balance_tolerance);
 static int __read_mostly opt_overload_balance_tolerance = -3;
 integer_param("credit2_balance_over", opt_overload_balance_tolerance);
+/*
+ * Domains subject to a cap, receive a replenishment of their runtime budget
+ * once every opt_cap_period interval. Default is 10 ms. The amount of budget
+ * they receive depends on their cap. For instance, a domain with a 50% cap
+ * will receive 50% of 10 ms, so 5 ms.
+ */
+static unsigned int __read_mostly opt_cap_period = 10;    /* ms */
+integer_param("credit2_cap_period_ms", opt_cap_period);
 
 /*
  * Runqueue organization.
@@ -408,6 +505,10 @@  struct csched2_vcpu {
     unsigned int residual;
 
     int credit;
+
+    s_time_t budget;
+    struct list_head parked_elem;      /* On the parked_vcpus list   */
+
     s_time_t start_time; /* When we were scheduled (used for credit) */
     unsigned flags;      /* 16 bits doesn't seem to play well with clear_bit() */
     int tickled_cpu;     /* cpu tickled for picking us up (-1 if none) */
@@ -425,7 +526,15 @@  struct csched2_vcpu {
 struct csched2_dom {
     struct list_head sdom_elem;
     struct domain *dom;
+
+    spinlock_t budget_lock;
+    struct timer repl_timer;
+    s_time_t next_repl;
+    s_time_t budget, tot_budget;
+    struct list_head parked_vcpus;
+
     uint16_t weight;
+    uint16_t cap;
     uint16_t nr_vcpus;
 };
 
@@ -460,6 +569,12 @@  static inline struct csched2_runqueue_data *c2rqd(const struct scheduler *ops,
     return &csched2_priv(ops)->rqd[c2r(ops, cpu)];
 }
 
+/* Does the domain of this vCPU have a cap? */
+static inline bool has_cap(const struct csched2_vcpu *svc)
+{
+    return svc->budget != STIME_MAX;
+}
+
 /*
  * Hyperthreading (SMT) support.
  *
@@ -1354,7 +1469,16 @@  static void reset_credit(const struct scheduler *ops, int cpu, s_time_t now,
          * that the credit it has spent so far get accounted.
          */
         if ( svc->vcpu == curr_on_cpu(svc_cpu) )
+        {
             burn_credits(rqd, svc, now);
+            /*
+             * And, similarly, in case it has run out of budget, as a
+             * consequence of this round of accounting, we also must inform
+             * its pCPU that it's time to park it, and pick up someone else.
+             */
+            if ( unlikely(svc->budget <= 0) )
+                tickle_cpu(svc_cpu, rqd);
+        }
 
         start_credit = svc->credit;
 
@@ -1410,27 +1534,35 @@  void burn_credits(struct csched2_runqueue_data *rqd,
 
     delta = now - svc->start_time;
 
-    if ( likely(delta > 0) )
-    {
-        SCHED_STAT_CRANK(burn_credits_t2c);
-        t2c_update(rqd, delta, svc);
-        svc->start_time = now;
-    }
-    else if ( delta < 0 )
+    if ( unlikely(delta <= 0) )
     {
-        d2printk("WARNING: %s: Time went backwards? now %"PRI_stime" start_time %"PRI_stime"\n",
-                 __func__, now, svc->start_time);
+        if ( unlikely(delta < 0) )
+            d2printk("WARNING: %s: Time went backwards? now %"PRI_stime
+                     " start_time %"PRI_stime"\n", __func__, now,
+                     svc->start_time);
+        goto out;
     }
 
+    SCHED_STAT_CRANK(burn_credits_t2c);
+    t2c_update(rqd, delta, svc);
+
+    if ( unlikely(svc->budget != STIME_MAX) )
+        svc->budget -= delta;
+
+    svc->start_time = now;
+
+ out:
     if ( unlikely(tb_init_done) )
     {
         struct {
             unsigned vcpu:16, dom:16;
-            int credit, delta;
+            int credit, budget;
+            int delta;
         } d;
         d.dom = svc->vcpu->domain->domain_id;
         d.vcpu = svc->vcpu->vcpu_id;
         d.credit = svc->credit;
+        d.budget = has_cap(svc) ?  svc->budget : INT_MIN;
         d.delta = delta;
         __trace_var(TRC_CSCHED2_CREDIT_BURN, 1,
                     sizeof(d),
@@ -1438,6 +1570,217 @@  void burn_credits(struct csched2_runqueue_data *rqd,
     }
 }
 
+/*
+ * Budget-related code.
+ */
+
+static void park_vcpu(struct csched2_vcpu *svc)
+{
+    struct vcpu *v = svc->vcpu;
+
+    ASSERT(spin_is_locked(&svc->sdom->budget_lock));
+
+    /*
+     * It was impossible to find budget for this vCPU, so it has to be
+     * "parked". This implies it is not runnable, so we mark it as such in
+     * its pause_flags. If the vCPU is currently scheduled (which means we
+     * are here after being called from within csched_schedule()), flagging
+     * is enough, as we'll choose someone else, and then context_saved()
+     * will take care of updating the load properly.
+     *
+     * If, OTOH, the vCPU is sitting in the runqueue (which means we are here
+     * after being called from within runq_candidate()), we must go all the
+     * way down to taking it out of there, and updating the load accordingly.
+     *
+     * In both cases, we also add it to the list of parked vCPUs of the domain.
+     */
+    __set_bit(_VPF_parked, &v->pause_flags);
+    if ( vcpu_on_runq(svc) )
+    {
+        runq_remove(svc);
+        update_load(svc->sdom->dom->cpupool->sched, svc->rqd, svc, -1, NOW());
+    }
+    list_add(&svc->parked_elem, &svc->sdom->parked_vcpus);
+}
+
+static bool vcpu_try_to_get_budget(struct csched2_vcpu *svc)
+{
+    struct csched2_dom *sdom = svc->sdom;
+    unsigned int cpu = svc->vcpu->processor;
+
+    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));
+
+    if ( svc->budget > 0 )
+        return true;
+
+    /* budget_lock nests inside runqueue lock. */
+    spin_lock(&sdom->budget_lock);
+
+    /*
+     * Here, svc->budget is <= 0 (as, if it was > 0, we'd have taken the if
+     * above!). That basically means the vCPU has overrun a bit --because of
+     * various reasons-- and we want to take that into account. With the +=,
+     * we are actually subtracting the amount of budget the vCPU has
+     * overconsumed, from the total domain budget.
+     */
+    sdom->budget += svc->budget;
+
+    if ( sdom->budget > 0 )
+    {
+        svc->budget = sdom->budget;
+        sdom->budget = 0;
+    }
+    else
+    {
+        svc->budget = 0;
+        park_vcpu(svc);
+    }
+
+    spin_unlock(&sdom->budget_lock);
+
+    return svc->budget > 0;
+}
+
+static void
+vcpu_give_budget_back(struct csched2_vcpu *svc, struct list_head *parked)
+{
+    struct csched2_dom *sdom = svc->sdom;
+    unsigned int cpu = svc->vcpu->processor;
+
+    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));
+    ASSERT(list_empty(parked));
+
+    /* budget_lock nests inside runqueue lock. */
+    spin_lock(&sdom->budget_lock);
+
+    /*
+     * The vCPU is stopping running (e.g., because it's blocking, or it has
+     * been preempted). If it hasn't consumed all the budget it got when,
+     * starting to run, put that remaining amount back in the domain's budget
+     * pool.
+     */
+    sdom->budget += svc->budget;
+    svc->budget = 0;
+
+    /*
+     * Making budget available again to the domain means that parked vCPUs
+     * may be unparked and run. They are, if any, in the domain's parked_vcpus
+     * list, so we want to go through that and unpark them (so they can try
+     * to get some budget).
+     *
+     * Touching the list requires the budget_lock, which we hold. Let's
+     * therefore put everyone in that list in another, temporary list, which
+     * then the caller will traverse, unparking the vCPUs it finds there.
+     *
+     * In fact, we can't do the actual unparking here, because that requires
+     * taking the runqueue lock of the vCPUs being unparked, and we can't
+     * take any runqueue locks while we hold a budget_lock.
+     */
+    if ( sdom->budget > 0 )
+        list_splice_init(&sdom->parked_vcpus, parked);
+
+    spin_unlock(&sdom->budget_lock);
+}
+
+static void
+unpark_parked_vcpus(const struct scheduler *ops, struct list_head *vcpus)
+{
+    struct csched2_vcpu *svc, *tmp;
+    spinlock_t *lock;
+
+    list_for_each_entry_safe(svc, tmp, vcpus, parked_elem)
+    {
+        unsigned long flags;
+        s_time_t now;
+
+        lock = vcpu_schedule_lock_irqsave(svc->vcpu, &flags);
+
+        __clear_bit(_VPF_parked, &svc->vcpu->pause_flags);
+        if ( unlikely(svc->flags & CSFLAG_scheduled) )
+        {
+            /*
+             * We end here if a budget replenishment arrived between
+             * csched2_schedule() (and, in particular, after a call to
+             * vcpu_try_to_get_budget() that returned false), and
+             * context_saved(). By setting __CSFLAG_delayed_runq_add,
+             * we tell context_saved() to put the vCPU back in the
+             * runqueue, from where it will compete with the others
+             * for the newly replenished budget.
+             */
+            ASSERT( svc->rqd != NULL );
+            ASSERT( c2rqd(ops, svc->vcpu->processor) == svc->rqd );
+            __set_bit(__CSFLAG_delayed_runq_add, &svc->flags);
+        }
+        else if ( vcpu_runnable(svc->vcpu) )
+        {
+            /*
+             * The vCPU should go back to the runqueue, and compete for
+             * the newly replenished budget, but only if it is actually
+             * runnable (and was therefore offline only because of the
+             * lack of budget).
+             */
+            now = NOW();
+            update_load(ops, svc->rqd, svc, 1, now);
+            runq_insert(ops, svc);
+            runq_tickle(ops, svc, now);
+        }
+        list_del_init(&svc->parked_elem);
+
+        vcpu_schedule_unlock_irqrestore(lock, flags, svc->vcpu);
+    }
+}
+
+static void repl_sdom_budget(void* data)
+{
+    struct csched2_dom *sdom = data;
+    unsigned long flags;
+    s_time_t now;
+    LIST_HEAD(parked);
+
+    spin_lock_irqsave(&sdom->budget_lock, flags);
+
+    /*
+     * It is possible that the domain overran, and that the budget hence went
+     * below 0 (reasons may be system overbooking, issues in or too coarse
+     * runtime accounting, etc.). In particular, if we overrun by more than
+     * tot_budget, then budget+tot_budget would still be < 0, which in turn
+     * means that, despite replenishment, there's still no budget for unparking
+     * and running vCPUs.
+     *
+     * It is also possible that we are handling the replenishment much later
+     * than expected (reasons may again be overbooking, or issues with timers).
+     * If we are more than CSCHED2_BDGT_REPL_PERIOD late, this means we have
+     * basically skipped (at least) one replenishment.
+     *
+     * We deal with both the issues here, by, basically, doing more than just
+     * one replenishment. Note, however, that every time we add tot_budget
+     * to the budget, we also move next_repl away by CSCHED2_BDGT_REPL_PERIOD.
+     * This guarantees we always respect the cap.
+     */
+    now = NOW();
+    do
+    {
+        sdom->next_repl += CSCHED2_BDGT_REPL_PERIOD;
+        sdom->budget += sdom->tot_budget;
+    }
+    while ( sdom->next_repl <= now || sdom->budget <= 0 );
+    /* We may have done more replenishments: make sure we didn't overshoot. */
+    sdom->budget = min(sdom->budget, sdom->tot_budget);
+
+    /*
+     * As above, let's prepare the temporary list, out of the domain's
+     * parked_vcpus list, now that we hold the budget_lock. Then, drop such
+     * lock, and pass the list to the unparking function.
+     */
+    list_splice_init(&sdom->parked_vcpus, &parked);
+
+    spin_unlock_irqrestore(&sdom->budget_lock, flags);
+
+    unpark_parked_vcpus(sdom->dom->cpupool->sched, &parked);
+
+    set_timer(&sdom->repl_timer, sdom->next_repl);
+}
+
 #ifndef NDEBUG
 static inline void
 csched2_vcpu_check(struct vcpu *vc)
@@ -1497,6 +1840,9 @@  csched2_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd)
     }
     svc->tickled_cpu = -1;
 
+    svc->budget = STIME_MAX;
+    INIT_LIST_HEAD(&svc->parked_elem);
+
     SCHED_STAT_CRANK(vcpu_alloc);
 
     return svc;
@@ -1593,6 +1939,7 @@  csched2_context_saved(const struct scheduler *ops, struct vcpu *vc)
     struct csched2_vcpu * const svc = csched2_vcpu(vc);
     spinlock_t *lock = vcpu_schedule_lock_irq(vc);
     s_time_t now = NOW();
+    LIST_HEAD(were_parked);
 
     BUG_ON( !is_idle_vcpu(vc) && svc->rqd != c2rqd(ops, vc->processor));
     ASSERT(is_idle_vcpu(vc) || svc->rqd == c2rqd(ops, vc->processor));
@@ -1600,6 +1947,9 @@  csched2_context_saved(const struct scheduler *ops, struct vcpu *vc)
     /* This vcpu is now eligible to be put on the runqueue again */
     __clear_bit(__CSFLAG_scheduled, &svc->flags);
 
+    if ( unlikely(has_cap(svc) && svc->budget > 0) )
+        vcpu_give_budget_back(svc, &were_parked);
+
     /* If someone wants it on the runqueue, put it there. */
     /*
      * NB: We can get rid of CSFLAG_scheduled by checking for
@@ -1620,6 +1970,8 @@  csched2_context_saved(const struct scheduler *ops, struct vcpu *vc)
         update_load(ops, svc->rqd, svc, -1, now);
 
     vcpu_schedule_unlock_irq(lock, vc);
+
+    unpark_parked_vcpus(ops, &were_parked);
 }
 
 #define MAX_LOAD (STIME_MAX);
@@ -2243,12 +2595,18 @@  csched2_alloc_domdata(const struct scheduler *ops, struct domain *dom)
     if ( sdom == NULL )
         return NULL;
 
-    /* Initialize credit and weight */
+    /* Initialize credit, cap and weight */
     INIT_LIST_HEAD(&sdom->sdom_elem);
     sdom->dom = dom;
     sdom->weight = CSCHED2_DEFAULT_WEIGHT;
+    sdom->cap = 0U;
     sdom->nr_vcpus = 0;
 
+    init_timer(&sdom->repl_timer, repl_sdom_budget, (void*) sdom,
+               cpumask_any(cpupool_domain_cpumask(dom)));
+    spin_lock_init(&sdom->budget_lock);
+    INIT_LIST_HEAD(&sdom->parked_vcpus);
+
     write_lock_irqsave(&prv->lock, flags);
 
     list_add_tail(&sdom->sdom_elem, &csched2_priv(ops)->sdom);
@@ -2284,6 +2642,7 @@  csched2_free_domdata(const struct scheduler *ops, void *data)
 
     write_lock_irqsave(&prv->lock, flags);
 
+    kill_timer(&sdom->repl_timer);
     list_del_init(&sdom->sdom_elem);
 
     write_unlock_irqrestore(&prv->lock, flags);
@@ -2378,11 +2737,12 @@  csched2_runtime(const struct scheduler *ops, int cpu,
         return -1;
 
     /* General algorithm:
-     * 1) Run until snext's credit will be 0
+     * 1) Run until snext's credit will be 0.
      * 2) But if someone is waiting, run until snext's credit is equal
-     * to his
-     * 3) But never run longer than MAX_TIMER or shorter than MIN_TIMER or
-     * the ratelimit time.
+     *    to his.
+     * 3) But, if we are capped, never run more than our budget.
+     * 4) But never run longer than MAX_TIMER or shorter than MIN_TIMER or
+     *    the ratelimit time.
      */
 
     /* Calculate mintime */
@@ -2397,11 +2757,13 @@  csched2_runtime(const struct scheduler *ops, int cpu,
             min_time = ratelimit_min;
     }
 
-    /* 1) Basic time: Run until credit is 0. */
+    /* 1) Run until snext's credit will be 0. */
     rt_credit = snext->credit;
 
-    /* 2) If there's someone waiting whose credit is positive,
-     * run until your credit ~= his */
+    /*
+     * 2) If there's someone waiting whose credit is positive,
+     *    run until your credit ~= his.
+     */
     if ( ! list_empty(runq) )
     {
         struct csched2_vcpu *swait = runq_elem(runq->next);
@@ -2423,14 +2785,22 @@  csched2_runtime(const struct scheduler *ops, int cpu,
      * credit values of MIN,MAX per vcpu, since each vcpu burns credit
      * at a different rate.
      */
-    if (rt_credit > 0)
+    if ( rt_credit > 0 )
         time = c2t(rqd, rt_credit, snext);
     else
         time = 0;
 
-    /* 3) But never run longer than MAX_TIMER or less than MIN_TIMER or
-     * the rate_limit time. */
-    if ( time < min_time)
+    /*
+     * 3) But, if capped, never run more than our budget.
+     */
+    if ( unlikely(has_cap(snext)) )
+        time = snext->budget < time ? snext->budget : time;
+
+    /*
+     * 4) But never run longer than MAX_TIMER or less than MIN_TIMER or
+     *    the rate_limit time.
+     */
+    if ( time < min_time )
     {
         time = min_time;
         SCHED_STAT_CRANK(runtime_min_timer);
@@ -2447,13 +2817,13 @@  csched2_runtime(const struct scheduler *ops, int cpu,
 /*
  * Find a candidate.
  */
-static struct csched2_vcpu *
+static noinline struct csched2_vcpu *
 runq_candidate(struct csched2_runqueue_data *rqd,
                struct csched2_vcpu *scurr,
                int cpu, s_time_t now,
                unsigned int *skipped)
 {
-    struct list_head *iter;
+    struct list_head *iter, *temp;
     struct csched2_vcpu *snext = NULL;
     struct csched2_private *prv = csched2_priv(per_cpu(scheduler, cpu));
     bool yield = __test_and_clear_bit(__CSFLAG_vcpu_yield, &scurr->flags);
@@ -2496,7 +2866,7 @@  runq_candidate(struct csched2_runqueue_data *rqd,
     else
         snext = csched2_vcpu(idle_vcpu[cpu]);
 
-    list_for_each( iter, &rqd->runq )
+    list_for_each_safe( iter, temp, &rqd->runq )
     {
         struct csched2_vcpu * svc = list_entry(iter, struct csched2_vcpu, runq_elem);
 
@@ -2544,11 +2914,13 @@  runq_candidate(struct csched2_runqueue_data *rqd,
         }
 
         /*
-         * If the next one on the list has more credit than current
-         * (or idle, if current is not runnable), or if current is
-         * yielding, choose it.
+         * If the one in the runqueue has more credit than current (or idle,
+         * if current is not runnable), or if current is yielding, and also
+         * if the one in runqueue either is not capped, or is capped but has
+         * some budget, then choose it.
          */
-        if ( yield || svc->credit > snext->credit )
+        if ( (yield || svc->credit > snext->credit) &&
+             (!has_cap(svc) || vcpu_try_to_get_budget(svc)) )
             snext = svc;
 
         /* In any case, if we got this far, break. */
@@ -2575,6 +2947,13 @@  runq_candidate(struct csched2_runqueue_data *rqd,
     if ( unlikely(snext->tickled_cpu != -1 && snext->tickled_cpu != cpu) )
         SCHED_STAT_CRANK(tickled_cpu_overridden);
 
+    /*
+     * If snext is from a capped domain, it must have budget (or it
+     * wouldn't have been in the runq). If it is not, it'd be STIME_MAX,
+     * which still is >= 0.
+     */
+    ASSERT(snext->budget >= 0);
+
     return snext;
 }
 
@@ -2632,10 +3011,18 @@  csched2_schedule(
                     (unsigned char *)&d);
     }
 
-    /* Update credits */
+    /* Update credits (and budget, if necessary). */
     burn_credits(rqd, scurr, now);
 
     /*
+     *  Below 0, means that we are capped and we have overrun our  budget.
+     *  Let's try to get some more but, if we fail (e.g., because of the
+     *  other running vcpus), we will be parked.
+     */
+    if ( unlikely(scurr->budget <= 0) )
+        vcpu_try_to_get_budget(scurr);
+
+    /*
      * Select next runnable local VCPU (ie top of local runq).
      *
      * If the current vcpu is runnable, and has higher credit than
@@ -2769,6 +3156,9 @@  csched2_dump_vcpu(struct csched2_private *prv, struct csched2_vcpu *svc)
 
     printk(" credit=%" PRIi32" [w=%u]", svc->credit, svc->weight);
 
+    if ( has_cap(svc) )
+        printk(" budget=%"PRI_stime, svc->budget);
+
     printk(" load=%"PRI_stime" (~%"PRI_stime"%%)", svc->avgload,
            (svc->avgload * 100) >> prv->load_precision_shift);
 
@@ -2856,9 +3246,10 @@  csched2_dump(const struct scheduler *ops)
 
         sdom = list_entry(iter_sdom, struct csched2_dom, sdom_elem);
 
-        printk("\tDomain: %d w %d v %d\n",
+        printk("\tDomain: %d w %d c %u v %d\n",
                sdom->dom->domain_id,
                sdom->weight,
+               sdom->cap,
                sdom->nr_vcpus);
 
         for_each_vcpu( sdom->dom, v )
@@ -3076,12 +3467,14 @@  csched2_init(struct scheduler *ops)
            XENLOG_INFO " load_window_shift: %d\n"
            XENLOG_INFO " underload_balance_tolerance: %d\n"
            XENLOG_INFO " overload_balance_tolerance: %d\n"
-           XENLOG_INFO " runqueues arrangement: %s\n",
+           XENLOG_INFO " runqueues arrangement: %s\n"
+           XENLOG_INFO " cap enforcement granularity: %dms\n",
            opt_load_precision_shift,
            opt_load_window_shift,
            opt_underload_balance_tolerance,
            opt_overload_balance_tolerance,
-           opt_runqueue_str[opt_runqueue]);
+           opt_runqueue_str[opt_runqueue],
+           opt_cap_period);
 
     if ( opt_load_precision_shift < LOADAVG_PRECISION_SHIFT_MIN )
     {
@@ -3099,6 +3492,13 @@  csched2_init(struct scheduler *ops)
     printk(XENLOG_INFO "load tracking window length %llu ns\n",
            1ULL << opt_load_window_shift);
 
+    if ( CSCHED2_BDGT_REPL_PERIOD < CSCHED2_MIN_TIMER )
+    {
+        printk("WARNING: %s: opt_cap_period %d too small, resetting\n",
+               __func__, opt_cap_period);
+        opt_cap_period = 10; /* ms */
+    }
+
     /* Basically no CPU information is available at this point; just
      * set up basic structures, and a callback when the CPU info is
      * available. */
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 1127ca9..2c7f9cc 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -787,6 +787,9 @@  static inline struct domain *next_domain_in_cpupool(
  /* VCPU is being reset. */
 #define _VPF_in_reset        7
 #define VPF_in_reset         (1UL<<_VPF_in_reset)
+/* VCPU is parked. */
+#define _VPF_parked          8
+#define VPF_parked           (1UL<<_VPF_parked)
 
 static inline int vcpu_runnable(struct vcpu *v)
 {