
[v6,for,Xen,4.7,1/4] xen: enable per-VCPU parameter settings for RTDS scheduler

Message ID 1457286958-5427-2-git-send-email-lichong659@gmail.com (mailing list archive)
State New, archived

Commit Message

Chong Li March 6, 2016, 5:55 p.m. UTC
Add XEN_DOMCTL_SCHEDOP_getvcpuinfo and _putvcpuinfo hypercalls
to independently get and set the scheduling parameters of each
vCPU of a domain

Signed-off-by: Chong Li <chong.li@wustl.edu>
Signed-off-by: Meng Xu <mengxu@cis.upenn.edu>
Signed-off-by: Sisu Xi <xisisu@gmail.com>

---
Changes on PATCH v5:
1) When processing XEN_DOMCTL_SCHEDOP_get/putvcpuinfo, we do a
preemption check in a way similar to XEN_SYSCTL_pcitopoinfo

Changes on PATCH v4:
1) Add uint32_t vcpu_index to struct xen_domctl_scheduler_op.
When processing XEN_DOMCTL_SCHEDOP_get/putvcpuinfo, we call
hypercall_preempt_check in case the current hypercall lasts
too long. If we decide to preempt the current hypercall, we record
the index of the most recently finished vcpu in the vcpu_index
field of struct xen_domctl_scheduler_op. So when we resume the
hypercall after preemption, we start processing from the position
specified by vcpu_index, and don't need to repeat the work that was
already done before the preemption.
(This design is based on the do_grant_table_op() in grant_table.c)

2) Coding style changes

Changes on PATCH v3:
1) Remove struct xen_domctl_schedparam_t.

2) Change struct xen_domctl_scheduler_op.

3) Check that period/budget is within a valid range

Changes on PATCH v2:
1) Change struct xen_domctl_scheduler_op, for transferring per-vcpu parameters
between libxc and the hypervisor.

2) Handler of XEN_DOMCTL_SCHEDOP_getinfo now just returns the default budget and
period values of the RTDS scheduler.

3) Handler of XEN_DOMCTL_SCHEDOP_getvcpuinfo can now return an arbitrary subset
of the parameters of the VCPUs of a specific domain

CC: <dario.faggioli@citrix.com>
CC: <george.dunlap@eu.citrix.com>
CC: <dgolomb@seas.upenn.edu>
CC: <mengxu@cis.upenn.edu>
CC: <jbeulich@suse.com>
CC: <lichong659@gmail.com>
---
 xen/common/sched_credit.c   |   4 ++
 xen/common/sched_credit2.c  |   4 ++
 xen/common/sched_rt.c       | 130 +++++++++++++++++++++++++++++++++++++++-----
 xen/common/schedule.c       |  15 ++++-
 xen/include/public/domctl.h |  59 ++++++++++++++++----
 5 files changed, 182 insertions(+), 30 deletions(-)

Comments

Jan Beulich March 7, 2016, 12:59 p.m. UTC | #1
>>> On 06.03.16 at 18:55, <lichong659@gmail.com> wrote:
> --- a/xen/common/sched_credit.c
> +++ b/xen/common/sched_credit.c
> @@ -1054,6 +1054,10 @@ csched_dom_cntl(
>       * lock. Runq lock not needed anywhere in here. */
>      spin_lock_irqsave(&prv->lock, flags);
>  
> +    if ( op->cmd == XEN_DOMCTL_SCHEDOP_putvcpuinfo ||
> +         op->cmd == XEN_DOMCTL_SCHEDOP_getvcpuinfo )
> +        return -EINVAL;
> +
>      if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
>      {
>          op->u.credit.weight = sdom->weight;

Considering the rest of the code that follows, I would - albeit
I'm not the maintainer of this code - strongly suggest moving to a
switch() in such cases, with the default case returning -EINVAL (or
maybe better -EOPNOTSUPP).
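
Concretely, the suggestion amounts to something like this (just a
sketch reusing the names from the quoted hunk, not actual code;
error handling simplified):

    switch ( op->cmd )
    {
    case XEN_DOMCTL_SCHEDOP_getinfo:
        op->u.credit.weight = sdom->weight;
        op->u.credit.cap = sdom->cap;
        break;

    case XEN_DOMCTL_SCHEDOP_putinfo:
        /* ... existing "put" handling ... */
        break;

    default:
        /* _getvcpuinfo/_putvcpuinfo: not supported by this scheduler. */
        rc = -EOPNOTSUPP;
        break;
    }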

> @@ -1130,23 +1146,17 @@ rt_dom_cntl(
>      unsigned long flags;
>      int rc = 0;
>  
> +    xen_domctl_schedparam_vcpu_t local_sched;
> +    s_time_t period, budget;
> +    uint32_t index = 0;
> +

There's a stray blank line left ahead of this addition.

>      switch ( op->cmd )
>      {
> -    case XEN_DOMCTL_SCHEDOP_getinfo:
> -        if ( d->max_vcpus > 0 )
> -        {
> -            spin_lock_irqsave(&prv->lock, flags);
> -            svc = rt_vcpu(d->vcpu[0]);
> -            op->u.rtds.period = svc->period / MICROSECS(1);
> -            op->u.rtds.budget = svc->budget / MICROSECS(1);
> -            spin_unlock_irqrestore(&prv->lock, flags);
> -        }
> -        else
> -        {
> -            /* If we don't have vcpus yet, let's just return the defaults. */
> -            op->u.rtds.period = RTDS_DEFAULT_PERIOD;
> -            op->u.rtds.budget = RTDS_DEFAULT_BUDGET;
> -        }
> +    case XEN_DOMCTL_SCHEDOP_getinfo: /* return the default parameters */
> +        spin_lock_irqsave(&prv->lock, flags);
> +        op->u.rtds.period = RTDS_DEFAULT_PERIOD / MICROSECS(1);
> +        op->u.rtds.budget = RTDS_DEFAULT_BUDGET / MICROSECS(1);
> +        spin_unlock_irqrestore(&prv->lock, flags);
>          break;

This alters the values returned when d->max_vcpus == 0 - while
this looks to be intentional, I think calling out such a bug fix in the
description is a must.

> @@ -1163,6 +1173,96 @@ rt_dom_cntl(
>          }
>          spin_unlock_irqrestore(&prv->lock, flags);
>          break;
> +    case XEN_DOMCTL_SCHEDOP_getvcpuinfo:
> +        if ( guest_handle_is_null(op->u.v.vcpus) )
> +        {
> +            rc = -EINVAL;

Perhaps rather -EFAULT? But then again - what is this check good for
(considering that it doesn't cover other obviously bad handle values)?

> +            break;
> +        }
> +        while ( index < op->u.v.nr_vcpus )
> +        {
> +            if ( copy_from_guest_offset(&local_sched,
> +                          op->u.v.vcpus, index, 1) )

Indentation.

> +            {
> +                rc = -EFAULT;
> +                break;
> +            }
> +            if ( local_sched.vcpuid >= d->max_vcpus ||
> +                          d->vcpu[local_sched.vcpuid] == NULL )

Again. And more below.

> +            {
> +                rc = -EINVAL;
> +                break;
> +            }
> +
> +            spin_lock_irqsave(&prv->lock, flags);
> +            svc = rt_vcpu(d->vcpu[local_sched.vcpuid]);
> +            local_sched.s.rtds.budget = svc->budget / MICROSECS(1);
> +            local_sched.s.rtds.period = svc->period / MICROSECS(1);
> +            spin_unlock_irqrestore(&prv->lock, flags);
> +
> +            if ( __copy_to_guest_offset(op->u.v.vcpus, index,
> +                    &local_sched, 1) )
> +            {
> +                rc = -EFAULT;
> +                break;
> +            }
> +            if ( (++index > 0x3f) && hypercall_preempt_check() )
> +                break;

So how is the caller going to be able to reliably read all vCPU-s'
information for a guest with more than 64 vCPU-s?

> +        }
> +
> +        if ( !rc && (op->u.v.nr_vcpus != index) )
> +            op->u.v.nr_vcpus = index;

I don't think the right side of the && is really necessary / useful.

> +        break;
> +    case XEN_DOMCTL_SCHEDOP_putvcpuinfo:

When switch statements get large, please put blank lines between
individual case blocks.

> +        if ( guest_handle_is_null(op->u.v.vcpus) )
> +        {
> +            rc = -EINVAL;
> +            break;
> +        }
> +        while ( index < op->u.v.nr_vcpus )
> +        {
> +            if ( copy_from_guest_offset(&local_sched,
> +                          op->u.v.vcpus, index, 1) )
> +            {
> +                rc = -EFAULT;
> +                break;
> +            }
> +            if ( local_sched.vcpuid >= d->max_vcpus ||
> +                          d->vcpu[local_sched.vcpuid] == NULL )
> +            {
> +                rc = -EINVAL;
> +                break;
> +            }
> +
> +            period = MICROSECS(local_sched.s.rtds.period);
> +            budget = MICROSECS(local_sched.s.rtds.budget);
> +            if ( period > RTDS_MAX_PERIOD || budget < RTDS_MIN_BUDGET ||
> +                          budget > period || period < RTDS_MIN_PERIOD )
> +            {
> +                rc = -EINVAL;
> +                break;
> +            }
> +
> +            /*
> +             * We accept period/budget less than 100 us, but will warn users about
> +             * the large scheduling overhead due to it
> +             */
> +            if ( period < MICROSECS(100) || budget < MICROSECS(100) )
> +                printk("Warning: period or budget set to less than 100us.\n"
> +                       "This may result in high scheduling overhead.\n");

This should use a log level which is rate limited by default. Quite
likely that would be one of the guest log levels.

> --- a/xen/common/schedule.c
> +++ b/xen/common/schedule.c
> @@ -1148,10 +1148,19 @@ long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op)
>      if ( ret )
>          return ret;
>  
> -    if ( (op->sched_id != DOM2OP(d)->sched_id) ||
> -         ((op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
> -          (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo)) )
> +    if ( op->sched_id != DOM2OP(d)->sched_id )
>          return -EINVAL;
> +    else
> +        switch ( op->cmd )

Pointless else, pointlessly increasing the necessary indentation
for the entire switch().
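
I.e., presumably something along these lines (a sketch of the shape
being asked for, not the final code):

    if ( op->sched_id != DOM2OP(d)->sched_id )
        return -EINVAL;

    switch ( op->cmd )
    {
    case XEN_DOMCTL_SCHEDOP_putinfo:
    case XEN_DOMCTL_SCHEDOP_getinfo:
    case XEN_DOMCTL_SCHEDOP_putvcpuinfo:
    case XEN_DOMCTL_SCHEDOP_getvcpuinfo:
        break;

    default:
        return -EINVAL;
    }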

> +typedef struct xen_domctl_schedparam_vcpu {
> +    union {
> +        xen_domctl_sched_credit_t credit;
> +        xen_domctl_sched_credit2_t credit2;
> +        xen_domctl_sched_rtds_t rtds;
> +    } s;

Please call such unions "u", as done everywhere else.

> +    uint16_t vcpuid;

Any particular reason to limit this to 16 bits, when elsewhere
we commonly use 32 bits for vCPU IDs?
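
Taking both remarks together, the structure would presumably become
something like (a sketch, not the final header):

    typedef struct xen_domctl_schedparam_vcpu {
        union {
            xen_domctl_sched_credit_t credit;
            xen_domctl_sched_credit2_t credit2;
            xen_domctl_sched_rtds_t rtds;
        } u;             /* named "u", as done elsewhere */
        uint32_t vcpuid; /* widened to the usual 32 bits */
    } xen_domctl_schedparam_vcpu_t;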

Jan
Chong Li March 7, 2016, 4:28 p.m. UTC | #2
On Mon, Mar 7, 2016 at 6:59 AM, Jan Beulich <JBeulich@suse.com> wrote:
>>>> On 06.03.16 at 18:55, <lichong659@gmail.com> wrote:

>
>> @@ -1130,23 +1146,17 @@ rt_dom_cntl(
>>      unsigned long flags;
>>      int rc = 0;
>>
>> +    xen_domctl_schedparam_vcpu_t local_sched;
>> +    s_time_t period, budget;
>> +    uint32_t index = 0;
>> +
>
> There's a stray blank line left ahead of this addition.
>
>>      switch ( op->cmd )
>>      {
>> -    case XEN_DOMCTL_SCHEDOP_getinfo:
>> -        if ( d->max_vcpus > 0 )
>> -        {
>> -            spin_lock_irqsave(&prv->lock, flags);
>> -            svc = rt_vcpu(d->vcpu[0]);
>> -            op->u.rtds.period = svc->period / MICROSECS(1);
>> -            op->u.rtds.budget = svc->budget / MICROSECS(1);
>> -            spin_unlock_irqrestore(&prv->lock, flags);
>> -        }
>> -        else
>> -        {
>> -            /* If we don't have vcpus yet, let's just return the defaults. */
>> -            op->u.rtds.period = RTDS_DEFAULT_PERIOD;
>> -            op->u.rtds.budget = RTDS_DEFAULT_BUDGET;
>> -        }
>> +    case XEN_DOMCTL_SCHEDOP_getinfo: /* return the default parameters */
>> +        spin_lock_irqsave(&prv->lock, flags);
>> +        op->u.rtds.period = RTDS_DEFAULT_PERIOD / MICROSECS(1);
>> +        op->u.rtds.budget = RTDS_DEFAULT_BUDGET / MICROSECS(1);
>> +        spin_unlock_irqrestore(&prv->lock, flags);
>>          break;
>
> This alters the values returned when d->max_vcpus == 0 - while
> this looks to be intentional, I think calling out such a bug fix in the
> description is a must.

Based on previous discussion, XEN_DOMCTL_SCHEDOP_getinfo only returns
the default parameters, no matter whether any vcpus have been created
yet. But I can absolutely explain this in the description.
>
>> @@ -1163,6 +1173,96 @@ rt_dom_cntl(
>>          }
>>          spin_unlock_irqrestore(&prv->lock, flags);
>>          break;
>> +    case XEN_DOMCTL_SCHEDOP_getvcpuinfo:
>> +        if ( guest_handle_is_null(op->u.v.vcpus) )
>> +        {
>> +            rc = -EINVAL;
>
> Perhaps rather -EFAULT? But then again - what is this check good for
> (considering that it doesn't cover other obviously bad handle values)?

Dario suggested this in the last post, because vcpus is a handle and
needs to be validated.

>> +            {
>> +                rc = -EINVAL;
>> +                break;
>> +            }
>> +
>> +            spin_lock_irqsave(&prv->lock, flags);
>> +            svc = rt_vcpu(d->vcpu[local_sched.vcpuid]);
>> +            local_sched.s.rtds.budget = svc->budget / MICROSECS(1);
>> +            local_sched.s.rtds.period = svc->period / MICROSECS(1);
>> +            spin_unlock_irqrestore(&prv->lock, flags);
>> +
>> +            if ( __copy_to_guest_offset(op->u.v.vcpus, index,
>> +                    &local_sched, 1) )
>> +            {
>> +                rc = -EFAULT;
>> +                break;
>> +            }
>> +            if ( (++index > 0x3f) && hypercall_preempt_check() )
>> +                break;
>
> So how is the caller going to be able to reliably read all vCPU-s'
> information for a guest with more than 64 vCPU-s?

In libxc, we re-issue hypercall if the current one is preempted.

>
>> +        }
>> +
>> +        if ( !rc && (op->u.v.nr_vcpus != index) )
>> +            op->u.v.nr_vcpus = index;
>
> I don't think the right side of the && is really necessary / useful.

The right side is to check whether the vcpus array was fully processed.
When it is true and no error occurs (rc == 0), we update
op->u.v.nr_vcpus, which is returned to libxc, and helps the xc
function figure out how many unprocessed vcpus should be taken
care of in the next hypercall.
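
(For reference, the libxc-side loop being described would look roughly
like the following -- a hypothetical sketch, not the actual xc_*
implementation; the array and buffer names are illustrative:)

    uint32_t done = 0;

    while ( done < num_vcpus )
    {
        domctl.u.scheduler_op.u.v.nr_vcpus = num_vcpus - done;
        set_xen_guest_handle(domctl.u.scheduler_op.u.v.vcpus,
                             &vcpus[done]);

        if ( do_domctl(xch, &domctl) )
            return -1;

        /* The hypervisor rewrites nr_vcpus to what it processed. */
        done += domctl.u.scheduler_op.u.v.nr_vcpus;
    }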


>
>> +typedef struct xen_domctl_schedparam_vcpu {
>> +    union {
>> +        xen_domctl_sched_credit_t credit;
>> +        xen_domctl_sched_credit2_t credit2;
>> +        xen_domctl_sched_rtds_t rtds;
>> +    } s;
>
> Please call such unions "u", as done everywhere else.
>
>> +    uint16_t vcpuid;
>
> Any particular reason to limit this to 16 bits, when elsewhere
> we commonly use 32 bits for vCPU IDs?

I'll change it.

Thanks for your comments.
Chong
Jan Beulich March 7, 2016, 4:40 p.m. UTC | #3
>>> On 07.03.16 at 17:28, <lichong659@gmail.com> wrote:
> On Mon, Mar 7, 2016 at 6:59 AM, Jan Beulich <JBeulich@suse.com> wrote:
>>>>> On 06.03.16 at 18:55, <lichong659@gmail.com> wrote:
>>>      switch ( op->cmd )
>>>      {
>>> -    case XEN_DOMCTL_SCHEDOP_getinfo:
>>> -        if ( d->max_vcpus > 0 )
>>> -        {
>>> -            spin_lock_irqsave(&prv->lock, flags);
>>> -            svc = rt_vcpu(d->vcpu[0]);
>>> -            op->u.rtds.period = svc->period / MICROSECS(1);
>>> -            op->u.rtds.budget = svc->budget / MICROSECS(1);
>>> -            spin_unlock_irqrestore(&prv->lock, flags);
>>> -        }
>>> -        else
>>> -        {
>>> -            /* If we don't have vcpus yet, let's just return the defaults. */
>>> -            op->u.rtds.period = RTDS_DEFAULT_PERIOD;
>>> -            op->u.rtds.budget = RTDS_DEFAULT_BUDGET;
>>> -        }
>>> +    case XEN_DOMCTL_SCHEDOP_getinfo: /* return the default parameters */
>>> +        spin_lock_irqsave(&prv->lock, flags);
>>> +        op->u.rtds.period = RTDS_DEFAULT_PERIOD / MICROSECS(1);
>>> +        op->u.rtds.budget = RTDS_DEFAULT_BUDGET / MICROSECS(1);
>>> +        spin_unlock_irqrestore(&prv->lock, flags);
>>>          break;
>>
>> This alters the values returned when d->max_vcpus == 0 - while
>> this looks to be intentional, I think calling out such a bug fix in the
>> description is a must.
> 
> Based on previous discussion, XEN_DOMCTL_SCHEDOP_getinfo only returns
> the default parameters,
> no matter whether vcpu is created yet or not. But I can absolutely
> explain this in the description.

That wasn't the point of the comment. Instead the change (fix) to
divide by MICROSECS(1) is what otherwise would go in silently.

>>> @@ -1163,6 +1173,96 @@ rt_dom_cntl(
>>>          }
>>>          spin_unlock_irqrestore(&prv->lock, flags);
>>>          break;
>>> +    case XEN_DOMCTL_SCHEDOP_getvcpuinfo:
>>> +        if ( guest_handle_is_null(op->u.v.vcpus) )
>>> +        {
>>> +            rc = -EINVAL;
>>
>> Perhaps rather -EFAULT? But then again - what is this check good for
>> (considering that it doesn't cover other obviously bad handle values)?
> 
> Dario suggested this in the last post, because vcpus is a handle and
> needs to be validated.

Well, as said - the handle being non-null doesn't make it a valid
handle. Any validation can be left to copy_{to,from}_guest*()
unless you mean to give a null handle some special meaning.

>>> +            {
>>> +                rc = -EINVAL;
>>> +                break;
>>> +            }
>>> +
>>> +            spin_lock_irqsave(&prv->lock, flags);
>>> +            svc = rt_vcpu(d->vcpu[local_sched.vcpuid]);
>>> +            local_sched.s.rtds.budget = svc->budget / MICROSECS(1);
>>> +            local_sched.s.rtds.period = svc->period / MICROSECS(1);
>>> +            spin_unlock_irqrestore(&prv->lock, flags);
>>> +
>>> +            if ( __copy_to_guest_offset(op->u.v.vcpus, index,
>>> +                    &local_sched, 1) )
>>> +            {
>>> +                rc = -EFAULT;
>>> +                break;
>>> +            }
>>> +            if ( (++index > 0x3f) && hypercall_preempt_check() )
>>> +                break;
>>
>> So how is the caller going to be able to reliably read all vCPU-s'
>> information for a guest with more than 64 vCPU-s?
> 
> In libxc, we re-issue hypercall if the current one is preempted.

And with the current code - how does libxc know? (And anyway,
this should only be a last resort, if the hypervisor can't by itself
arrange for a continuation. If done this way, having a code
comment referring to the required caller behavior would seem to
be an absolute must.)

>>> +        }
>>> +
>>> +        if ( !rc && (op->u.v.nr_vcpus != index) )
>>> +            op->u.v.nr_vcpus = index;
>>
>> I don't think the right side of the && is really necessary / useful.
> 
> The right side is to check whether the vcpus array is fully processed.
> When it is true and no error occurs (rc == 0), we
> update op->u.v.nr_vcpus, which is returned to libxc, and helps xc
> function figuring out how many un-processed vcpus should
> be taken care of in the next hypercall.

Just consider what the contents of op->u.v.nr_vcpus is after
this piece of code was executed, once with the full conditional,
and another time with the right side of the && omitted.

Jan
Dario Faggioli March 7, 2016, 5:53 p.m. UTC | #4
On Mon, 2016-03-07 at 09:40 -0700, Jan Beulich wrote:
> > > > On 07.03.16 at 17:28, <lichong659@gmail.com> wrote:
> > On Mon, Mar 7, 2016 at 6:59 AM, Jan Beulich <JBeulich@suse.com>
> > wrote:
> > > 
> > > > @@ -1163,6 +1173,96 @@ rt_dom_cntl(
> > > > 
> > > > +    case XEN_DOMCTL_SCHEDOP_getvcpuinfo:
> > > > +        if ( guest_handle_is_null(op->u.v.vcpus) )
> > > > +        {
> > > > +            rc = -EINVAL;
> > > Perhaps rather -EFAULT? But then again - what is this check good
> > > for
> > > (considering that it doesn't cover other obviously bad handle
> > > values)?
> > Dario suggested this in the last post, because vcpus is a handle
> > and
> > needs to be validated.
>
> Well, as said - the handle being non-null doesn't make it a valid
> handle. Any validation can be left to copy_{to,from}_guest*()
> unless you mean to give a null handle some special meaning.
> 
IIRC, I was looking at how XEN_SYSCTL_pcitopoinfo is handled, for
reference, and that has some guest_handle_is_null()==>EINVAL sanity
checking (in xen/common/sysctl.c), which, when I thought about it, made
sense to me.

My reasoning was, sort of:
 1. if the handle is NULL, no point getting into the somewhat 
    complicated logic of the while,
 2. more accurate error reporting: as being passed a NULL handle
    looked like something we could identify and call invalid, rather
    than waiting for the copy to fault.

In any event, I've no problem at all with this being dropped.

> > > > +            {
> > > > +                rc = -EINVAL;
> > > > +                break;
> > > > +            }
> > > > +
> > > > +            spin_lock_irqsave(&prv->lock, flags);
> > > > +            svc = rt_vcpu(d->vcpu[local_sched.vcpuid]);
> > > > +            local_sched.s.rtds.budget = svc->budget /
> > > > MICROSECS(1);
> > > > +            local_sched.s.rtds.period = svc->period /
> > > > MICROSECS(1);
> > > > +            spin_unlock_irqrestore(&prv->lock, flags);
> > > > +
> > > > +            if ( __copy_to_guest_offset(op->u.v.vcpus, index,
> > > > +                    &local_sched, 1) )
> > > > +            {
> > > > +                rc = -EFAULT;
> > > > +                break;
> > > > +            }
> > > > +            if ( (++index > 0x3f) && hypercall_preempt_check()
> > > > )
> > > > +                break;
> > > So how is the caller going to be able to reliably read all vCPU-
> > > s'
> > > information for a guest with more than 64 vCPU-s?
> > In libxc, we re-issue hypercall if the current one is preempted.
> And with the current code - how does libxc know? (And anyway,
> this should only be a last resort, if the hypervisor can't by itself
> arrange for a continuation. If done this way, having a code
> comment referring to the required caller behavior would seem to
> be an absolute must.)
> 
I definitely agree on commenting.

About the structure of the code, as said above, I do like
how XEN_SYSCTL_pcitopoinfo ended up being handled; I think it is a
great fit for this specific case and, comparing both this and the
previous version, I do think this one is (bugs apart) looking better.

I'm sure I said this --long ago-- when discussing v4 (and maybe even
previous versions), as well as more recently, when reviewing v5, and
that's why Chong (finally! :-D) did it.

So, with the comment in place (and with bugs fixed :-)), are you (Jan)
ok with this being done this way?

> > > > +        }
> > > > +
> > > > +        if ( !rc && (op->u.v.nr_vcpus != index) )
> > > > +            op->u.v.nr_vcpus = index;
> > > I don't think the right side of the && is really necessary /
> > > useful.
> > The right side is to check whether the vcpus array is fully
> > processed.
> > When it is true and no error occurs (rc == 0), we
> > update op->u.v.nr_vcpus, which is returned to libxc, and helps xc
> > function figuring out how many un-processed vcpus should
> > be taken care of in the next hypercall.
> Just consider what the contents of op->u.v.nr_vcpus is after
> this piece of code was executed, once with the full conditional,
> and another time with the right side of the && omitted.
> 
BTW, Chong, I'm not sure this has to do with what Jan is saying, but
looking again at XEN_SYSCTL_pcitopoinfo, it looks to me you're missing
copying nr_vcpus back up to the guest (which is actually what makes
libxc know whether all vcpus have been processed or not).

Regards,
Dario
Chong Li March 7, 2016, 10:16 p.m. UTC | #5
On Mon, Mar 7, 2016 at 11:53 AM, Dario Faggioli
<dario.faggioli@citrix.com> wrote:
> On Mon, 2016-03-07 at 09:40 -0700, Jan Beulich wrote:
>> > > > On 07.03.16 at 17:28, <lichong659@gmail.com> wrote:
>> > On Mon, Mar 7, 2016 at 6:59 AM, Jan Beulich <JBeulich@suse.com>
>> > wrote:
>> > >
>> > > > @@ -1163,6 +1173,96 @@ rt_dom_cntl(
>> > > >
>> > > > +    case XEN_DOMCTL_SCHEDOP_getvcpuinfo:
>> > > > +        if ( guest_handle_is_null(op->u.v.vcpus) )
>> > > > +        {
>> > > > +            rc = -EINVAL;

>
>> > > > +            {
>> > > > +                rc = -EINVAL;
>> > > > +                break;
>> > > > +            }
>> > > > +
>> > > > +            spin_lock_irqsave(&prv->lock, flags);
>> > > > +            svc = rt_vcpu(d->vcpu[local_sched.vcpuid]);
>> > > > +            local_sched.s.rtds.budget = svc->budget /
>> > > > MICROSECS(1);
>> > > > +            local_sched.s.rtds.period = svc->period /
>> > > > MICROSECS(1);
>> > > > +            spin_unlock_irqrestore(&prv->lock, flags);
>> > > > +
>> > > > +            if ( __copy_to_guest_offset(op->u.v.vcpus, index,
>> > > > +                    &local_sched, 1) )
>> > > > +            {
>> > > > +                rc = -EFAULT;
>> > > > +                break;
>> > > > +            }
>> > > > +            if ( (++index > 0x3f) && hypercall_preempt_check()
>> > > > )
>> > > > +                break;

>
>> > > > +        }
>> > > > +
>> > > > +        if ( !rc && (op->u.v.nr_vcpus != index) )
>> > > > +            op->u.v.nr_vcpus = index;
>> > > I don't think the right side of the && is really necessary /
>> > > useful.
>> > The right side is to check whether the vcpus array is fully
>> > processed.
>> > When it is true and no error occurs (rc == 0), we
>> > update op->u.v.nr_vcpus, which is returned to libxc, and helps xc
>> > function figuring out how many un-processed vcpus should
>> > be taken care of in the next hypercall.
>> Just consider what the contents of op->u.v.nr_vcpus is after
>> this piece of code was executed, once with the full conditional,
>> and another time with the right side of the && omitted.
>>
> BTW, Chong, I'm not sure this has to do with what Jan is saying, but
> looking again at XEN_SYSCTL_pcitopoinfo, it looks to me you're missing
> copying nr_vcpus back up to the guest (which is actually what makes
> libxc know whether all vcpus have been processed or not).

I think by "op->u.v.nr_vcpus = index", we already make the new nr_vcpus
visible to the guest (I've verified it).

In the case XEN_DOMCTL_scheduler_op of do_domctl(), we set
"copyback = 1" after calling sched_adjust(), which means all fields
in op (including the new nr_vcpus) will be copied to u_domctl at
the end of do_domctl(). This operation ensures the new nr_vcpus is
copied back up to the guest.
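
For reference, the relevant pieces of do_domctl() (a simplified
excerpt from memory -- not the verbatim code):

    case XEN_DOMCTL_scheduler_op:
        ret = sched_adjust(d, &op->u.scheduler_op);
        copyback = 1;
        break;

    /* ... later, at the end of do_domctl() ... */
    if ( copyback && __copy_to_guest(u_domctl, op, 1) )
        ret = -EFAULT;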

Please correct me if my understanding is wrong.

Chong

>
> Regards,
> Dario
> --
> <<This happens because I choose it to happen!>> (Raistlin Majere)
> -----------------------------------------------------------------
> Dario Faggioli, Ph.D, http://about.me/dario.faggioli
> Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
>
Jan Beulich March 8, 2016, 9:10 a.m. UTC | #6
>>> On 07.03.16 at 18:53, <dario.faggioli@citrix.com> wrote:
> On Mon, 2016-03-07 at 09:40 -0700, Jan Beulich wrote:
>> > > > On 07.03.16 at 17:28, <lichong659@gmail.com> wrote:
>> > On Mon, Mar 7, 2016 at 6:59 AM, Jan Beulich <JBeulich@suse.com>
>> > wrote:
>> > > 
>> > > > @@ -1163,6 +1173,96 @@ rt_dom_cntl(
>> > > > 
>> > > > +    case XEN_DOMCTL_SCHEDOP_getvcpuinfo:
>> > > > +        if ( guest_handle_is_null(op->u.v.vcpus) )
>> > > > +        {
>> > > > +            rc = -EINVAL;
>> > > Perhaps rather -EFAULT? But then again - what is this check good
>> > > for
>> > > (considering that it doesn't cover other obviously bad handle
>> > > values)?
>> > Dario suggested this in the last post, because vcpus is a handle
>> > and
>> > needs to be validated.
>>
>> Well, as said - the handle being non-null doesn't make it a valid
>> handle. Any validation can be left to copy_{to,from}_guest*()
>> unless you mean to give a null handle some special meaning.
>> 
> IIRC, I was looking at how XEN_SYSCTL_pcitopoinfo is handled, for
> reference, and that has some guest_handle_is_null()==>EINVAL sanity
> checking (in xen/common/sysctl.c), which, when I thought about it, made
> sense to me.
> 
> My reasoning was, sort of:
>  1. if the handle is NULL, no point getting into the somewhat 
>     complicated logic of the while,
>  2. more accurate error reporting: as being passed a NULL handle
>     looked like something we could identify and call invalid, rather
>     than waiting for the copy to fault.

I think the XEN_SYSCTL_pcitopoinfo handling was misguided in this
respect, cloning here non-applicable logic which, for a few other
operations, returns the number of needed (array) elements in such a
case.

>> > > > +            {
>> > > > +                rc = -EINVAL;
>> > > > +                break;
>> > > > +            }
>> > > > +
>> > > > +            spin_lock_irqsave(&prv->lock, flags);
>> > > > +            svc = rt_vcpu(d->vcpu[local_sched.vcpuid]);
>> > > > +            local_sched.s.rtds.budget = svc->budget /
>> > > > MICROSECS(1);
>> > > > +            local_sched.s.rtds.period = svc->period /
>> > > > MICROSECS(1);
>> > > > +            spin_unlock_irqrestore(&prv->lock, flags);
>> > > > +
>> > > > +            if ( __copy_to_guest_offset(op->u.v.vcpus, index,
>> > > > +                    &local_sched, 1) )
>> > > > +            {
>> > > > +                rc = -EFAULT;
>> > > > +                break;
>> > > > +            }
>> > > > +            if ( (++index > 0x3f) && hypercall_preempt_check()
>> > > > )
>> > > > +                break;
>> > > So how is the caller going to be able to reliably read all vCPU-
>> > > s'
>> > > information for a guest with more than 64 vCPU-s?
>> > In libxc, we re-issue hypercall if the current one is preempted.
>> And with the current code - how does libxc know? (And anyway,
>> this should only be a last resort, if the hypervisor can't by itself
>> arrange for a continuation. If done this way, having a code
>> comment referring to the required caller behavior would seem to
>> be an absolute must.)
>> 
> I definitely agree on commenting.
> 
> About the structure of the code, as said above, I do like
> how XEN_SYSCTL_pcitopoinfo ended up being handled; I think it is a
> great fit for this specific case and, comparing both this and the
> previous version, I do think this one is (bugs apart) looking better.
> 
> I'm sure I said this --long ago-- when discussing v4 (and maybe even
> previous versions), as well as more recently, when reviewing v5, and
> that's why Chong (finally! :-D) did it.
> 
> So, with the comment in place (and with bugs fixed :-)), are you (Jan)
> ok with this being done this way?

Well, this _might_ be acceptable for "get" (since the caller
abandoning the sequence of calls prematurely is no problem),
but for "set" it looks less suitable, as similar abandoning would
leave the guest in some inconsistent / unintended state. The
issue with XEN_SYSCTL_pcitopoinfo was, iirc, the lack of a
good way of encoding the continuation information, and while
that would seem applicable here too I'm not sure now whether
doing it the way it was done was the best choice. Clearly
stating (in the public interface header) that certain normally
input-only fields are volatile would allow the continuation to
be handled without tool stack assistance afaict.
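
(To illustrate -- a sketch only, under the assumption that nr_vcpus and
the vcpus handle get documented as volatile, and that the domctl
-ERESTART/continuation path re-executes the hypercall with the updated
struct; this is not existing code:)

    if ( (++index > 0x3f) && hypercall_preempt_check() )
    {
        /* Record progress in the (now volatile) input fields... */
        op->u.v.nr_vcpus -= index;
        guest_handle_add_offset(op->u.v.vcpus, index);
        /* ...and let the domctl machinery re-issue the hypercall. */
        rc = -ERESTART;
        break;
    }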

>> > > > +        }
>> > > > +
>> > > > +        if ( !rc && (op->u.v.nr_vcpus != index) )
>> > > > +            op->u.v.nr_vcpus = index;
>> > > I don't think the right side of the && is really necessary /
>> > > useful.
>> > The right side is to check whether the vcpus array is fully
>> > processed.
>> > When it is true and no error occurs (rc == 0), we
>> > update op->u.v.nr_vcpus, which is returned to libxc, and helps xc
>> > function figuring out how many un-processed vcpus should
>> > be taken care of in the next hypercall.
>> Just consider what the contents of op->u.v.nr_vcpus is after
>> this piece of code was executed, once with the full conditional,
>> and another time with the right side of the && omitted.
>> 
> BTW, Chong, I'm not sure this has to do with what Jan is saying, but
> looking again at XEN_SYSCTL_pcitopoinfo, it looks to me you're missing
> copying nr_vcpus back up to the guest (which is actually what makes
> libxc know whether all vcpus have been processed or not).

Indeed that is why the conditional makes sense there, but not here.
And the copying back is already being taken care of by the caller of
sched_adjust().

Jan
Wei Liu March 8, 2016, 7:09 p.m. UTC | #7
On Sun, Mar 06, 2016 at 11:55:55AM -0600, Chong Li wrote:
[...]
> @@ -1163,6 +1173,96 @@ rt_dom_cntl(
>          }
>          spin_unlock_irqrestore(&prv->lock, flags);
>          break;
> +    case XEN_DOMCTL_SCHEDOP_getvcpuinfo:
> +        if ( guest_handle_is_null(op->u.v.vcpus) )
> +        {
> +            rc = -EINVAL;
> +            break;
> +        }
> +        while ( index < op->u.v.nr_vcpus )
> +        {
> +            if ( copy_from_guest_offset(&local_sched,
> +                          op->u.v.vcpus, index, 1) )

Indentation.

> +            {
> +                rc = -EFAULT;
> +                break;
> +            }
> +            if ( local_sched.vcpuid >= d->max_vcpus ||
> +                          d->vcpu[local_sched.vcpuid] == NULL )

Ditto.

> +            {
> +                rc = -EINVAL;
> +                break;
> +            }
> +
> +            spin_lock_irqsave(&prv->lock, flags);
> +            svc = rt_vcpu(d->vcpu[local_sched.vcpuid]);
> +            local_sched.s.rtds.budget = svc->budget / MICROSECS(1);
> +            local_sched.s.rtds.period = svc->period / MICROSECS(1);
> +            spin_unlock_irqrestore(&prv->lock, flags);
> +
> +            if ( __copy_to_guest_offset(op->u.v.vcpus, index,
> +                    &local_sched, 1) )

Here as well.

You might want to check your editor settings. I won't point out other
places that need fixing.

> +            {
> +                rc = -EFAULT;
> +                break;
> +            }
> +            if ( (++index > 0x3f) && hypercall_preempt_check() )
> +                break;
> +        }
> +
> +        if ( !rc && (op->u.v.nr_vcpus != index) )
> +            op->u.v.nr_vcpus = index;
> +        break;
> +    case XEN_DOMCTL_SCHEDOP_putvcpuinfo:
> +        if ( guest_handle_is_null(op->u.v.vcpus) )
> +        {
> +            rc = -EINVAL;
> +            break;
> +        }
> +        while ( index < op->u.v.nr_vcpus )
> +        {
> +            if ( copy_from_guest_offset(&local_sched,
> +                          op->u.v.vcpus, index, 1) )
> +            {
> +                rc = -EFAULT;
> +                break;
> +            }
> +            if ( local_sched.vcpuid >= d->max_vcpus ||
> +                          d->vcpu[local_sched.vcpuid] == NULL )
> +            {
> +                rc = -EINVAL;
> +                break;
> +            }
> +
> +            period = MICROSECS(local_sched.s.rtds.period);
> +            budget = MICROSECS(local_sched.s.rtds.budget);
> +            if ( period > RTDS_MAX_PERIOD || budget < RTDS_MIN_BUDGET ||
> +                          budget > period || period < RTDS_MIN_PERIOD )
> +            {
> +                rc = -EINVAL;
> +                break;
> +            }
> +
> +            /*
> +             * We accept period/budget less than 100 us, but will warn users about
> +             * the large scheduling overhead due to it
> +             */
> +            if ( period < MICROSECS(100) || budget < MICROSECS(100) )
> +                printk("Warning: period or budget set to less than 100us.\n"
> +                       "This may result in high scheduling overhead.\n");
> +

I'm not the maintainer, but I think having a printk here is a bad idea
because the toolstack can then DoS the hypervisor.


> +            spin_lock_irqsave(&prv->lock, flags);
> +            svc = rt_vcpu(d->vcpu[local_sched.vcpuid]);
> +            svc->period = period;
> +            svc->budget = budget;
> +            spin_unlock_irqrestore(&prv->lock, flags);
> +

And this locking pattern seems sub-optimal. You might be able to move
the lock and unlock outside the while loop?

Note that I merely looked at this snippet. I don't know what other
constraints you have, so take what I said with a grain of salt.

Wei.
Dario Faggioli March 9, 2016, 4:10 p.m. UTC | #8
On Tue, 2016-03-08 at 19:09 +0000, Wei Liu wrote:
> On Sun, Mar 06, 2016 at 11:55:55AM -0600, Chong Li wrote:

> > +            spin_lock_irqsave(&prv->lock, flags);
> > +            svc = rt_vcpu(d->vcpu[local_sched.vcpuid]);
> > +            svc->period = period;
> > +            svc->budget = budget;
> > +            spin_unlock_irqrestore(&prv->lock, flags);
> > +
> And this locking pattern seems sub-optimal. You might be able to move
> the lock and unlock outside the while loop?
> 
Yes, unless I'm missing something, that looks possible to me, and would
save a lot of acquire/release ping pong on the lock.

And yet, I'm not sure that doing (for large guests) batches of 64
iterations with (as of now) interrupts disabled is a good idea.

I'll think about this...

Thanks and Regards,
Dario
Jan Beulich March 9, 2016, 4:38 p.m. UTC | #9
>>> On 09.03.16 at 17:10, <dario.faggioli@citrix.com> wrote:
> On Tue, 2016-03-08 at 19:09 +0000, Wei Liu wrote:
>> On Sun, Mar 06, 2016 at 11:55:55AM -0600, Chong Li wrote:
>> 
>> > +            spin_lock_irqsave(&prv->lock, flags);
>> > +            svc = rt_vcpu(d->vcpu[local_sched.vcpuid]);
>> > +            svc->period = period;
>> > +            svc->budget = budget;
>> > +            spin_unlock_irqrestore(&prv->lock, flags);
>> > +
>> And this locking pattern seems sub-optimal. You might be able to move
>> the lock and unlock outside the while loop?
>> 
> Yes, unless I'm missing something, that looks possible to me, and would
> save a lot of acquire/release ping pong on the lock.

Well, there are guest memory accesses (which may fault) in the
loop body. While this may work right now, I don't think doing so
is a good idea.

> And yet, I'm not sure doing (for large guests) batches of 64 iterations
> with (as of now) interrupts disabled.

This I'd be much less worried about as long as what's inside the
loop body was a reasonably short code sequence.

Jan
Chong Li March 10, 2016, 10:35 p.m. UTC | #10
On Tue, Mar 8, 2016 at 1:09 PM, Wei Liu <wei.liu2@citrix.com> wrote:
> On Sun, Mar 06, 2016 at 11:55:55AM -0600, Chong Li wrote:
> [...]
>> @@ -1163,6 +1173,96 @@ rt_dom_cntl(
>>          }
>>          spin_unlock_irqrestore(&prv->lock, flags);
>>          break;
>> +    case XEN_DOMCTL_SCHEDOP_getvcpuinfo:

>> +
>> +            period = MICROSECS(local_sched.s.rtds.period);
>> +            budget = MICROSECS(local_sched.s.rtds.budget);
>> +            if ( period > RTDS_MAX_PERIOD || budget < RTDS_MIN_BUDGET ||
>> +                          budget > period || period < RTDS_MIN_PERIOD )
>> +            {
>> +                rc = -EINVAL;
>> +                break;
>> +            }
>> +
>> +            /*
>> +             * We accept period/budget less than 100 us, but will warn users about
>> +             * the large scheduling overhead due to it
>> +             */
>> +            if ( period < MICROSECS(100) || budget < MICROSECS(100) )
>> +                printk("Warning: period or budget set to less than 100us.\n"
>> +                       "This may result in high scheduling overhead.\n");
>> +
>
> I'm not the maintainer, but I think having a printk here is a bad idea
> because the toolstack can then DoS the hypervisor.
>
>
> Wei.

So what function should I use here? I see many LOG() calls in libxl,
but I'm not sure whether that can be used here.

Chong
Wei Liu March 10, 2016, 10:50 p.m. UTC | #11
On Thu, Mar 10, 2016 at 04:35:30PM -0600, Chong Li wrote:
> On Tue, Mar 8, 2016 at 1:09 PM, Wei Liu <wei.liu2@citrix.com> wrote:
> > On Sun, Mar 06, 2016 at 11:55:55AM -0600, Chong Li wrote:
> > [...]
> >> @@ -1163,6 +1173,96 @@ rt_dom_cntl(
> >>          }
> >>          spin_unlock_irqrestore(&prv->lock, flags);
> >>          break;
> >> +    case XEN_DOMCTL_SCHEDOP_getvcpuinfo:
> 
> >> +
> >> +            period = MICROSECS(local_sched.s.rtds.period);
> >> +            budget = MICROSECS(local_sched.s.rtds.budget);
> >> +            if ( period > RTDS_MAX_PERIOD || budget < RTDS_MIN_BUDGET ||
> >> +                          budget > period || period < RTDS_MIN_PERIOD )
> >> +            {
> >> +                rc = -EINVAL;
> >> +                break;
> >> +            }
> >> +
> >> +            /*
> >> +             * We accept period/budget less than 100 us, but will warn users about
> >> +             * the large scheduling overhead due to it
> >> +             */
> >> +            if ( period < MICROSECS(100) || budget < MICROSECS(100) )
> >> +                printk("Warning: period or budget set to less than 100us.\n"
> >> +                       "This may result in high scheduling overhead.\n");
> >> +
> >
> > I'm not the maintainer, but I think having a printk here is a bad idea
> > because the toolstack can then DoS the hypervisor.
> >
> >
> > Wei.
> 
> So what function should I use here? I see many LOG() calls in libxl,
> but I'm not sure whether that can be used here.
> 

IMHO you just don't log anything here. System administrator probably
won't see it anyway.

If you think this warning is really necessary, move it to xl.

Wei.

> Chong
> 
> 
> -- 
> Chong Li
> Department of Computer Science and Engineering
> Washington University in St.louis
Chong Li March 13, 2016, 5:05 p.m. UTC | #12
On Wed, Mar 9, 2016 at 10:38 AM, Jan Beulich <JBeulich@suse.com> wrote:
>>>> On 09.03.16 at 17:10, <dario.faggioli@citrix.com> wrote:
>> On Tue, 2016-03-08 at 19:09 +0000, Wei Liu wrote:
>>> On Sun, Mar 06, 2016 at 11:55:55AM -0600, Chong Li wrote:
>>>
>>> > +            spin_lock_irqsave(&prv->lock, flags);
>>> > +            svc = rt_vcpu(d->vcpu[local_sched.vcpuid]);
>>> > +            svc->period = period;
>>> > +            svc->budget = budget;
>>> > +            spin_unlock_irqrestore(&prv->lock, flags);
>>> > +
>>> And this locking pattern seems sub-optimal. You might be able to move
>>> the lock and unlock outside the while loop?
>>>
>> Yes, unless I'm missing something, that looks possible to me, and would
>> save a lot of acquire/release ping pong on the lock.
>
> Well, there are guest memory accesses (which may fault) in the
> loop body. While this may work right now, I don't think doing so
> is a good idea.

So I still keep my design here?

Dario, Jan and Wei,

I almost finished a new version, but since this part is critical for
the whole patch, let me summarize the feedback here. Please correct
me if my understanding is wrong.

1) We don't need the "guest_handle_is_null()" check, because a null
handle could be used in some special cases. And normally, the handle
is checked by the copy_from(to)_guest* functions.

2) In domctl.h, add an explanation for nr_vcpus, because it is used
both as an IN and an OUT field.

3) Use printk(XENLOG_G_WARNING ...) here, because of its rate limit feature.

4) Do I still keep the spin_lock inside the loop body?

Chong
Jan Beulich March 14, 2016, 8:37 a.m. UTC | #13
>>> On 13.03.16 at 18:05, <lichong659@gmail.com> wrote:
> So I still keep my design here?
> 
> Dario, Jan and Wei,

Afaic:

> I almost finished a new version, but since this part is critical for
> the whole patch, let me summarize the feedbacks here. Please correct
> me if my understanding is wrong.
> 
> 1) We don't need "guest_handle_is_null()" check, because null handle
> could be used in some special cases. And normally, handle is checked
> by copy_from(to)_guest* functions.

Yes.

> 2) In domctl.h, add explain for nr_vcpus, because it is used in both
> IN and OUT ways.

Yes.

> 3) Use printk(XENLOG_G_WARNING ...) here, because of its rate limit feature.

I'd say XENLOG_G_DEBUG, and even that only if you really think
the message is useful.

> 4) Do I still keep the spin_lock inside the loop body?

I think overall that's the better alternative.

Jan
Dario Faggioli March 14, 2016, 9:07 a.m. UTC | #14
On Thu, 2016-03-10 at 22:50 +0000, Wei Liu wrote:
> On Thu, Mar 10, 2016 at 04:35:30PM -0600, Chong Li wrote:
> > On Tue, Mar 8, 2016 at 1:09 PM, Wei Liu <wei.liu2@citrix.com>
> > wrote:
> > > > @@ -1163,6 +1173,96 @@ rt_dom_cntl(
> > > > 
> > > > +            /*
> > > > +             * We accept period/budget less than 100 us, but
> > > > will warn users about
> > > > +             * the large scheduling overhead due to it
> > > > +             */
> > > > +            if ( period < MICROSECS(100) || budget <
> > > > MICROSECS(100) )
> > > > +                printk("Warning: period or budget set to less
> > > > than 100us.\n"
> > > > +                       "This may result in high scheduling
> > > > overhead.\n");
> > > > +
> > > I'm not the maintainer, but I think having printk here is bad
> > > idea
> > > because the toolstack can then DoS the hypervisor.
> > > 
> > > 
> > > Wei.
> > So what function should I use here? I see many LOG() calls in
> > libxl,
> > but I'm not sure whether that can be used here.
> > 
> IMHO you just don't log anything here. System administrator probably
> won't see it anyway.
> 
> If you think this warning is really necessary, move it to xl.
> 
I do think it adds some value to have this. Moving the printing outside
of Xen would need exporting a symbol for the minimum budget/period that
we think are best chosen, but:
 - this is really an implementation detail, which can (potentially)
   vary between different architectures, and change with future Xen
   versions;
 - it would mean having to keep the hypervisor and tools symbols in sync.

So, as I'm saying in another reply in this thread, I'd use a guest,
rate-limited logging variant, and print it only once, as a
countermeasure to the log-spamming problem.

Regards,
Dario
Dario Faggioli March 14, 2016, 9:10 a.m. UTC | #15
On Mon, 2016-03-14 at 02:37 -0600, Jan Beulich wrote:
> > 
> > > 
> > > > 
> > > > On 13.03.16 at 18:05, <lichong659@gmail.com> wrote:
> > So I still keep my design here?
> > 
> > Dario, Jan and Wei,
> Afaic:
> 
I agree almost on everything too.

About this:

> > 3) Use printk(XENLOG_G_WARNING ...) here, because of its rate limit
> > feature.
> I'd say XENLOG_G_DEBUG, and even that only if you really think
> the message is useful.
> 
I think it has some value to have this in the logs. In fact, someone
that ended up with small values --either by bug/chance, or in general
without a specific need for them-- and is seeing performance/scheduling
issues will still be able to find this.

And printing it somewhere else than in Xen is impractical (see my reply
to Wei).

However, we may well print it just once, as soon as the first vcpu with
potentially problematic parameters is hit, and then silence it.

Linux has things like WARN_ON_ONCE to do this:
http://lxr.free-electrons.com/source/include/asm-generic/bug.h#L109

I don't see anything like it in Xen, so you'd need to either implement
it, or arrange for something with the same effect locally.
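
(Arranging for it locally could be as simple as the following sketch --
Xen has no WARN_ON_ONCE, so the static flag here is an assumed local
helper, not existing code:)

    if ( period < MICROSECS(100) || budget < MICROSECS(100) )
    {
        static bool_t warned_once;

        if ( !warned_once )
        {
            warned_once = 1;
            printk(XENLOG_G_WARNING
                   "rtds: period or budget set to less than 100us; "
                   "this may result in high scheduling overhead\n");
        }
    }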

If we just print it once, I'd keep it G_WARNING.

Thanks and Regards,
Dario
-- 
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://about.me/dario.faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
Jan Beulich March 14, 2016, 9:15 a.m. UTC | #16
>>> On 14.03.16 at 10:10, <dario.faggioli@citrix.com> wrote:
> On Mon, 2016-03-14 at 02:37 -0600, Jan Beulich wrote:
>> > > > On 13.03.16 at 18:05, <lichong659@gmail.com> wrote:
> About this:
> 
>> > 3) Use printk(XENLOG_G_WARNING ...) here, because of its rate limit
>> > feature.
>> I'd say XENLOG_G_DEBUG, and even that only if you really think
>> the message is useful.
>> 
> I think it has some value to have this in the logs. In fact, someone
> that ended up with small values --either by bug/chance, or in general
> without a specific need for them-- and is seeing performance/scheduling
> issues will still be able to find this.
> 
> And printing it somewhere else than in Xen is impractical (see my reply
> to Wei).
> 
> However, we may well print it just once, as soon as the first vcpu with
> potentially problematic parameters is hit, and then silence it.
> 
> Linux has things like WARN_ON_ONCE to do this:
> http://lxr.free-electrons.com/source/include/asm-generic/bug.h#L109 
> 
> I don't see anything like it in Xen, so you'd need to either implement
> it, or arrange for something with the same effect locally.

One of the reasons we don't (yet) have this is because commonly
(like is the case here) printing such once globally isn't really what
you want - you'd much rather see it once per domain.

Jan
Dario Faggioli March 14, 2016, 10:05 a.m. UTC | #17
On Mon, 2016-03-14 at 03:15 -0600, Jan Beulich wrote:
> > > > On 14.03.16 at 10:10, <dario.faggioli@citrix.com> wrote:
> > 
> > I think it has some value to have this in the logs. In fact,
> > someone
> > that ended up with small values --either by bug/chance, or in
> > general
> > without a specific need for them-- and is seeing
> > performance/scheduling
> > issues will still be able to find this.
> > 
> > And printing it somewhere else than in Xen is impractical (see my
> > reply
> > to Wei).
> > 
> > However, we may well print it just once, as soon as the first vcpu
> > with
> > potentially problematic parameters is hit, and then silence it.
> > 
> > Linux has things like WARN_ON_ONCE to do this:
> > http://lxr.free-electrons.com/source/include/asm-generic/bug.h#L109
> >  
> > 
> > I don't see anything like it in Xen, so you'd need to either
> > implement
> > it, or arrange for something with the same effect locally.
> One of the reasons we don't (yet) have this is because commonly
> (like is the case here) printing such once globally isn't really what
> you want - you'd much rather see it once per domain.
>
Ah, I did not know that was part of the reason, but it makes sense.

However, for this specific case, I was also considering that, and I
agree: if possible/not too difficult to implement, once per domain would
indeed be better.

Regards,
Dario
Chong Li March 15, 2016, 4:22 p.m. UTC | #18
On Mon, Mar 14, 2016 at 5:05 AM, Dario Faggioli
<dario.faggioli@citrix.com> wrote:
> On Mon, 2016-03-14 at 03:15 -0600, Jan Beulich wrote:
>> > > > On 14.03.16 at 10:10, <dario.faggioli@citrix.com> wrote:
>> >
>> > I think it has some value to have this in the logs. In fact,
>> > someone
>> > that ended up with small values --either by bug/chance, or in
>> > general
>> > without a specific need for them-- and is seeing
>> > performance/scheduling
>> > issues will still be able to find this.
>> >
>> > And printing it somewhere else than in Xen is impractical (see my
>> > reply
>> > to Wei).
>> >
>> > However, we may well print it just once, as soon as the first vcpu
>> > with
>> > potentially problematic parameters is hit, and then silence it.
>> >
>> > Linux has things like WARN_ON_ONCE to do this:
>> > http://lxr.free-electrons.com/source/include/asm-generic/bug.h#L109
>> >
>> >
>> > I don't see anything like it in Xen, so you'd need to either
>> > implement
>> > it, or arrange for something with the same effect locally.
>> One of the reasons we don't (yet) have this is because commonly
>> (like is the case here) printing such once globally isn't really what
>> you want - you'd much rather see it once per domain.
>>
> Ah, I did not know that was part of the reason, but it makes sense.
>
> However, for this specific case, I was also considering that, and I
> agree: if possible/not too difficult to implement, once per domain would
> indeed be better.
>
It is easy to implement a global "warn_on_once".

To implement a per-domain "warn_on_once", I may need a hash table
(call it "dom_warned"), where dom_warned[domid] returns true or
false. I don't know if hash tables are supported in Xen. Or do we have
any list structure which is very good at searching?

Another way is to add a "bool" field in struct domain, but I don't
think that would be a good design.

Any ideas?

Thanks,
Chong
Dario Faggioli March 15, 2016, 4:41 p.m. UTC | #19
On Tue, 2016-03-15 at 11:22 -0500, Chong Li wrote:
> On Mon, Mar 14, 2016 at 5:05 AM, Dario Faggioli
> <dario.faggioli@citrix.com> wrote:
> > 
> > However, for this specific case, I was also considering that, and I
> > agree, if possible/not to difficult to implement, once per domain
> > would
> > indeed be better.
> > 
> It is easy to implement a global "warn_on_once".
> 
Sure.

> To implement a per-domain "warn_on_once", I may need a hash table
> (call it "dom_warned"), where dom_warned[domid] returns true or
> false. I don't know if hash tables are supported in Xen. Or do we have
> any list structure which is very good at searching?
> 
And all this just for having _this_ specific warning printed once per
domain.

No, I don't think it is worth it... Or maybe it is, but I don't think
it's fair to ask you to do it. Or maybe, if you're up for doing it,
then you're free to, but I don't think it's fair to ask for it as a
prerequisite for this patch. :-)

> Another way is to add a "bool" field in struct domain, but I don't
> think that would be a good design.
> 
Yeah, I like it even less. :-/

We said 'once' and then 'once per domain', but something I'd be fine
with (coupled with keeping G_WARNING) would be 'once per operation'.
Basically, if a domain has 128 vcpus, and an hypercall tries to set all
of them to period=100, budget=50, we just print the warning once. Then,
if after a while the sysadmin tries the same again, we again just log
once, etc.

Doing this seems much easier, as the 'warned' flag could just be a
local variable of the hypercall implementation. I'm quite sure that
would work if there is not any continuation/re-issuing mechanism in
the hypercall in question. BUT in our case there is, so things may be
more complicated... :-/

Had you thought about a solution like this already? If not, can you see
whether there is a nice and easy way to make something like what I just
described above work in our case?

If we find nothing, we'll think of alternatives, including killing the
warning.

Thanks and Regards,
Dario
Chong Li March 15, 2016, 5:22 p.m. UTC | #20
On Tue, Mar 15, 2016 at 11:41 AM, Dario Faggioli
<dario.faggioli@citrix.com> wrote:
> On Tue, 2016-03-15 at 11:22 -0500, Chong Li wrote:
>> On Mon, Mar 14, 2016 at 5:05 AM, Dario Faggioli
>> <dario.faggioli@citrix.com> wrote:
>> >
>
> We said 'once' and then 'once per domain', but something I'd be fine
> with (coupled with keeping G_WARNING) would be 'once per operation'.
> Basically, if a domain has 128 vcpus, and an hypercall tries to set all
> of them to period=100, budget=50, we just print the warning once. Then,
> if after a while the sysadmin tries the same again, we again just log
> once, etc.
>
> Doing this seems much easier, as the 'warned' flag could just be a
> local variable of the hypercall implementation. I'm quite sure that
> would work if there is not any continuation/re-issuing mechanism in
> the hypercall in question. BUT in our case there is, so things may be
> more complicated... :-/
>
> Had you thought about a solution like this already? If not, can you see
> whether there is a nice and easy way to make something like what I just
> described above work in our case?
>
How about:

We create a global variable in sched_rt.c:

    /* This variable holds its value through hypercall re-issuing.
     * When we find vcpu settings with too low a budget or period
     * (e.g., 100 us), we print a warning and set this variable
     * "true". No more warnings are printed until this variable
     * becomes false.
     */
    static bool warned;

Initialize it as "false" in rt_init(). In your example, we set
"warned = true" when we find the first vcpu whose budget is less than
100 us. Outside of the while loop, we do:

    if ( index == op->u.v.nr_vcpus ) /* no more hypercall re-issuing */
        warned = false;

Chong
Meng Xu March 16, 2016, 3:14 a.m. UTC | #21
On Tue, Mar 15, 2016 at 1:22 PM, Chong Li <lichong659@gmail.com> wrote:
> On Tue, Mar 15, 2016 at 11:41 AM, Dario Faggioli
> <dario.faggioli@citrix.com> wrote:
>> On Tue, 2016-03-15 at 11:22 -0500, Chong Li wrote:
>>> On Mon, Mar 14, 2016 at 5:05 AM, Dario Faggioli
>>> <dario.faggioli@citrix.com> wrote:
>>> >
>>
>> We said 'once' and then 'once per domain', but something I'd be fine
>> with (coupled with keeping G_WARNING) would be 'once per operation'.
>> Basically, if a domain has 128 vcpus, and an hypercall tries to set all
>> of them to period=100, budget=50, we just print the warning once. Then,
>> if after a while the sysadmin tries the same again, we again just log
>> once, etc.
>>
>> Doing this seems much easier, as the 'warned' flag could just be a
>> local variable of the hypercall implementation. I'm quite sure that
>> would work if there is not any continuation/re-issuing mechanism in
>> the hypercall in question. BUT in our case there is, so things may be
>> more complicated... :-/
>>
>> Had you thought about a solution like this already? If not, can you see
>> whether there is a nice and easy way to make something like what I just
>> described above work in our case?
>>
> How about:
>
> We create a global variable in sched_rt.c:
>
>     /* This variable holds its value through hypercall re-issuing.
>      * When we find vcpu settings with too low a budget or period
>      * (e.g., 100 us), we print a warning and set this variable
>      * "true". No more warnings are printed until this variable
>      * becomes false.
>      */
>     static bool warned;
>
> Initialize it as "false" in rt_init(). In your example, we set
> "warned = true" when we find the first vcpu whose budget is less than
> 100 us. Outside of the while loop, we do:
>
>     if ( index == op->u.v.nr_vcpus ) /* no more hypercall re-issuing */
>         warned = false;
>
Hi Chong,

I don't think creating a global variable just for the warning thing is
a good idea. Even if we do want such a variable, it should only
occur in the rt_dom_cntl() function, since it is only used in
rt_dom_cntl().
A global variable should be used "globally", shouldn't it. ;-)

Thanks,

Meng
-----------
Meng Xu
PhD Student in Computer and Information Science
University of Pennsylvania
http://www.cis.upenn.edu/~mengxu/
Chong Li March 16, 2016, 3:32 a.m. UTC | #22
On Tue, Mar 15, 2016 at 10:14 PM, Meng Xu <mengxu@cis.upenn.edu> wrote:
> On Tue, Mar 15, 2016 at 1:22 PM, Chong Li <lichong659@gmail.com> wrote:
>> On Tue, Mar 15, 2016 at 11:41 AM, Dario Faggioli
>> <dario.faggioli@citrix.com> wrote:
>>> On Tue, 2016-03-15 at 11:22 -0500, Chong Li wrote:
>>>> On Mon, Mar 14, 2016 at 5:05 AM, Dario Faggioli
>>>> <dario.faggioli@citrix.com> wrote:
>>>> >
>>>
>>> We said 'once' and then 'once per domain', but something I'd be fine
>>> with (coupled with keeping G_WARNING) would be 'once per operation'.
>>> Basically, if a domain has 128 vcpus, and a hypercall tries to set all
>>> of them to period=100, budget=50, we just print the warning once. Then,
>>> if after a while the sysadmin tries the same again, we again just log
>>> once, etc.
>>>
>>> Doing this seems much easier, as the 'warned' flag could just be a
>>> local variable of the hypercall implementation. I'm quite sure that
>>> would work if there is not any continuation/re-issuing mechanism in
>>> the hypercall in question. BUT in our case there is, so things may be
>>> more complicated... :-/
>>>
>>> Had you thought about a solution like this already? If not, can you see
>>> whether there is a nice and easy way to make something like what I just
>>> described above work in our case?
>>>
>> How about:
>>
>> We create a global variable in sched_rt.c:
>>     /* This variable holds its value through hypercall re-issuing.
>>      * When we find vcpu settings with too low a budget or period
>>      * (e.g., 100 us), we print a warning and set this variable to
>>      * "true". No more warnings are printed until it becomes false
>>      * again.
>>      */
>>     static bool warned;
>> Initialize it as "false" in rt_init(). In your example, we set
>> "warned = true" when we find the first vcpu whose budget is less than
>> 100 us. Outside of the while loop, we do:
>>     if ( index == op->u.v.nr_vcpus ) /* no more hypercall re-issuing */
>>         warned = false;
>>
> Hi Chong,
>
> I don't think creating a global variable just for the warning thing is
> a good idea. Even if we do want such a variable, it should only
> appear in the rt_dom_cntl() function, since it is only used in
> rt_dom_cntl(). A global variable should be used "globally", shouldn't
> it? ;-)
>
You're right.

If we define

   static bool warned;

at the beginning of rt_dom_cntl(), do we need to initialize it? Without
initialization, I think its default value is "false", which is just
what we need.

Chong
Meng Xu March 16, 2016, 3:43 a.m. UTC | #23
On Tue, Mar 15, 2016 at 11:32 PM, Chong Li <lichong659@gmail.com> wrote:
> On Tue, Mar 15, 2016 at 10:14 PM, Meng Xu <mengxu@cis.upenn.edu> wrote:
>> On Tue, Mar 15, 2016 at 1:22 PM, Chong Li <lichong659@gmail.com> wrote:
>>> On Tue, Mar 15, 2016 at 11:41 AM, Dario Faggioli
>>> <dario.faggioli@citrix.com> wrote:
>>>> On Tue, 2016-03-15 at 11:22 -0500, Chong Li wrote:
>>>>> On Mon, Mar 14, 2016 at 5:05 AM, Dario Faggioli
>>>>> <dario.faggioli@citrix.com> wrote:
>>>>> >
>>>>
>>>> We said 'once' and then 'once per domain', but something I'd be fine
>>>> with (coupled with keeping G_WARNING) would be 'once per operation'.
>>>> Basically, if a domain has 128 vcpus, and a hypercall tries to set all
>>>> of them to period=100, budget=50, we just print the warning once. Then,
>>>> if after a while the sysadmin tries the same again, we again just log
>>>> once, etc.
>>>>
>>>> Doing this seems much easier, as the 'warned' flag could just be a
>>>> local variable of the hypercall implementation. I'm quite sure that
>>>> would work if there is not any continuation/re-issuing mechanism in
>>>> the hypercall in question. BUT in our case there is, so things may be
>>>> more complicated... :-/
>>>>
>>>> Had you thought about a solution like this already? If not, can you see
>>>> whether there is a nice and easy way to make something like what I just
>>>> described above work in our case?
>>>>
>>> How about:
>>>
>>> We create a global variable in sched_rt.c:
>>>     /* This variable holds its value through hypercall re-issuing.
>>>      * When we find vcpu settings with too low a budget or period
>>>      * (e.g., 100 us), we print a warning and set this variable to
>>>      * "true". No more warnings are printed until it becomes false
>>>      * again.
>>>      */
>>>     static bool warned;
>>> Initialize it as "false" in rt_init(). In your example, we set
>>> "warned = true" when we find the first vcpu whose budget is less than
>>> 100 us. Outside of the while loop, we do:
>>>     if ( index == op->u.v.nr_vcpus ) /* no more hypercall re-issuing */
>>>         warned = false;
>>>
>> Hi Chong,
>>
>> I don't think creating a global variable just for the warning thing is
>> a good idea. Even if we do want such a variable, it should only
>> appear in the rt_dom_cntl() function, since it is only used in
>> rt_dom_cntl(). A global variable should be used "globally", shouldn't
>> it? ;-)
>>
> You're right.
>
> If we define
>
>    static bool warned;
>
> at the beginning of rt_dom_cntl(), do we need to initialize it? Without
> initialization, I think its default value is "false", which is just
> what we need.
>

We need to initialize any variable we are going to use, of course. We
should not rely on the compiler to give an initialized value. :-)

Meng
Dario Faggioli March 16, 2016, 8:23 a.m. UTC | #24
On Tue, 2016-03-15 at 23:43 -0400, Meng Xu wrote:
> On Tue, Mar 15, 2016 at 11:32 PM, Chong Li <lichong659@gmail.com>
> wrote:
> > > > How about:
> > > > 
> > > > We create a global variable in sched_rt.c:
> > > >     /* This variable holds its value through hypercall re-issuing.
> > > >      * When we find vcpu settings with too low a budget or period
> > > >      * (e.g., 100 us), we print a warning and set this variable to
> > > >      * "true". No more warnings are printed until it becomes false
> > > >      * again.
> > > >      */
> > > >     static bool warned;
> > > > Initialize it as "false" in rt_init(). In your example, we set
> > > > "warned = true" when we find the first vcpu whose budget is less
> > > > than 100 us. Outside of the while loop, we do:
> > > >     if ( index == op->u.v.nr_vcpus ) /* no more re-issuing */
> > > >         warned = false;
> > > > 
> > > 
> > > 
> > If we define
> > 
> >    static bool warned;
> > 
> > at the beginning of rt_dom_cntl(), do we need to initialize it?
> > Without initialization, I think its default value is "false", which
> > is just what we need.
> > 
> We need to initialize any variable we are going to use, of course. We
> should not rely on the compiler to give an initialized value. :-)
>
We need to initialize any variable that would be used uninitialized, if
we don't initialize it. :-)

However, something along the lines of a static variable was also what I
was thinking of, but I don't think it works sufficiently well to
justify introducing it. And making it work well is proving too hard to
be worth bothering with.

The reasons why I don't think it works well are: (a) there may be more
than one CPU executing this hypercall, and they'd race on the value of
the static flag; (b) if the hypercall finishes processing the first lot
of 64 vCPUs with the flag set to false, are we sure it can't be
anything other than "still false" when the new hypercall, for the next
lot of vCPUs of the same domain, is re-issued?

I continue to think that it could be useful to have this logged, but
I'm leaning toward just killing it for now (and maybe finding another
way to check and warn about the same thing or one of the effects it
produces, later).

Meng, what do you think?



> 
> Meng
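
Spelled out, the race in (a) above is, for instance, the following
interleaving (a hypothetical trace, with two CPUs running the domctl
for two different domains at the same time):

    CPU0: rt_dom_cntl(domA): finds a too-low budget, prints the warning,
          sets warned = true
    CPU1: rt_dom_cntl(domB): finds a too-low budget, sees warned == true,
          prints nothing for domB
    CPU0: reaches index == nr_vcpus, resets warned = false
    CPU1: still mid-loop, so further offending vcpus of domB warn again

so the intended "once per operation" semantics are lost as soon as two
operations overlap.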
Jan Beulich March 16, 2016, 10:48 a.m. UTC | #25
>>> On 16.03.16 at 04:43, <xumengpanda@gmail.com> wrote:
> On Tue, Mar 15, 2016 at 11:32 PM, Chong Li <lichong659@gmail.com> wrote:
>> If we define
>>
>>    static bool warned;
>>
>> at the beginning of rt_dom_cntl(), do we need to initialize it? Without
>> initialization, I think its default value is "false", which is just
>> what we need.
>>
> 
> We need to initialize any variable we are going to use, of course. We
> should not rely on the compiler to give an initialized value. :-)

Relying on implicit initialization where the language standard
requires such is more than just optional - if I see code changes
with an explicit zero initializer of a static variable, I always
reply back stating that the initializer is pointless.

Jan
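
Concretely, the point about implicit initialization is a C language
guarantee: objects with static storage duration are zero-initialized
before program startup, so the two declarations below are equivalent,
and the explicit initializer is the redundancy Jan refers to (a generic
illustration, not code from the patch):

    static bool warned;          /* guaranteed false before first use */
    static bool warned = false;  /* same effect; the "= false" is
                                  * pointless */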
Meng Xu March 16, 2016, 2:37 p.m. UTC | #26
On Wed, Mar 16, 2016 at 4:23 AM, Dario Faggioli
<dario.faggioli@citrix.com> wrote:
> On Tue, 2016-03-15 at 23:43 -0400, Meng Xu wrote:
>> On Tue, Mar 15, 2016 at 11:32 PM, Chong Li <lichong659@gmail.com>
>> wrote:
>> > > > How about:
>> > > >
>> > > > We create a global variable in sched_rt.c:
>> > > >     /* This variable holds its value through hypercall re-issuing.
>> > > >      * When we find vcpu settings with too low a budget or period
>> > > >      * (e.g., 100 us), we print a warning and set this variable to
>> > > >      * "true". No more warnings are printed until it becomes false
>> > > >      * again.
>> > > >      */
>> > > >     static bool warned;
>> > > > Initialize it as "false" in rt_init(). In your example, we set
>> > > > "warned = true" when we find the first vcpu whose budget is less
>> > > > than 100 us. Outside of the while loop, we do:
>> > > >     if ( index == op->u.v.nr_vcpus ) /* no more re-issuing */
>> > > >         warned = false;
>> > > >
>> > >
>> > >
>> > If we define
>> >
>> >    static bool warned;
>> >
>> > at the beginning of rt_dom_cntl(), do we need to initialize it?
>> > Without initialization, I think its default value is "false", which
>> > is just what we need.
>> >
>> We need to initialize any variable we are going to use, of course. We
>> should not rely on the compiler to give an initialized value. :-)
>>
> We need to initialize any variable that would be used uninitialized, if
> we don't initialize it. :-)
>
> However, something along the lines of a static variable was also what
> I was thinking of, but I don't think it works sufficiently well to
> justify introducing it. And making it work well is proving too hard to
> be worth bothering with.
>
> The reasons why I don't think it works well are: (a) there may be more
> than one CPU executing this hypercall, and they'd race on the value of
> the static flag; (b) if the hypercall finishes processing the first
> lot of 64 vCPUs with the flag set to false, are we sure it can't be
> anything other than "still false" when the new hypercall, for the next
> lot of vCPUs of the same domain, is re-issued?
>
> I continue to think that it could be useful to have this logged, but
> I'm leaning toward just killing it for now (and maybe finding another
> way to check and warn about the same thing or one of the effects it
> produces, later).
>
> Meng, what do you think?

I'm thinking that it may not be worthwhile *for now* to provide such
information, given how much effort it takes and the danger of
introducing more serious issues.

Right, a race condition occurs on the global variable, and I believe we
don't want to encounter it. So let's just not use the global variable.

We should definitely put a large warning in the wiki for the RTDS
scheduler about the parameter settings. Incorrect settings should never
crash the system, but they may lead to poorer real-time performance
than users want.

Once this patch is done, I will modify the wiki then. (Chong, could
you remind me if I happen to forget?)

Thanks,

Meng

-----------
Meng Xu
PhD Student in Computer and Information Science
University of Pennsylvania
http://www.cis.upenn.edu/~mengxu/
Chong Li March 16, 2016, 2:46 p.m. UTC | #27
On Wed, Mar 16, 2016 at 3:23 AM, Dario Faggioli
<dario.faggioli@citrix.com> wrote:
> On Tue, 2016-03-15 at 23:43 -0400, Meng Xu wrote:
>> On Tue, Mar 15, 2016 at 11:32 PM, Chong Li <lichong659@gmail.com>
>> wrote:
>> > > > How about:
>> > > >
>> > > > We create a global variable in sched_rt.c:
>> > > >     /* This variable holds its value through hypercall re-issuing.
>> > > >      * When we find vcpu settings with too low a budget or period
>> > > >      * (e.g., 100 us), we print a warning and set this variable to
>> > > >      * "true". No more warnings are printed until it becomes false
>> > > >      * again.
>> > > >      */
>> > > >     static bool warned;
>> > > > Initialize it as "false" in rt_init(). In your example, we set
>> > > > "warned = true" when we find the first vcpu whose budget is less
>> > > > than 100 us. Outside of the while loop, we do:
>> > > >     if ( index == op->u.v.nr_vcpus ) /* no more re-issuing */
>> > > >         warned = false;
>> > > >
>> > >
>> > >
>> > If we define
>> >
>> >    static bool warned;
>> >
>> > at the beginning of rt_dom_cntl(), do we need to initialize it?
>> > Without initialization, I think its default value is "false", which
>> > is just what we need.
>> >
>> We need to initialize any variable we are going to use, of course. We
>> should not rely on the compiler to give an initialized value. :-)
>>
> We need to initialize any variable that would be used uninitialized, if
> we don't initialize it. :-)
>
>> However, something along the lines of a static variable was also what
>> I was thinking of, but I don't think it works sufficiently well to
>> justify introducing it. And making it work well is proving too hard to
>> be worth bothering with.
>>
>> The reasons why I don't think it works well are: (a) there may be more
>> than one CPU executing this hypercall, and they'd race on the value of
>> the static flag; (b) if the hypercall finishes processing the first
>> lot of 64 vCPUs with the flag set to false, are we sure it can't be
>> anything other than "still false" when the new hypercall, for the next
>> lot of vCPUs of the same domain, is re-issued?
>
> I continue to think that it could be useful to have this logged, but
> I'm leaning toward just killing it for now (and maybe finding another
> way to check and warn about the same thing or one of the effects it
> produces, later).
>

By "killing it", do you mean we don't do this check nor print the
warning? Or just
print the warning globally once?

Chong
Chong Li March 16, 2016, 2:46 p.m. UTC | #28
On Wed, Mar 16, 2016 at 9:37 AM, Meng Xu <mengxu@cis.upenn.edu> wrote:
> On Wed, Mar 16, 2016 at 4:23 AM, Dario Faggioli
> <dario.faggioli@citrix.com> wrote:
>> On Tue, 2016-03-15 at 23:43 -0400, Meng Xu wrote:
>>> On Tue, Mar 15, 2016 at 11:32 PM, Chong Li <lichong659@gmail.com>
>>> wrote:
>>> > > > How about:
>>> > > >
>>> > > > We create a global variable in sched_rt.c:
>>> > > >     /* This variable holds its value through hypercall re-issuing.
>>> > > >      * When we find vcpu settings with too low a budget or period
>>> > > >      * (e.g., 100 us), we print a warning and set this variable to
>>> > > >      * "true". No more warnings are printed until it becomes false
>>> > > >      * again.
>>> > > >      */
>>> > > >     static bool warned;
>>> > > > Initialize it as "false" in rt_init(). In your example, we set
>>> > > > "warned = true" when we find the first vcpu whose budget is less
>>> > > > than 100 us. Outside of the while loop, we do:
>>> > > >     if ( index == op->u.v.nr_vcpus ) /* no more re-issuing */
>>> > > >         warned = false;
>>> > > >
>>> > >
>>> > >
>>> > If we define
>>> >
>>> >    static bool warned;
>>> >
>>> > at the beginning of rt_dom_cntl(), do we need to initialize it?
>>> > Without initialization, I think its default value is "false", which
>>> > is just what we need.
>>> >
>>> We need to initialize any variable we are going to use, of course. We
>>> should not rely on the compiler to give an initialized value. :-)
>>>
>> We need to initialize any variable that would be used uninitialized, if
>> we don't initialize it. :-)
>>
>> However, something along the lines of a static variable was also what
>> I was thinking of, but I don't think it works sufficiently well to
>> justify introducing it. And making it work well is proving too hard to
>> be worth bothering with.
>>
>> The reasons why I don't think it works well are: (a) there may be more
>> than one CPU executing this hypercall, and they'd race on the value of
>> the static flag; (b) if the hypercall finishes processing the first
>> lot of 64 vCPUs with the flag set to false, are we sure it can't be
>> anything other than "still false" when the new hypercall, for the next
>> lot of vCPUs of the same domain, is re-issued?
>>
>> I continue to think that it could be useful to have this logged, but
>> I'm leaning toward just killing it for now (and maybe finding another
>> way to check and warn about the same thing or one of the effects it
>> produces, later).
>>
>> Meng, what do you think?
>
> I'm thinking that it may not be worthwhile *for now* to provide such
> information, given how much effort it takes and the danger of
> introducing more serious issues.
>
> Right, a race condition occurs on the global variable, and I believe
> we don't want to encounter it. So let's just not use the global
> variable.
>
> We should definitely put a large warning in the wiki for the RTDS
> scheduler about the parameter settings. Incorrect settings should
> never crash the system, but they may lead to poorer real-time
> performance than users want.
>
> Once this patch is done, I will modify the wiki then. (Chong, could
> you remind me if I happen to forget?)
>
Sure, I will.

Chong
Dario Faggioli March 16, 2016, 2:53 p.m. UTC | #29
On Wed, 2016-03-16 at 10:37 -0400, Meng Xu wrote:
> On Wed, Mar 16, 2016 at 4:23 AM, Dario Faggioli
> <dario.faggioli@citrix.com> wrote:
> > 
> > I continue to think that it could be useful to have this logged,
> > but
> > I'm leaning toward just killing it for now (and maybe finding
> > another
> > way to check and warn about the same thing or one of the effects it
> > produces, later).
> > 
> > Meng, what do you think?
> I'm thinking that it may not be worthwhile *for now* to provide such
> information, given how much effort it takes and the danger of
> introducing more serious issues.
> 
Yeah, exactly what I was also saying.

> Right, a race condition occurs on the global variable, and I believe
> we don't want to encounter it. So let's just not use the global
> variable.
> 
Well, it would just affect the logging itself. But then, why clobber
the (clarity of the) code to introduce proper per-domain logging, and
end up with something that may not actually be per-domain... that's
how I was looking at it.

> We should definitely put a large warning in the wiki for the RTDS
> scheduler about the parameter settings. Incorrect settings should
> never crash the system, but they may lead to poorer real-time
> performance than users want.
> 
For example, one way of dealing with this would be to let the
user/sysadmin set, by means of SYSCTLs, the minimum budget and period
they want to be able to use.

We would then kind of have a static and a dynamic limit: the former
hardcoded in Xen to something that we know is unreasonable on any
hardware platform available at the time; the latter set to a rather
conservatively high value by default, but lowerable by users who want
that... and that is where we can print all the warnings we want!

I'm just tossing this out there, though, and this is certainly
something that can come as future work.

Regards,
Dario
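
As a rough illustration of that static-plus-dynamic idea (a sketch
only: RTDS_MIN_PERIOD is the hard floor the patch below already
defines, while rtds_min_period_dyn and the SYSCTL that would set it are
made up for this sketch):

    /* Static limit: hardcoded in Xen, unreasonably low on any current
     * hardware (this one exists in the patch below). */
    #define RTDS_MIN_PERIOD     (MICROSECS(10))

    /* Dynamic limit: conservatively high default; a (hypothetical)
     * SYSCTL would let the sysadmin lower it. */
    static s_time_t rtds_min_period_dyn = MICROSECS(100);

    if ( period < RTDS_MIN_PERIOD )
        rc = -EINVAL;                  /* below the hard floor: reject */
    else if ( period < rtds_min_period_dyn )
        printk("Warning: period below the configured minimum.\n");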
Dario Faggioli March 16, 2016, 2:54 p.m. UTC | #30
On Wed, 2016-03-16 at 09:46 -0500, Chong Li wrote:
> On Wed, Mar 16, 2016 at 3:23 AM, Dario Faggioli
> <dario.faggioli@citrix.com> wrote:
> > 
> > I continue to think that it could be useful to have this logged,
> > but
> > I'm leaning toward just killing it for now (and maybe finding
> > another
> > way to check and warn about the same thing or one of the effects it
> > produces, later).
> > 
> By "killing it", do you mean we don't do this check nor print the
> warning? Or just
> print the warning globally once?
> 
Remove the warning (and, of course, the check as well! :-)).

Dario
diff mbox

Patch

diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c
index 0dce790..455c684 100644
--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -1054,6 +1054,10 @@  csched_dom_cntl(
      * lock. Runq lock not needed anywhere in here. */
     spin_lock_irqsave(&prv->lock, flags);
 
+    if ( op->cmd == XEN_DOMCTL_SCHEDOP_putvcpuinfo ||
+         op->cmd == XEN_DOMCTL_SCHEDOP_getvcpuinfo )
+        return -EINVAL;
+
     if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
     {
         op->u.credit.weight = sdom->weight;
diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c
index 3c49ffa..c3049a0 100644
--- a/xen/common/sched_credit2.c
+++ b/xen/common/sched_credit2.c
@@ -1421,6 +1421,10 @@  csched2_dom_cntl(
      * runq lock to update csvcs. */
     spin_lock_irqsave(&prv->lock, flags);
 
+    if ( op->cmd == XEN_DOMCTL_SCHEDOP_putvcpuinfo ||
+         op->cmd == XEN_DOMCTL_SCHEDOP_getvcpuinfo )
+        return -EINVAL;
+
     if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
     {
         op->u.credit2.weight = sdom->weight;
diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
index 3f1d047..4fcbf40 100644
--- a/xen/common/sched_rt.c
+++ b/xen/common/sched_rt.c
@@ -86,6 +86,22 @@ 
 #define RTDS_DEFAULT_PERIOD     (MICROSECS(10000))
 #define RTDS_DEFAULT_BUDGET     (MICROSECS(4000))
 
+/*
+ * Max period: max delta of time type, because period is added to the time
+ * a vcpu activates, so this must not overflow.
+ * Min period: 10 us, considering the scheduling overhead (when period is
+ * too low, scheduling is invoked too frequently, causing high overhead).
+ */
+#define RTDS_MAX_PERIOD     (STIME_DELTA_MAX)
+#define RTDS_MIN_PERIOD     (MICROSECS(10))
+
+/*
+ * Min budget: 10 us, considering the scheduling overhead (when budget is
+ * consumed too fast, scheduling is invoked too frequently, causing
+ * high overhead).
+ */
+#define RTDS_MIN_BUDGET     (MICROSECS(10))
+
 #define UPDATE_LIMIT_SHIFT      10
 #define MAX_SCHEDULE            (MILLISECS(1))
 /*
@@ -1130,23 +1146,17 @@  rt_dom_cntl(
     unsigned long flags;
     int rc = 0;
 
+    xen_domctl_schedparam_vcpu_t local_sched;
+    s_time_t period, budget;
+    uint32_t index = 0;
+
     switch ( op->cmd )
     {
-    case XEN_DOMCTL_SCHEDOP_getinfo:
-        if ( d->max_vcpus > 0 )
-        {
-            spin_lock_irqsave(&prv->lock, flags);
-            svc = rt_vcpu(d->vcpu[0]);
-            op->u.rtds.period = svc->period / MICROSECS(1);
-            op->u.rtds.budget = svc->budget / MICROSECS(1);
-            spin_unlock_irqrestore(&prv->lock, flags);
-        }
-        else
-        {
-            /* If we don't have vcpus yet, let's just return the defaults. */
-            op->u.rtds.period = RTDS_DEFAULT_PERIOD;
-            op->u.rtds.budget = RTDS_DEFAULT_BUDGET;
-        }
+    case XEN_DOMCTL_SCHEDOP_getinfo: /* return the default parameters */
+        spin_lock_irqsave(&prv->lock, flags);
+        op->u.rtds.period = RTDS_DEFAULT_PERIOD / MICROSECS(1);
+        op->u.rtds.budget = RTDS_DEFAULT_BUDGET / MICROSECS(1);
+        spin_unlock_irqrestore(&prv->lock, flags);
         break;
     case XEN_DOMCTL_SCHEDOP_putinfo:
         if ( op->u.rtds.period == 0 || op->u.rtds.budget == 0 )
@@ -1163,6 +1173,96 @@  rt_dom_cntl(
         }
         spin_unlock_irqrestore(&prv->lock, flags);
         break;
+    case XEN_DOMCTL_SCHEDOP_getvcpuinfo:
+        if ( guest_handle_is_null(op->u.v.vcpus) )
+        {
+            rc = -EINVAL;
+            break;
+        }
+        while ( index < op->u.v.nr_vcpus )
+        {
+            if ( copy_from_guest_offset(&local_sched,
+                          op->u.v.vcpus, index, 1) )
+            {
+                rc = -EFAULT;
+                break;
+            }
+            if ( local_sched.vcpuid >= d->max_vcpus ||
+                          d->vcpu[local_sched.vcpuid] == NULL )
+            {
+                rc = -EINVAL;
+                break;
+            }
+
+            spin_lock_irqsave(&prv->lock, flags);
+            svc = rt_vcpu(d->vcpu[local_sched.vcpuid]);
+            local_sched.s.rtds.budget = svc->budget / MICROSECS(1);
+            local_sched.s.rtds.period = svc->period / MICROSECS(1);
+            spin_unlock_irqrestore(&prv->lock, flags);
+
+            if ( __copy_to_guest_offset(op->u.v.vcpus, index,
+                    &local_sched, 1) )
+            {
+                rc = -EFAULT;
+                break;
+            }
+            if ( (++index > 0x3f) && hypercall_preempt_check() )
+                break;
+        }
+
+        if ( !rc && (op->u.v.nr_vcpus != index) )
+            op->u.v.nr_vcpus = index;
+        break;
+    case XEN_DOMCTL_SCHEDOP_putvcpuinfo:
+        if ( guest_handle_is_null(op->u.v.vcpus) )
+        {
+            rc = -EINVAL;
+            break;
+        }
+        while ( index < op->u.v.nr_vcpus )
+        {
+            if ( copy_from_guest_offset(&local_sched,
+                          op->u.v.vcpus, index, 1) )
+            {
+                rc = -EFAULT;
+                break;
+            }
+            if ( local_sched.vcpuid >= d->max_vcpus ||
+                          d->vcpu[local_sched.vcpuid] == NULL )
+            {
+                rc = -EINVAL;
+                break;
+            }
+
+            period = MICROSECS(local_sched.s.rtds.period);
+            budget = MICROSECS(local_sched.s.rtds.budget);
+            if ( period > RTDS_MAX_PERIOD || budget < RTDS_MIN_BUDGET ||
+                          budget > period || period < RTDS_MIN_PERIOD )
+            {
+                rc = -EINVAL;
+                break;
+            }
+
+            /*
+             * We accept period/budget less than 100 us, but will warn users about
+             * the large scheduling overhead due to it
+             */
+            if ( period < MICROSECS(100) || budget < MICROSECS(100) )
+                printk("Warning: period or budget set to less than 100us.\n"
+                       "This may result in high scheduling overhead.\n");
+
+            spin_lock_irqsave(&prv->lock, flags);
+            svc = rt_vcpu(d->vcpu[local_sched.vcpuid]);
+            svc->period = period;
+            svc->budget = budget;
+            spin_unlock_irqrestore(&prv->lock, flags);
+
+            if ( (++index > 0x3f) && hypercall_preempt_check() )
+                break;
+        }
+        if ( !rc && (op->u.v.nr_vcpus != index) )
+            op->u.v.nr_vcpus = index;
+        break;
     }
 
     return rc;
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index c195129..f4a4032 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -1148,10 +1148,19 @@  long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op)
     if ( ret )
         return ret;
 
-    if ( (op->sched_id != DOM2OP(d)->sched_id) ||
-         ((op->cmd != XEN_DOMCTL_SCHEDOP_putinfo) &&
-          (op->cmd != XEN_DOMCTL_SCHEDOP_getinfo)) )
+    if ( op->sched_id != DOM2OP(d)->sched_id )
         return -EINVAL;
+    else
+        switch ( op->cmd )
+        {
+        case XEN_DOMCTL_SCHEDOP_putinfo:
+        case XEN_DOMCTL_SCHEDOP_getinfo:
+        case XEN_DOMCTL_SCHEDOP_putvcpuinfo:
+        case XEN_DOMCTL_SCHEDOP_getvcpuinfo:
+            break;
+        default:
+            return -EINVAL;
+        }
 
     /* NB: the pluggable scheduler code needs to take care
      * of locking by itself. */
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
index 7a56b3f..b9975f6 100644
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -338,24 +338,59 @@  DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_vcpus_t);
 #define XEN_SCHEDULER_ARINC653 7
 #define XEN_SCHEDULER_RTDS     8
 
-/* Set or get info? */
+typedef struct xen_domctl_sched_credit {
+    uint16_t weight;
+    uint16_t cap;
+} xen_domctl_sched_credit_t;
+
+typedef struct xen_domctl_sched_credit2 {
+    uint16_t weight;
+} xen_domctl_sched_credit2_t;
+
+typedef struct xen_domctl_sched_rtds {
+    uint32_t period;
+    uint32_t budget;
+} xen_domctl_sched_rtds_t;
+
+typedef struct xen_domctl_schedparam_vcpu {
+    union {
+        xen_domctl_sched_credit_t credit;
+        xen_domctl_sched_credit2_t credit2;
+        xen_domctl_sched_rtds_t rtds;
+    } s;
+    uint16_t vcpuid;
+    uint16_t padding[3];
+} xen_domctl_schedparam_vcpu_t;
+DEFINE_XEN_GUEST_HANDLE(xen_domctl_schedparam_vcpu_t);
+
+/*
+ * Set or get info?
+ * For schedulers supporting per-vcpu settings (e.g., RTDS):
+ *  XEN_DOMCTL_SCHEDOP_putinfo sets params for all vcpus;
+ *  XEN_DOMCTL_SCHEDOP_getinfo gets default params;
+ *  XEN_DOMCTL_SCHEDOP_put(get)vcpuinfo sets (gets) params of vcpus;
+ *
+ * For schedulers not supporting per-vcpu settings:
+ *  XEN_DOMCTL_SCHEDOP_putinfo sets params for all vcpus;
+ *  XEN_DOMCTL_SCHEDOP_getinfo gets domain-wide params;
+ *  XEN_DOMCTL_SCHEDOP_put(get)vcpuinfo returns error;
+ */
 #define XEN_DOMCTL_SCHEDOP_putinfo 0
 #define XEN_DOMCTL_SCHEDOP_getinfo 1
+#define XEN_DOMCTL_SCHEDOP_putvcpuinfo 2
+#define XEN_DOMCTL_SCHEDOP_getvcpuinfo 3
 struct xen_domctl_scheduler_op {
     uint32_t sched_id;  /* XEN_SCHEDULER_* */
     uint32_t cmd;       /* XEN_DOMCTL_SCHEDOP_* */
     union {
-        struct xen_domctl_sched_credit {
-            uint16_t weight;
-            uint16_t cap;
-        } credit;
-        struct xen_domctl_sched_credit2 {
-            uint16_t weight;
-        } credit2;
-        struct xen_domctl_sched_rtds {
-            uint32_t period;
-            uint32_t budget;
-        } rtds;
+        xen_domctl_sched_credit_t credit;
+        xen_domctl_sched_credit2_t credit2;
+        xen_domctl_sched_rtds_t rtds;
+        struct {
+            XEN_GUEST_HANDLE_64(xen_domctl_schedparam_vcpu_t) vcpus;
+            uint32_t nr_vcpus;
+            uint32_t padding;
+        } v;
     } u;
 };
 typedef struct xen_domctl_scheduler_op xen_domctl_scheduler_op_t;
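
For completeness, this is roughly how a toolstack caller would drive
the new interface for XEN_DOMCTL_SCHEDOP_getvcpuinfo (a sketch against
the structures above; the bounce-buffering and domctl submission that
libxc performs are elided):

    xen_domctl_schedparam_vcpu_t vcpus[2];
    struct xen_domctl_scheduler_op op;

    vcpus[0].vcpuid = 0;                      /* query vCPU 0 ... */
    vcpus[1].vcpuid = 1;                      /* ... and vCPU 1 */

    op.sched_id = XEN_SCHEDULER_RTDS;
    op.cmd = XEN_DOMCTL_SCHEDOP_getvcpuinfo;
    op.u.v.nr_vcpus = 2;
    /* op.u.v.vcpus must be set to a guest handle onto the array above
     * (e.g. set_xen_guest_handle() after bouncing it into
     * hypercall-safe memory). */

    /* On success the hypervisor fills in vcpus[i].s.rtds.period and
     * .budget (in microseconds); if the hypercall is preempted,
     * nr_vcpus is lowered to the number of entries actually processed,
     * per the loop in rt_dom_cntl() above. */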