diff mbox

xen: add hypercall option to temporarily pin a vcpu

Message ID 1456419000-390-1-git-send-email-jgross@suse.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jürgen Groß Feb. 25, 2016, 4:50 p.m. UTC
Some hardware (e.g. Dell studio 1555 laptops) require SMIs to be
called on physical cpu 0 only. Linux drivers like dcdbas or i8k try
to achieve this by pinning the running thread to cpu 0, but in Dom0
this is not enough: the vcpu must be pinned to physical cpu 0 via
Xen, too.

Add a stable hypercall option SCHEDOP_pin_temp to the sched_op
hypercall to achieve this. It is taking a physical cpu number as
parameter. If pinning is possible (the calling domain has the
privilege to make the call and the cpu is available in the domain's
cpupool) the calling vcpu is pinned to the specified cpu. The old
cpu affinity is saved. To undo the temporary pinning a cpu -1 is
specified. This will restore the original cpu affinity for the vcpu.

Signed-off-by: Juergen Gross <jgross@suse.com>
---
 xen/common/schedule.c      | 98 +++++++++++++++++++++++++++++++++++++++++++---
 xen/include/public/sched.h | 15 +++++++
 xen/include/xsm/dummy.h    |  6 +++
 xen/include/xsm/xsm.h      |  6 +++
 xen/xsm/dummy.c            |  1 +
 xen/xsm/flask/hooks.c      |  7 ++++
 6 files changed, 127 insertions(+), 6 deletions(-)

Comments

Jan Beulich Feb. 26, 2016, 10:39 a.m. UTC | #1
>>> On 25.02.16 at 17:50, <JGross@suse.com> wrote:
> @@ -670,7 +676,13 @@ int cpu_disable_scheduler(unsigned int cpu)
>              if ( cpumask_empty(&online_affinity) &&
>                   cpumask_test_cpu(cpu, v->cpu_hard_affinity) )
>              {
> -                printk(XENLOG_DEBUG "Breaking affinity for %pv\n", v);
> +                if ( v->affinity_broken )
> +                {
> +                    /* The vcpu is temporarily pinned, can't move it. */
> +                    vcpu_schedule_unlock_irqrestore(lock, flags, v);
> +                    ret = -EBUSY;
> +                    continue;
> +                }

So far the function can only return 0 or -EAGAIN. By using "continue"
here you will make it impossible for the caller to reliably determine
whether possibly both things failed. Despite -EBUSY being a logical
choice here, I think you'd better use -EAGAIN here too. And it needs
to be determined whether continuing the loop in this as well as the
pre-existing cases is actually the right thing to do.

> @@ -679,6 +691,8 @@ int cpu_disable_scheduler(unsigned int cpu)
>                      v->affinity_broken = 1;
>                  }
>  
> +                printk(XENLOG_DEBUG "Breaking affinity for %pv\n", v);

Wouldn't it be even better to make this the "else" to the
preceding if(), since in the suspend case this is otherwise going
to be printed for every vCPU not currently running on pCPU0?

> @@ -753,14 +767,22 @@ static int vcpu_set_affinity(
>      struct vcpu *v, const cpumask_t *affinity, cpumask_t *which)
>  {
>      spinlock_t *lock;
> +    int ret = 0;
>  
>      lock = vcpu_schedule_lock_irq(v);
>  
> -    cpumask_copy(which, affinity);
> +    if ( v->affinity_broken )
> +    {
> +        ret = -EBUSY;
> +    }

Unnecessary braces.

> @@ -979,6 +1001,53 @@ void watchdog_domain_destroy(struct domain *d)
>          kill_timer(&d->watchdog_timer[i]);
>  }
>  
> +static long do_pin_temp(int cpu)
> +{
> +    struct vcpu *v = current;
> +    spinlock_t *lock;
> +    long ret = -EINVAL;
> +
> +    lock = vcpu_schedule_lock_irq(v);
> +
> +    if ( cpu == -1 )
> +    {
> +        if ( v->affinity_broken )
> +        {
> +            cpumask_copy(v->cpu_hard_affinity, v->cpu_hard_affinity_saved);
> +            v->affinity_broken = 0;
> +            set_bit(_VPF_migrating, &v->pause_flags);
> +            ret = 0;
> +        }
> +    }
> +    else if ( cpu < nr_cpu_ids && cpu >= 0 )

Perhaps easier to simply use "cpu < 0" in the first if()?

> +    {
> +        if ( v->affinity_broken )
> +        {
> +            ret = -EBUSY;
> +        }
> +        else if ( cpumask_test_cpu(cpu, VCPU2ONLINE(v)) )
> +        {

This is a rather ugly restriction: How would a caller fulfill its job
when this is not the case?

> @@ -1088,6 +1157,23 @@ ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
>          break;
>      }
>  
> +    case SCHEDOP_pin_temp:
> +    {
> +        struct sched_pin_temp sched_pin_temp;
> +
> +        ret = -EFAULT;
> +        if ( copy_from_guest(&sched_pin_temp, arg, 1) )
> +            break;
> +
> +        ret = xsm_schedop_pin_temp(XSM_PRIV);
> +        if ( ret )
> +            break;
> +
> +        ret = do_pin_temp(sched_pin_temp.pcpu);
> +
> +        break;
> +    }

So having come here I still don't see why this is called "temp":
Nothing enforces this to be a temporary state, and hence the
sub-op name currently is actively misleading.

> --- a/xen/include/public/sched.h
> +++ b/xen/include/public/sched.h
> @@ -118,6 +118,15 @@
>   * With id != 0 and timeout != 0, poke watchdog timer and set new timeout.
>   */
>  #define SCHEDOP_watchdog    6
> +
> +/*
> + * Temporarily pin the current vcpu to one physical cpu or undo that pinning.
> + * @arg == pointer to sched_pin_temp_t structure.
> + *
> + * Setting pcpu to -1 will undo a previous temporary pinning.
> + * This call is allowed for domains with domain control privilege only.
> + */

Why domain control privilege? I'd actually suggest limiting the
ability to the hardware domain, at once eliminating the need
for the XSM check.

> +struct sched_pin_temp {
> +    int pcpu;

Fixed width types only please in the public interface. Also this needs
an entry in xen/include/xlat.lst, and a consumer of the resulting
check macro.

Jan
Jürgen Groß Feb. 26, 2016, 11:14 a.m. UTC | #2
On 26/02/16 11:39, Jan Beulich wrote:
>>>> On 25.02.16 at 17:50, <JGross@suse.com> wrote:
>> @@ -670,7 +676,13 @@ int cpu_disable_scheduler(unsigned int cpu)
>>              if ( cpumask_empty(&online_affinity) &&
>>                   cpumask_test_cpu(cpu, v->cpu_hard_affinity) )
>>              {
>> -                printk(XENLOG_DEBUG "Breaking affinity for %pv\n", v);
>> +                if ( v->affinity_broken )
>> +                {
>> +                    /* The vcpu is temporarily pinned, can't move it. */
>> +                    vcpu_schedule_unlock_irqrestore(lock, flags, v);
>> +                    ret = -EBUSY;
>> +                    continue;
>> +                }
> 
> So far the function can only return 0 or -EAGAIN. By using "continue"
> here you will make it impossible for the caller to reliably determine
> whether possibly both things failed. Despite -EBUSY being a logical
> choice here, I think you'd better use -EAGAIN here too. And it needs
> to be determined whether continuing the loop in this as well as the
> pre-existing cases is actually the right thing to do.

EBUSY vs. EAGAIN: by returning EAGAIN I would signal to Xen tools that
the hypervisor is currently not able to do the desired operation
(especially removing a cpu from a cpupool), but the situation will
change automatically via scheduling. EBUSY will stop retries in Xen
tools and this is what I want here: I can't be sure the situation
will change soon.

Regarding continuation of the loop: I think you are right in the
EBUSY case: I should break out of the loop. I should not do so in the
EAGAIN case as I want to remove as many vcpus from the physical cpu as
possible without returning to the Xen tools in between.

> 
>> @@ -679,6 +691,8 @@ int cpu_disable_scheduler(unsigned int cpu)
>>                      v->affinity_broken = 1;
>>                  }
>>  
>> +                printk(XENLOG_DEBUG "Breaking affinity for %pv\n", v);
> 
> Wouldn't it be even better to make this the "else" to the
> preceding if(), since in the suspend case this is otherwise going
> to be printed for every vCPU not currently running on pCPU0?

Yes, I'll change it.

> 
>> @@ -753,14 +767,22 @@ static int vcpu_set_affinity(
>>      struct vcpu *v, const cpumask_t *affinity, cpumask_t *which)
>>  {
>>      spinlock_t *lock;
>> +    int ret = 0;
>>  
>>      lock = vcpu_schedule_lock_irq(v);
>>  
>> -    cpumask_copy(which, affinity);
>> +    if ( v->affinity_broken )
>> +    {
>> +        ret = -EBUSY;
>> +    }
> 
> Unnecessary braces.

Will remove.

> 
>> @@ -979,6 +1001,53 @@ void watchdog_domain_destroy(struct domain *d)
>>          kill_timer(&d->watchdog_timer[i]);
>>  }
>>  
>> +static long do_pin_temp(int cpu)
>> +{
>> +    struct vcpu *v = current;
>> +    spinlock_t *lock;
>> +    long ret = -EINVAL;
>> +
>> +    lock = vcpu_schedule_lock_irq(v);
>> +
>> +    if ( cpu == -1 )
>> +    {
>> +        if ( v->affinity_broken )
>> +        {
>> +            cpumask_copy(v->cpu_hard_affinity, v->cpu_hard_affinity_saved);
>> +            v->affinity_broken = 0;
>> +            set_bit(_VPF_migrating, &v->pause_flags);
>> +            ret = 0;
>> +        }
>> +    }
>> +    else if ( cpu < nr_cpu_ids && cpu >= 0 )
> 
> Perhaps easier to simply use "cpu < 0" in the first if()?

Okay.

> 
>> +    {
>> +        if ( v->affinity_broken )
>> +        {
>> +            ret = -EBUSY;
>> +        }
>> +        else if ( cpumask_test_cpu(cpu, VCPU2ONLINE(v)) )
>> +        {
> 
> This is a rather ugly restriction: How would a caller fulfill its job
> when this is not the case?

He can't. We should document that at least on hardware requiring this
functionality it is a bad idea to remove cpu 0 from the cpupool with the
hardware domain.

> 
>> @@ -1088,6 +1157,23 @@ ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
>>          break;
>>      }
>>  
>> +    case SCHEDOP_pin_temp:
>> +    {
>> +        struct sched_pin_temp sched_pin_temp;
>> +
>> +        ret = -EFAULT;
>> +        if ( copy_from_guest(&sched_pin_temp, arg, 1) )
>> +            break;
>> +
>> +        ret = xsm_schedop_pin_temp(XSM_PRIV);
>> +        if ( ret )
>> +            break;
>> +
>> +        ret = do_pin_temp(sched_pin_temp.pcpu);
>> +
>> +        break;
>> +    }
> 
> So having come here I still don't see why this is called "temp":
> Nothing enforces this to be a temporary state, and hence the
> sub-op name currently is actively misleading.

I've chosen this name as the old affinity is saved and can (and should)
be recovered later. So it is intended to be temporary.

>> --- a/xen/include/public/sched.h
>> +++ b/xen/include/public/sched.h
>> @@ -118,6 +118,15 @@
>>   * With id != 0 and timeout != 0, poke watchdog timer and set new timeout.
>>   */
>>  #define SCHEDOP_watchdog    6
>> +
>> +/*
>> + * Temporarily pin the current vcpu to one physical cpu or undo that pinning.
>> + * @arg == pointer to sched_pin_temp_t structure.
>> + *
>> + * Setting pcpu to -1 will undo a previous temporary pinning.
>> + * This call is allowed for domains with domain control privilege only.
>> + */
> 
> Why domain control privilege? I'd actually suggest limiting the
> ability to the hardware domain, at once eliminating the need
> for the XSM check.

Sure, I'd be happy to simplify the patch.

> 
>> +struct sched_pin_temp {
>> +    int pcpu;
> 
> Fixed width types only please in the public interface. Also this needs
> an entry in xen/include/xlat.lst, and a consumer of the resulting
> check macro.

Aah, okay.

Thanks for the review,

Juergen
Dario Faggioli Feb. 26, 2016, 11:20 a.m. UTC | #3
On Fri, 2016-02-26 at 12:14 +0100, Juergen Gross wrote:
> On 26/02/16 11:39, Jan Beulich wrote:
> > 
> > > @@ -670,7 +676,13 @@ int cpu_disable_scheduler(unsigned int cpu)
> > >              if ( cpumask_empty(&online_affinity) &&
> > >                   cpumask_test_cpu(cpu, v->cpu_hard_affinity) )
> > >              {
> > > -                printk(XENLOG_DEBUG "Breaking affinity for
> > > %pv\n", v);
> > > +                if ( v->affinity_broken )
> > > +                {
> > > +                    /* The vcpu is temporarily pinned, can't
> > > move it. */
> > > +                    vcpu_schedule_unlock_irqrestore(lock, flags,
> > > v);
> > > +                    ret = -EBUSY;
> > > +                    continue;
> > > +                }
> > So far the function can only return 0 or -EAGAIN. By using
> > "continue"
> > here you will make it impossible for the caller to reliably
> > determine
> > whether possibly both things failed. Despite -EBUSY being a logical
> > choice here, I think you'd better use -EAGAIN here too. And it
> > needs
> > to be determined whether continuing the loop in this as well as the
> > pre-existing cases is actually the right thing to do.
> EBUSY vs. EAGAIN: by returning EAGAIN I would signal to Xen tools
> that
> the hypervisor is currently not able to do the desired operation
> (especially removing a cpu from a cpupool), but the situation will
> change automatically via scheduling. EBUSY will stop retries in Xen
> tools and this is what I want here: I can't be sure the situation
> will change soon.
> 
I agree with this.

> Regarding continuation of the loop: I think you are right in the
> EBUSY case: I should break out of the loop. I should not do so in the
> EAGAIN case as I want to remove as many vcpus from the physical cpu
> as
> possible without returning to the Xen tools in between.
> 
And with this too.

And I think that, if we indeed break out of the loop on EBUSY, that
will also make it possible to figure out properly what actually went
wrong, so it should be fine from that point of view as well.

> > > @@ -679,6 +691,8 @@ int cpu_disable_scheduler(unsigned int cpu)
> > >                      v->affinity_broken = 1;
> > >                  }
> > >  
> > > +                printk(XENLOG_DEBUG "Breaking affinity for
> > > %pv\n", v);
> > Wouldn't it be even better to make this the "else" to the
> > preceding if(), since in the suspend case this is otherwise going
> > to be printed for every vCPU not currently running on pCPU0?
> Yes, I'll change it.
> 
On this, can (either of) you elaborate a bit more? I don't think I'm
following...

Thanks and Regards,
Dario
Jürgen Groß Feb. 26, 2016, 11:43 a.m. UTC | #4
On 26/02/16 12:20, Dario Faggioli wrote:
> On Fri, 2016-02-26 at 12:14 +0100, Juergen Gross wrote:
>> On 26/02/16 11:39, Jan Beulich wrote:
>>>> @@ -679,6 +691,8 @@ int cpu_disable_scheduler(unsigned int cpu)
>>>>                      v->affinity_broken = 1;
>>>>                  }
>>>>  
>>>> +                printk(XENLOG_DEBUG "Breaking affinity for
>>>> %pv\n", v);
>>> Wouldn't it be even better to make this the "else" to the
>>> preceding if(), since in the suspend case this is otherwise going
>>> to be printed for every vCPU not currently running on pCPU0?
>> Yes, I'll change it.
>>
> On this, can (either of) you elaborate a bit more? I don't think I'm
> following...

In the suspend case the affinity will be broken only temporarily, so
there is no need to print the debug message.


juergen
Jan Beulich Feb. 26, 2016, 12:39 p.m. UTC | #5
>>> On 26.02.16 at 12:20, <dario.faggioli@citrix.com> wrote:
> On Fri, 2016-02-26 at 12:14 +0100, Juergen Gross wrote:
>> On 26/02/16 11:39, Jan Beulich wrote:
>> > 
>> > > @@ -670,7 +676,13 @@ int cpu_disable_scheduler(unsigned int cpu)
>> > >              if ( cpumask_empty(&online_affinity) &&
>> > >                   cpumask_test_cpu(cpu, v->cpu_hard_affinity) )
>> > >              {
>> > > -                printk(XENLOG_DEBUG "Breaking affinity for
>> > > %pv\n", v);
>> > > +                if ( v->affinity_broken )
>> > > +                {
>> > > +                    /* The vcpu is temporarily pinned, can't
>> > > move it. */
>> > > +                    vcpu_schedule_unlock_irqrestore(lock, flags,
>> > > v);
>> > > +                    ret = -EBUSY;
>> > > +                    continue;
>> > > +                }
>> > So far the function can only return 0 or -EAGAIN. By using
>> > "continue"
>> > here you will make it impossible for the caller to reliably
>> > determine
>> > whether possibly both things failed. Despite -EBUSY being a logical
>> > choice here, I think you'd better use -EAGAIN here too. And it
>> > needs
>> > to be determined whether continuing the loop in this as well as the
>> > pre-existing cases is actually the right thing to do.
>> EBUSY vs. EAGAIN: by returning EAGAIN I would signal to Xen tools
>> that
>> the hypervisor is currently not able to do the desired operation
>> (especially removing a cpu from a cpupool), but the situation will
>> change automatically via scheduling. EBUSY will stop retries in Xen
>> tools and this is what I want here: I can't be sure the situation
>> will change soon.
>> 
> I agree with this.

I'm of two minds here: I can see your viewpoint, but considering
this is called "temporarily pin a vcpu" the condition is supposed to
be going away again soon.

>> Regarding continuation of the loop: I think you are right in the
>> EBUSY case: I should break out of the loop. I should not do so in the
>> EAGAIN case as I want to remove as many vcpus from the physical cpu
>> as
>> possible without returning to the Xen tools in between.
>> 
> And with this too.
> 
> And I think that, if we indeed break out of the loop on EBUSY, that
> will also make it possible to figure out properly what actually went
> wrong, so it should be fine from that point of view as well.

Yes indeed.

>> > > @@ -679,6 +691,8 @@ int cpu_disable_scheduler(unsigned int cpu)
>> > >                      v->affinity_broken = 1;
>> > >                  }
>> > >  
>> > > +                printk(XENLOG_DEBUG "Breaking affinity for
>> > > %pv\n", v);
>> > Wouldn't it be even better to make this the "else" to the
>> > preceding if(), since in the suspend case this is otherwise going
>> > to be printed for every vCPU not currently running on pCPU0?
>> Yes, I'll change it.
>> 
> On this, can (either of) you elaborate a bit more? I don't think I'm
> following...

In addition to Jürgen's reply: My main concern here is that on
a big system this message would get printed for almost every
vCPU in the system, which could end up being a lot of noise.

And there's a similar message on the resume side I think -
perhaps that one should be silenced too.

Jan
Jürgen Groß Feb. 26, 2016, 12:49 p.m. UTC | #6
On 26/02/16 13:39, Jan Beulich wrote:
>>>> On 26.02.16 at 12:20, <dario.faggioli@citrix.com> wrote:
>> On Fri, 2016-02-26 at 12:14 +0100, Juergen Gross wrote:
>>> On 26/02/16 11:39, Jan Beulich wrote:
>>>>
>>>>> @@ -670,7 +676,13 @@ int cpu_disable_scheduler(unsigned int cpu)
>>>>>              if ( cpumask_empty(&online_affinity) &&
>>>>>                   cpumask_test_cpu(cpu, v->cpu_hard_affinity) )
>>>>>              {
>>>>> -                printk(XENLOG_DEBUG "Breaking affinity for
>>>>> %pv\n", v);
>>>>> +                if ( v->affinity_broken )
>>>>> +                {
>>>>> +                    /* The vcpu is temporarily pinned, can't
>>>>> move it. */
>>>>> +                    vcpu_schedule_unlock_irqrestore(lock, flags,
>>>>> v);
>>>>> +                    ret = -EBUSY;
>>>>> +                    continue;
>>>>> +                }
>>>> So far the function can only return 0 or -EAGAIN. By using
>>>> "continue"
>>>> here you will make it impossible for the caller to reliably
>>>> determine
>>>> whether possibly both things failed. Despite -EBUSY being a logical
>>>> choice here, I think you'd better use -EAGAIN here too. And it
>>>> needs
>>>> to be determined whether continuing the loop in this as well as the
>>>> pre-existing cases is actually the right thing to do.
>>> EBUSY vs. EAGAIN: by returning EAGAIN I would signal to Xen tools
>>> that
>>> the hypervisor is currently not able to do the desired operation
>>> (especially removing a cpu from a cpupool), but the situation will
>>> change automatically via scheduling. EBUSY will stop retries in Xen
>>> tools and this is what I want here: I can't be sure the situation
>>> will change soon.
>>>
>> I agree with this.
> 
> I'm of two minds here: I can see your viewpoint, but considering
> this is called "temporarily pin a vcpu" the condition is supposed to
> be going away again soon.

It is supposed to do so, yes. But the hypervisor can't make sure it
will, as it requires an appropriate hypercall by the hardware domain.
In the cpupool case no domain is capable to make the situation
persist.

Would you be fine with adding a tools patch doing a limited number
of retries in the EBUSY case (maybe with sleeping 1 second in that
case)?

> 
>>> Regarding continuation of the loop: I think you are right in the
>>> EBUSY case: I should break out of the loop. I should not do so in the
>>> EAGAIN case as I want to remove as many vcpus from the physical cpu
>>> as
>>> possible without returning to the Xen tools in between.
>>>
>> And with this too.
>>
>> And I think that, if we indeed break out of the loop on EBUSY, that
>> will also make it possible to figure out properly what actually went
>> wrong, so it should be fine from that point of view as well.
> 
> Yes indeed.
> 
>>>>> @@ -679,6 +691,8 @@ int cpu_disable_scheduler(unsigned int cpu)
>>>>>                      v->affinity_broken = 1;
>>>>>                  }
>>>>>  
>>>>> +                printk(XENLOG_DEBUG "Breaking affinity for
>>>>> %pv\n", v);
>>>> Wouldn't it be even better to make this the "else" to the
>>>> preceding if(), since in the suspend case this is otherwise going
>>>> to be printed for every vCPU not currently running on pCPU0?
>>> Yes, I'll change it.
>>>
>> On this, can (either of) you elaborate a bit more? I don't think I'm
>> following...
> 
> In addition to Jürgen's reply: My main concern here is that on
> a big system this message would get printed for almost every
> vCPU in the system, which could end up being a lot of noise.
> 
> And there's a similar message on the resume side I think -
> perhaps that one should be silenced too.

Okay. I'll do the silencing (both cases) in an extra patch.


Juergen
Dario Faggioli Feb. 26, 2016, 1:07 p.m. UTC | #7
On Fri, 2016-02-26 at 05:39 -0700, Jan Beulich wrote:
> > 
> > > > On 26.02.16 at 12:20, <dario.faggioli@citrix.com> wrote:
> > On Fri, 2016-02-26 at 12:14 +0100, Juergen Gross wrote:
> > > 
> > > EBUSY vs. EAGAIN: by returning EAGAIN I would signal to Xen tools
> > > that
> > > the hypervisor is currently not able to do the desired operation
> > > (especially removing a cpu from a cpupool), but the situation
> > > will
> > > change automatically via scheduling. EBUSY will stop retries in
> > > Xen
> > > tools and this is what I want here: I can't be sure the situation
> > > will change soon.
> > > 
> > I agree with this.
> I'm of two minds here: I can see your viewpoint, but considering
> this is called "temporarily pin a vcpu" the condition is supposed to
> be going away again soon.
> 
Maybe one difference is that it won't go away "by itself". I.e., just
retrying in a while, especially if from inside Xen, and without anyone
explicitly calling the hypercall again with proper argument, nothing
will change.

> > > > Wouldn't it be even better to make this the "else" to the
> > > > preceding if(), since in the suspend case this is otherwise
> > > > going
> > > > to be printed for every vCPU not currently running on pCPU0?
> > > Yes, I'll change it.
> > > 
> > On this, can (either of) you elaborate a bit more? I don't think
> > I'm
> > following...
> In addition to Jürgen's reply: My main concern here is that on
> a big system this message would get printed for almost every
> vCPU in the system, which could end up being a lot of noise.
> 
> And there's a similar message on the resume side I think -
> perhaps that one should be silenced too.
> 
What I don't understand is this part of your first comment "in the
suspend case this is otherwise going to be printed for every vCPU not
currently running on pCPU0".

First, do you mean with Juergen's patch, or even right now?

And anyway, this is going to be printed for all the vCPUs that do not
have, in their hard affinity, any of the pCPUs that are going to remain
online (or to remain in the domain's cpupool).

In shutdown and suspend, when we try to move everything to pCPU 0, it
will get printed for all the vCPUs that do not have pCPU 0 in their
hard affinity.

We can argue about that being useful or not, and about it being
(potentially) too noisy or not. I personally think it could be useful
(it's XENLOG_DEBUG, after all), but I won't oppose getting rid of it...
I am just not getting why you're saying "not currently running on
pCPU0".

Thanks and Regards,
Dario
Jan Beulich Feb. 26, 2016, 1:32 p.m. UTC | #8
>>> On 26.02.16 at 14:07, <dario.faggioli@citrix.com> wrote:
> On Fri, 2016-02-26 at 05:39 -0700, Jan Beulich wrote:
>> > > > On 26.02.16 at 12:20, <dario.faggioli@citrix.com> wrote:
>> > On Fri, 2016-02-26 at 12:14 +0100, Juergen Gross wrote:
>> > > > Wouldn't it be even better to make this the "else" to the
>> > > > preceding if(), since in the suspend case this is otherwise
>> > > > going
>> > > > to be printed for every vCPU not currently running on pCPU0?
>> > > Yes, I'll change it.
>> > > 
>> > On this, can (either of) you elaborate a bit more? I don't think
>> > I'm
>> > following...
>> In addition to Jürgen's reply: My main concern here is that on
>> a big system this message would get printed for almost every
>> vCPU in the system, which could end up being a lot of noise.
>> 
>> And there's a similar message on the resume side I think -
>> perhaps that one should be silenced too.
>> 
> What I don't understand is this part of your first comment "in the
> suspend case this is otherwise going to be printed for every vCPU not
> currently running on pCPU0".
> 
> First, do you mean with Juergen's patch, or even right now?

Even right now. Just that his patch made this pretty obvious.

> And anyway, this is going to be printed for all the vCPUs that do not
> have, in their hard affinity, any of the pCPUs that are going to remain
> online (or to remain in the domain's cpupool).
> 
> In shutdown and suspend, when we try to move everything to pCPU 0, it
> will get printed for all the vCPUs that does not have pCPU 0 in their
> hard affinity.
> 
> We can argue about that being useful or not, and about it being
> (potentially) too noisy or not. I personally think it could be useful
> (it's XENLOG_DEBUG, after all), but I won't oppose getting rid of it...
> I am just not getting why you're saying "not currently running on
> pCPU0".

Oh, you're right, that was too strict - "not being allowed to run
on CPU0" would be the right description. And indeed that makes
it look not as noisy (but too much, since during suspend this is
what one has to expect would happen).

Jan
Jan Beulich Feb. 26, 2016, 1:34 p.m. UTC | #9
>>> On 26.02.16 at 13:49, <JGross@suse.com> wrote:
> On 26/02/16 13:39, Jan Beulich wrote:
>>>>> On 26.02.16 at 12:20, <dario.faggioli@citrix.com> wrote:
>>> On Fri, 2016-02-26 at 12:14 +0100, Juergen Gross wrote:
>>>> On 26/02/16 11:39, Jan Beulich wrote:
>>>>>
>>>>>> @@ -670,7 +676,13 @@ int cpu_disable_scheduler(unsigned int cpu)
>>>>>>              if ( cpumask_empty(&online_affinity) &&
>>>>>>                   cpumask_test_cpu(cpu, v->cpu_hard_affinity) )
>>>>>>              {
>>>>>> -                printk(XENLOG_DEBUG "Breaking affinity for
>>>>>> %pv\n", v);
>>>>>> +                if ( v->affinity_broken )
>>>>>> +                {
>>>>>> +                    /* The vcpu is temporarily pinned, can't
>>>>>> move it. */
>>>>>> +                    vcpu_schedule_unlock_irqrestore(lock, flags,
>>>>>> v);
>>>>>> +                    ret = -EBUSY;
>>>>>> +                    continue;
>>>>>> +                }
>>>>> So far the function can only return 0 or -EAGAIN. By using
>>>>> "continue"
>>>>> here you will make it impossible for the caller to reliably
>>>>> determine
>>>>> whether possibly both things failed. Despite -EBUSY being a logical
>>>>> choice here, I think you'd better use -EAGAIN here too. And it
>>>>> needs
>>>>> to be determined whether continuing the loop in this as well as the
>>>>> pre-existing cases is actually the right thing to do.
>>>> EBUSY vs. EAGAIN: by returning EAGAIN I would signal to Xen tools
>>>> that
>>>> the hypervisor is currently not able to do the desired operation
>>>> (especially removing a cpu from a cpupool), but the situation will
>>>> change automatically via scheduling. EBUSY will stop retries in Xen
>>>> tools and this is want I want here: I can't be sure the situation
>>>> will change soon.
>>>>
>>> I agree with this.
>> 
>> I'm of two minds here: I can see your viewpoint, but considering
>> this is called "temporarily pin a vcpu" the condition is supposed to
>> be going away again soon.
> 
> It is supposed to do so, yes. But the hypervisor can't make sure it
> will, as it requires an appropriate hypercall by the hardware domain.
> In the cpupool case no domain is capable to make the situation
> persist.
> 
> Would you be fine with adding a tools patch doing a limited number
> of retries in the EBUSY case (maybe with sleeping 1 second in that
> case)?

That would make me worry less, yes.

Jan
Dario Faggioli Feb. 26, 2016, 1:39 p.m. UTC | #10
On Fri, 2016-02-26 at 06:32 -0700, Jan Beulich wrote:
> > > > On 26.02.16 at 14:07, <dario.faggioli@citrix.com> wrote:
> > We can argue about that being useful or not, and about it being
> > (potentially) too noisy or not. I personally think it could be
> > useful
> > (it's XENLOG_DEBUG, after all), but I won't oppose getting rid of
> > it...
> > I am just not getting why you're saying "not currently running on
> > pCPU0".
> Oh, you're right, that was too strict - "not being allowed to run
> on CPU0" would be the right description. And indeed that makes
> it look not as noisy (but too much, since during suspend this is
> what one has to expect would happen).
>
Yes, mostly on the ground that it's the intended behavior and (in that
case really) temporary, I'm indeed ok silencing this on suspend.

Since it's pretty independent, I'd prefer this to be done in a separate
patch, together with the resume side, as (AFAIUI) Juergen is planning
to do already.

Dario
diff mbox

Patch

diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 434dcfc..ddb5989 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -271,6 +271,12 @@  int sched_move_domain(struct domain *d, struct cpupool *c)
     struct scheduler *old_ops;
     void *old_domdata;
 
+    for_each_vcpu ( d, v )
+    {
+        if ( v->affinity_broken )
+            return -EBUSY;
+    }
+
     domdata = SCHED_OP(c->sched, alloc_domdata, d);
     if ( domdata == NULL )
         return -ENOMEM;
@@ -670,7 +676,13 @@  int cpu_disable_scheduler(unsigned int cpu)
             if ( cpumask_empty(&online_affinity) &&
                  cpumask_test_cpu(cpu, v->cpu_hard_affinity) )
             {
-                printk(XENLOG_DEBUG "Breaking affinity for %pv\n", v);
+                if ( v->affinity_broken )
+                {
+                    /* The vcpu is temporarily pinned, can't move it. */
+                    vcpu_schedule_unlock_irqrestore(lock, flags, v);
+                    ret = -EBUSY;
+                    continue;
+                }
 
                 if (system_state == SYS_STATE_suspend)
                 {
@@ -679,6 +691,8 @@  int cpu_disable_scheduler(unsigned int cpu)
                     v->affinity_broken = 1;
                 }
 
+                printk(XENLOG_DEBUG "Breaking affinity for %pv\n", v);
+
                 cpumask_setall(v->cpu_hard_affinity);
             }
 
@@ -753,14 +767,22 @@  static int vcpu_set_affinity(
     struct vcpu *v, const cpumask_t *affinity, cpumask_t *which)
 {
     spinlock_t *lock;
+    int ret = 0;
 
     lock = vcpu_schedule_lock_irq(v);
 
-    cpumask_copy(which, affinity);
+    if ( v->affinity_broken )
+    {
+        ret = -EBUSY;
+    }
+    else
+    {
+        cpumask_copy(which, affinity);
 
-    /* Always ask the scheduler to re-evaluate placement
-     * when changing the affinity */
-    set_bit(_VPF_migrating, &v->pause_flags);
+        /* Always ask the scheduler to re-evaluate placement
+         * when changing the affinity */
+        set_bit(_VPF_migrating, &v->pause_flags);
+    }
 
     vcpu_schedule_unlock_irq(lock, v);
 
@@ -772,7 +794,7 @@  static int vcpu_set_affinity(
         vcpu_migrate(v);
     }
 
-    return 0;
+    return ret;
 }
 
 int vcpu_set_hard_affinity(struct vcpu *v, const cpumask_t *affinity)
@@ -979,6 +1001,53 @@  void watchdog_domain_destroy(struct domain *d)
         kill_timer(&d->watchdog_timer[i]);
 }
 
+static long do_pin_temp(int cpu)
+{
+    struct vcpu *v = current;
+    spinlock_t *lock;
+    long ret = -EINVAL;
+
+    lock = vcpu_schedule_lock_irq(v);
+
+    if ( cpu == -1 )
+    {
+        if ( v->affinity_broken )
+        {
+            cpumask_copy(v->cpu_hard_affinity, v->cpu_hard_affinity_saved);
+            v->affinity_broken = 0;
+            set_bit(_VPF_migrating, &v->pause_flags);
+            ret = 0;
+        }
+    }
+    else if ( cpu >= 0 && cpu < nr_cpu_ids )
+    {
+        if ( v->affinity_broken )
+        {
+            ret = -EBUSY;
+        }
+        else if ( cpumask_test_cpu(cpu, VCPU2ONLINE(v)) )
+        {
+            cpumask_copy(v->cpu_hard_affinity_saved, v->cpu_hard_affinity);
+            v->affinity_broken = 1;
+            cpumask_copy(v->cpu_hard_affinity, cpumask_of(cpu));
+            set_bit(_VPF_migrating, &v->pause_flags);
+            ret = 0;
+        }
+    }
+
+    vcpu_schedule_unlock_irq(lock, v);
+
+    domain_update_node_affinity(v->domain);
+
+    if ( v->pause_flags & VPF_migrating )
+    {
+        vcpu_sleep_nosync(v);
+        vcpu_migrate(v);
+    }
+
+    return ret;
+}
+
 typedef long ret_t;
 
 #endif /* !COMPAT */
@@ -1088,6 +1157,23 @@  ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
         break;
     }
 
+    case SCHEDOP_pin_temp:
+    {
+        struct sched_pin_temp sched_pin_temp;
+
+        ret = -EFAULT;
+        if ( copy_from_guest(&sched_pin_temp, arg, 1) )
+            break;
+
+        ret = xsm_schedop_pin_temp(XSM_PRIV);
+        if ( ret )
+            break;
+
+        ret = do_pin_temp(sched_pin_temp.pcpu);
+
+        break;
+    }
+
     default:
         ret = -ENOSYS;
     }
diff --git a/xen/include/public/sched.h b/xen/include/public/sched.h
index 2219696..acc18d5 100644
--- a/xen/include/public/sched.h
+++ b/xen/include/public/sched.h
@@ -118,6 +118,15 @@ 
  * With id != 0 and timeout != 0, poke watchdog timer and set new timeout.
  */
 #define SCHEDOP_watchdog    6
+
+/*
+ * Temporarily pin the current vcpu to one physical cpu or undo that pinning.
+ * @arg == pointer to sched_pin_temp_t structure.
+ *
+ * Setting pcpu to -1 will undo a previous temporary pinning.
+ * This call is allowed for domains with domain control privilege only.
+ */
+#define SCHEDOP_pin_temp    7
 /* ` } */
 
 struct sched_shutdown {
@@ -148,6 +157,12 @@  struct sched_watchdog {
 typedef struct sched_watchdog sched_watchdog_t;
 DEFINE_XEN_GUEST_HANDLE(sched_watchdog_t);
 
+struct sched_pin_temp {
+    int32_t pcpu;
+};
+typedef struct sched_pin_temp sched_pin_temp_t;
+DEFINE_XEN_GUEST_HANDLE(sched_pin_temp_t);
+
 /*
  * Reason codes for SCHEDOP_shutdown. These may be interpreted by control
  * software to determine the appropriate action. For the most part, Xen does
diff --git a/xen/include/xsm/dummy.h b/xen/include/xsm/dummy.h
index 1d13826..730e112 100644
--- a/xen/include/xsm/dummy.h
+++ b/xen/include/xsm/dummy.h
@@ -240,6 +240,12 @@  static XSM_INLINE int xsm_schedop_shutdown(XSM_DEFAULT_ARG struct domain *d1, st
     return xsm_default_action(action, d1, d2);
 }
 
+static XSM_INLINE int xsm_schedop_pin_temp(XSM_DEFAULT_VOID)
+{
+    XSM_ASSERT_ACTION(XSM_PRIV);
+    return xsm_default_action(action, current->domain, NULL);
+}
+
 static XSM_INLINE int xsm_memory_pin_page(XSM_DEFAULT_ARG struct domain *d1, struct domain *d2,
                                           struct page_info *page)
 {
diff --git a/xen/include/xsm/xsm.h b/xen/include/xsm/xsm.h
index 3afed70..ac6487f 100644
--- a/xen/include/xsm/xsm.h
+++ b/xen/include/xsm/xsm.h
@@ -102,6 +102,7 @@  struct xsm_operations {
 
     int (*kexec) (void);
     int (*schedop_shutdown) (struct domain *d1, struct domain *d2);
+    int (*schedop_pin_temp) (void);
 
     char *(*show_irq_sid) (int irq);
     int (*map_domain_pirq) (struct domain *d);
@@ -413,6 +414,11 @@  static inline int xsm_schedop_shutdown (xsm_default_t def, struct domain *d1, st
     return xsm_ops->schedop_shutdown(d1, d2);
 }
 
+static inline int xsm_schedop_pin_temp(xsm_default_t def)
+{
+    return xsm_ops->schedop_pin_temp();
+}
+
 static inline char *xsm_show_irq_sid (int irq)
 {
     return xsm_ops->show_irq_sid(irq);
diff --git a/xen/xsm/dummy.c b/xen/xsm/dummy.c
index 0f32636..2df1167 100644
--- a/xen/xsm/dummy.c
+++ b/xen/xsm/dummy.c
@@ -75,6 +75,7 @@  void xsm_fixup_ops (struct xsm_operations *ops)
 
     set_to_dummy_if_null(ops, kexec);
     set_to_dummy_if_null(ops, schedop_shutdown);
+    set_to_dummy_if_null(ops, schedop_pin_temp);
 
     set_to_dummy_if_null(ops, show_irq_sid);
     set_to_dummy_if_null(ops, map_domain_pirq);
diff --git a/xen/xsm/flask/hooks.c b/xen/xsm/flask/hooks.c
index 4813623..5cfbc30 100644
--- a/xen/xsm/flask/hooks.c
+++ b/xen/xsm/flask/hooks.c
@@ -470,6 +470,12 @@  static int flask_schedop_shutdown(struct domain *d1, struct domain *d2)
     return domain_has_perm(d1, d2, SECCLASS_DOMAIN, DOMAIN__SHUTDOWN);
 }
 
+static int flask_schedop_pin_temp(void)
+{
+    return current_has_perm(current->domain, SECCLASS_DOMAIN,
+                            DOMAIN__SETAFFINITY);
+}
+
 static void flask_security_domaininfo(struct domain *d, 
                                       struct xen_domctl_getdomaininfo *info)
 {
@@ -1669,6 +1675,7 @@  static struct xsm_operations flask_ops = {
 
     .kexec = flask_kexec,
     .schedop_shutdown = flask_schedop_shutdown,
+    .schedop_pin_temp = flask_schedop_pin_temp,
 
     .show_irq_sid = flask_show_irq_sid,