
[v3,35/47] xen/sched: make vcpu_wake() and vcpu_sleep() core scheduling aware

Message ID 20190914085251.18816-36-jgross@suse.com (mailing list archive)
State Superseded
Series xen: add core scheduling support

Commit Message

Jürgen Groß Sept. 14, 2019, 8:52 a.m. UTC
vcpu_wake() and vcpu_sleep() need to be made core scheduling aware:
they might need to switch a single vcpu of an already scheduled unit
between running and not running.

Especially when vcpu_sleep() for a vcpu is being called by a vcpu of
the same scheduling unit, special care must be taken in order to
avoid a deadlock: the vcpu to be put to sleep must be forced through
a context switch without doing so for the calling vcpu. For this
purpose add a vcpu flag handled in sched_slave() and in
sched_wait_rendezvous_in() allowing a vcpu of the currently running
unit to switch state at a higher priority than a normal schedule
event.

Use the same mechanism when waking up a vcpu of a currently active
unit.
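
For illustration, the core of the wake-up side (condensed from the hunks
in the patch further down; only identifiers visible in the diff are used)
looks roughly like this:

    if ( likely(vcpu_runnable(v)) )
    {
        ...
        sched_wake(vcpu_scheduler(v), unit);
        /* The unit is already running but this vcpu is not: instead of
         * rescheduling the whole unit, flag the vcpu and kick the slave
         * softirq so only this vcpu is forced through a context switch. */
        if ( unit->is_running && !v->is_running && !v->force_context_switch )
        {
            v->force_context_switch = true;
            cpu_raise_softirq(v->processor, SCHED_SLAVE_SOFTIRQ);
        }
    }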

While at it make vcpu_sleep_nosync_locked() static as it is used in
schedule.c only.

Signed-off-by: Juergen Gross <jgross@suse.com>
---
RFC V2: add vcpu_sleep() handling and force_context_switch flag
V2: fix runstate change in sched_force_context_switch()
---
 xen/common/schedule.c      | 125 ++++++++++++++++++++++++++++++++++++++++++---
 xen/include/xen/sched-if.h |   9 ++--
 xen/include/xen/sched.h    |   2 +
 3 files changed, 127 insertions(+), 9 deletions(-)

Comments

Jan Beulich Sept. 24, 2019, 11:55 a.m. UTC | #1
On 14.09.2019 10:52, Juergen Gross wrote:
> --- a/xen/common/schedule.c
> +++ b/xen/common/schedule.c
> @@ -724,8 +724,10 @@ void sched_destroy_domain(struct domain *d)
>      }
>  }
>  
> -void vcpu_sleep_nosync_locked(struct vcpu *v)
> +static void vcpu_sleep_nosync_locked(struct vcpu *v)
>  {
> +    struct sched_unit *unit = v->sched_unit;
> +
>      ASSERT(spin_is_locked(get_sched_res(v->processor)->schedule_lock));
>  
>      if ( likely(!vcpu_runnable(v)) )
> @@ -733,7 +735,14 @@ void vcpu_sleep_nosync_locked(struct vcpu *v)
>          if ( v->runstate.state == RUNSTATE_runnable )
>              vcpu_runstate_change(v, RUNSTATE_offline, NOW());
>  
> -        sched_sleep(vcpu_scheduler(v), v->sched_unit);
> +        if ( likely(!unit_runnable(unit)) )
> +            sched_sleep(vcpu_scheduler(v), unit);

unit_scheduler(unit) (also elsewhere)?

> @@ -765,16 +774,22 @@ void vcpu_wake(struct vcpu *v)
>  {
>      unsigned long flags;
>      spinlock_t *lock;
> +    struct sched_unit *unit = v->sched_unit;
>  
>      TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
>  
> -    lock = unit_schedule_lock_irqsave(v->sched_unit, &flags);
> +    lock = unit_schedule_lock_irqsave(unit, &flags);
>  
>      if ( likely(vcpu_runnable(v)) )
>      {
>          if ( v->runstate.state >= RUNSTATE_blocked )
>              vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
> -        sched_wake(vcpu_scheduler(v), v->sched_unit);
> +        sched_wake(vcpu_scheduler(v), unit);

Is this correct / necessary when the unit is not asleep as a whole?
After all the corresponding sched_sleep() further up is called
conditionally only.

> @@ -1998,6 +2013,62 @@ static void sched_context_switch(struct vcpu *vprev, struct vcpu *vnext,
>      context_switch(vprev, vnext);
>  }
>  
> +/*
> + * Force a context switch of a single vcpu of an unit.
> + * Might be called either if a vcpu of an already running unit is woken up
> + * or if a vcpu of a running unit is put asleep with other vcpus of the same
> + * unit still running.
> + */
> +static struct vcpu *sched_force_context_switch(struct vcpu *vprev,
> +                                               struct vcpu *v,
> +                                               int cpu, s_time_t now)

unsigned int cpu? (Aiui it's supposed to equal smp_processor_id()
anyway.)

> +{
> +    v->force_context_switch = false;
> +
> +    if ( vcpu_runnable(v) == v->is_running )
> +        return NULL;

This and other NULL returns suggest that the comment ahead of the
function might better state what the return value here is / means.

> +    if ( vcpu_runnable(v) )
> +    {
> +        if ( is_idle_vcpu(vprev) )
> +        {
> +            vcpu_runstate_change(vprev, RUNSTATE_runnable, now);
> +            vprev->sched_unit = get_sched_res(cpu)->sched_unit_idle;
> +        }
> +        vcpu_runstate_change(v, RUNSTATE_running, now);
> +    }
> +    else
> +    {
> +        /* Make sure not to switch last vcpu of an unit away. */
> +        if ( unit_running(v->sched_unit) == 1 )
> +            return NULL;
> +
> +        v->new_state = vcpu_runstate_blocked(v);
> +        vcpu_runstate_change(v, v->new_state, now);
> +        v = sched_unit2vcpu_cpu(vprev->sched_unit, cpu);
> +        if ( v != vprev )
> +        {
> +            if ( is_idle_vcpu(vprev) )
> +            {
> +                vcpu_runstate_change(vprev, RUNSTATE_runnable, now);
> +                vprev->sched_unit = get_sched_res(cpu)->sched_unit_idle;
> +            }
> +            else
> +            {
> +                v->sched_unit = vprev->sched_unit;
> +                vcpu_runstate_change(v, RUNSTATE_running, now);
> +            }
> +        }
> +    }
> +
> +    v->is_running = 1;

Besides this wanting to use "true", how come this is unconditional
despite the function here being used for both waking and putting to
sleep of a vCPU?

> @@ -2067,9 +2160,29 @@ static void sched_slave(void)
>  
>      now = NOW();
>  
> +    v = unit2vcpu_cpu(prev, cpu);
> +    if ( v && v->force_context_switch )
> +    {
> +        v = sched_force_context_switch(vprev, v, cpu, now);
> +
> +        if ( v )
> +        {
> +            pcpu_schedule_unlock_irq(lock, cpu);

I can't figure what it is that guarantees that this unlock isn't
going to be followed ...

> +            sched_context_switch(vprev, v, false, now);
> +        }
> +
> +        do_softirq = true;
> +    }
> +
>      if ( !prev->rendezvous_in_cnt )
>      {
>          pcpu_schedule_unlock_irq(lock, cpu);

... by another unlock here. Or wait - is sched_context_switch()
(and perhaps other functions involved there) lacking a "noreturn"
annotation?

> --- a/xen/include/xen/sched-if.h
> +++ b/xen/include/xen/sched-if.h
> @@ -100,6 +100,11 @@ static inline bool unit_runnable(const struct sched_unit *unit)
>      return false;
>  }
>  
> +static inline int vcpu_runstate_blocked(struct vcpu *v)

const?

Jan
Jürgen Groß Sept. 25, 2019, 1:07 p.m. UTC | #2
On 24.09.19 13:55, Jan Beulich wrote:
> On 14.09.2019 10:52, Juergen Gross wrote:
>> --- a/xen/common/schedule.c
>> +++ b/xen/common/schedule.c
>> @@ -724,8 +724,10 @@ void sched_destroy_domain(struct domain *d)
>>       }
>>   }
>>   
>> -void vcpu_sleep_nosync_locked(struct vcpu *v)
>> +static void vcpu_sleep_nosync_locked(struct vcpu *v)
>>   {
>> +    struct sched_unit *unit = v->sched_unit;
>> +
>>       ASSERT(spin_is_locked(get_sched_res(v->processor)->schedule_lock));
>>   
>>       if ( likely(!vcpu_runnable(v)) )
>> @@ -733,7 +735,14 @@ void vcpu_sleep_nosync_locked(struct vcpu *v)
>>           if ( v->runstate.state == RUNSTATE_runnable )
>>               vcpu_runstate_change(v, RUNSTATE_offline, NOW());
>>   
>> -        sched_sleep(vcpu_scheduler(v), v->sched_unit);
>> +        if ( likely(!unit_runnable(unit)) )
>> +            sched_sleep(vcpu_scheduler(v), unit);
> 
> unit_scheduler(unit) (also elsewhere)?

Yes.

> 
>> @@ -765,16 +774,22 @@ void vcpu_wake(struct vcpu *v)
>>   {
>>       unsigned long flags;
>>       spinlock_t *lock;
>> +    struct sched_unit *unit = v->sched_unit;
>>   
>>       TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
>>   
>> -    lock = unit_schedule_lock_irqsave(v->sched_unit, &flags);
>> +    lock = unit_schedule_lock_irqsave(unit, &flags);
>>   
>>       if ( likely(vcpu_runnable(v)) )
>>       {
>>           if ( v->runstate.state >= RUNSTATE_blocked )
>>               vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
>> -        sched_wake(vcpu_scheduler(v), v->sched_unit);
>> +        sched_wake(vcpu_scheduler(v), unit);
> 
> Is this correct / necessary when the unit is not asleep as a whole?
> After all the corresponding sched_sleep() further up is called
> conditionally only.

Oh, indeed. Will change that.

> 
>> @@ -1998,6 +2013,62 @@ static void sched_context_switch(struct vcpu *vprev, struct vcpu *vnext,
>>       context_switch(vprev, vnext);
>>   }
>>   
>> +/*
>> + * Force a context switch of a single vcpu of an unit.
>> + * Might be called either if a vcpu of an already running unit is woken up
>> + * or if a vcpu of a running unit is put asleep with other vcpus of the same
>> + * unit still running.
>> + */
>> +static struct vcpu *sched_force_context_switch(struct vcpu *vprev,
>> +                                               struct vcpu *v,
>> +                                               int cpu, s_time_t now)
> 
> unsigned int cpu? (Aiui it's supposed to equal smp_processor_id()
> anyway.)

Yes and yes.

> 
>> +{
>> +    v->force_context_switch = false;
>> +
>> +    if ( vcpu_runnable(v) == v->is_running )
>> +        return NULL;
> 
> This and other NULL returns suggest that the comment ahead of the
> function might better state what the return value here is / means.

Okay.
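
A possible rewording of that comment, spelling out the return value (a
sketch of what was asked for, not necessarily the final wording):

    /*
     * Force a context switch of a single vcpu of an unit.
     * Might be called either if a vcpu of an already running unit is woken
     * up or if a vcpu of a running unit is put asleep with other vcpus of
     * the same unit still running.
     * Returns NULL if no single-vcpu switch is needed or possible, otherwise
     * the vcpu to switch to.
     */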

> 
>> +    if ( vcpu_runnable(v) )
>> +    {
>> +        if ( is_idle_vcpu(vprev) )
>> +        {
>> +            vcpu_runstate_change(vprev, RUNSTATE_runnable, now);
>> +            vprev->sched_unit = get_sched_res(cpu)->sched_unit_idle;
>> +        }
>> +        vcpu_runstate_change(v, RUNSTATE_running, now);
>> +    }
>> +    else
>> +    {
>> +        /* Make sure not to switch last vcpu of an unit away. */
>> +        if ( unit_running(v->sched_unit) == 1 )
>> +            return NULL;
>> +
>> +        v->new_state = vcpu_runstate_blocked(v);
>> +        vcpu_runstate_change(v, v->new_state, now);
>> +        v = sched_unit2vcpu_cpu(vprev->sched_unit, cpu);
>> +        if ( v != vprev )
>> +        {
>> +            if ( is_idle_vcpu(vprev) )
>> +            {
>> +                vcpu_runstate_change(vprev, RUNSTATE_runnable, now);
>> +                vprev->sched_unit = get_sched_res(cpu)->sched_unit_idle;
>> +            }
>> +            else
>> +            {
>> +                v->sched_unit = vprev->sched_unit;
>> +                vcpu_runstate_change(v, RUNSTATE_running, now);
>> +            }
>> +        }
>> +    }
>> +
>> +    v->is_running = 1;
> 
> Besides this wanting to use "true", how come this is unconditional
> despite the function here being used for both waking and putting to
> sleep of a vCPU?

At that time v is the vcpu which will be running next, so either the
just woken up one, or the idle vcpu. I can add a comment.
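
Going by that explanation, the comment could read roughly like this (a
sketch only):

        /*
         * At this point v is the vcpu which will be running next on this
         * cpu: either the vcpu just woken up, or the idle vcpu.
         */
        v->is_running = true;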

> 
>> @@ -2067,9 +2160,29 @@ static void sched_slave(void)
>>   
>>       now = NOW();
>>   
>> +    v = unit2vcpu_cpu(prev, cpu);
>> +    if ( v && v->force_context_switch )
>> +    {
>> +        v = sched_force_context_switch(vprev, v, cpu, now);
>> +
>> +        if ( v )
>> +        {
>> +            pcpu_schedule_unlock_irq(lock, cpu);
> 
> I can't figure what it is that guarantees that this unlock isn't
> going to be followed ...
> 
>> +            sched_context_switch(vprev, v, false, now);
>> +        }
>> +
>> +        do_softirq = true;
>> +    }
>> +
>>       if ( !prev->rendezvous_in_cnt )
>>       {
>>           pcpu_schedule_unlock_irq(lock, cpu);
> 
> ... by another unlock here. Or wait - is sched_context_switch()
> (and perhaps other functions involved there) lacking a "noreturn"
> annotation?

Indeed it is. Like context_switch() today. :-)

I'll annotate the functions.

> 
>> --- a/xen/include/xen/sched-if.h
>> +++ b/xen/include/xen/sched-if.h
>> @@ -100,6 +100,11 @@ static inline bool unit_runnable(const struct sched_unit *unit)
>>       return false;
>>   }
>>   
>> +static inline int vcpu_runstate_blocked(struct vcpu *v)
> 
> const?

Yes.


Juergen
Jürgen Groß Sept. 27, 2019, 4:42 a.m. UTC | #3
On 25.09.19 15:07, Jürgen Groß wrote:
> On 24.09.19 13:55, Jan Beulich wrote:
>> On 14.09.2019 10:52, Juergen Gross wrote:
>>> @@ -765,16 +774,22 @@ void vcpu_wake(struct vcpu *v)
>>>   {
>>>       unsigned long flags;
>>>       spinlock_t *lock;
>>> +    struct sched_unit *unit = v->sched_unit;
>>>       TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
>>> -    lock = unit_schedule_lock_irqsave(v->sched_unit, &flags);
>>> +    lock = unit_schedule_lock_irqsave(unit, &flags);
>>>       if ( likely(vcpu_runnable(v)) )
>>>       {
>>>           if ( v->runstate.state >= RUNSTATE_blocked )
>>>               vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
>>> -        sched_wake(vcpu_scheduler(v), v->sched_unit);
>>> +        sched_wake(vcpu_scheduler(v), unit);
>>
>> Is this correct / necessary when the unit is not asleep as a whole?
>> After all the corresponding sched_sleep() further up is called
>> conditionally only.
> 
> Oh, indeed. Will change that.

It turned out this is not so easy as it seemed.

I encountered dom0 boot hangs with making the call conditional, even
when running in cpu scheduling mode. I guess the reason is that a vcpu
can call do_poll() which will try to put itself to sleep and in some
cases call vcpu_wake() in case the condition already changed. In that
case we need the sched_wake() call even if the unit is still running.
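
A hypothetical sequence matching that guess (not the actual do_poll()
code; check_condition() is a made-up placeholder):

    /* vcpu v belongs to a unit which keeps running the whole time: */
    set_bit(_VPF_blocked, &v->pause_flags);  /* v starts putting itself to
                                                sleep; its unit as a whole
                                                stays runnable, so no
                                                sched_sleep() happens       */
    if ( check_condition() )                 /* awaited event already there */
        vcpu_wake(v);                        /* has to reach sched_wake()
                                                even though the unit never
                                                slept                        */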


>>> @@ -2067,9 +2160,29 @@ static void sched_slave(void)
>>>       now = NOW();
>>> +    v = unit2vcpu_cpu(prev, cpu);
>>> +    if ( v && v->force_context_switch )
>>> +    {
>>> +        v = sched_force_context_switch(vprev, v, cpu, now);
>>> +
>>> +        if ( v )
>>> +        {
>>> +            pcpu_schedule_unlock_irq(lock, cpu);
>>
>> I can't figure what it is that guarantees that this unlock isn't
>> going to be followed ...
>>
>>> +            sched_context_switch(vprev, v, false, now);
>>> +        }
>>> +
>>> +        do_softirq = true;
>>> +    }
>>> +
>>>       if ( !prev->rendezvous_in_cnt )
>>>       {
>>>           pcpu_schedule_unlock_irq(lock, cpu);
>>
>> ... by another unlock here. Or wait - is sched_context_switch()
>> (and perhaps other functions involved there) lacking a "noreturn"
>> annotation?
> 
> Indeed it is. Like context_switch() today. :-)
> 
> I'll annotate the functions.

And now I discovered that on ARM context_switch is _not_ "noreturn".
So thanks for noticing that problem. I have fixed it in order to
avoid a latent problem in case we want to support core scheduling on
ARM some day (and yes: that would only have been a problem in core
mode).
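
A sketch of the annotation being discussed, assuming Xen's noreturn
attribute macro; the trailing parameters of sched_context_switch() are
only inferred from the call sites above:

    /* Annotated as never returning, so callers need not unlock after it. */
    static void noreturn sched_context_switch(struct vcpu *vprev,
                                              struct vcpu *vnext,
                                              bool, s_time_t);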


Juergen
Dario Faggioli Sept. 27, 2019, 7:32 a.m. UTC | #4
On Fri, 2019-09-27 at 06:42 +0200, Jürgen Groß wrote:
> On 25.09.19 15:07, Jürgen Groß wrote:
> > On 24.09.19 13:55, Jan Beulich wrote:
> > > On 14.09.2019 10:52, Juergen Gross wrote:
> > > > @@ -765,16 +774,22 @@ void vcpu_wake(struct vcpu *v)
> > > >   {
> > > >       unsigned long flags;
> > > >       spinlock_t *lock;
> > > > +    struct sched_unit *unit = v->sched_unit;
> > > >       TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
> > > > -    lock = unit_schedule_lock_irqsave(v->sched_unit, &flags);
> > > > +    lock = unit_schedule_lock_irqsave(unit, &flags);
> > > >       if ( likely(vcpu_runnable(v)) )
> > > >       {
> > > >           if ( v->runstate.state >= RUNSTATE_blocked )
> > > >               vcpu_runstate_change(v, RUNSTATE_runnable,
> > > > NOW());
> > > > -        sched_wake(vcpu_scheduler(v), v->sched_unit);
> > > > +        sched_wake(vcpu_scheduler(v), unit);
> > > 
> > > Is this correct / necessary when the unit is not asleep as a
> > > whole?
> > > After all the corresponding sched_sleep() further up is called
> > > conditionally only.
> > 
> > Oh, indeed. Will change that.
> 
> It turned out this is not so easy as it seemed.
> 
> I encountered dom0 boot hangs with making the call conditional, even
> when running in cpu scheduling mode. I guess the reason is that a
> vcpu
> can call do_poll() which will try to put itself to sleep and in some
> cases call vcpu_wake() in case the condition already changed. In that
> case we need the sched_wake() call even if the unit is still running.
> 
TBH, I think it is ok for this call to be unconditional. Indeed it
looks a bit weird when you compare this to the sched_sleep() calls in
vcpu_sleep_nosync_locked(), as they are conditional, but I think a
comment explaining why this has to be the case would be enough.

E.g., something like what the changelog already says, in
vcpu_sleep_nosync_locked(), and maybe something like what you said
here, in vcpu_wake().

Regards
Jürgen Groß Sept. 27, 2019, 7:48 a.m. UTC | #5
On 27.09.19 09:32, Dario Faggioli wrote:
> On Fri, 2019-09-27 at 06:42 +0200, Jürgen Groß wrote:
>> On 25.09.19 15:07, Jürgen Groß wrote:
>>> On 24.09.19 13:55, Jan Beulich wrote:
>>>> On 14.09.2019 10:52, Juergen Gross wrote:
>>>>> @@ -765,16 +774,22 @@ void vcpu_wake(struct vcpu *v)
>>>>>    {
>>>>>        unsigned long flags;
>>>>>        spinlock_t *lock;
>>>>> +    struct sched_unit *unit = v->sched_unit;
>>>>>        TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
>>>>> -    lock = unit_schedule_lock_irqsave(v->sched_unit, &flags);
>>>>> +    lock = unit_schedule_lock_irqsave(unit, &flags);
>>>>>        if ( likely(vcpu_runnable(v)) )
>>>>>        {
>>>>>            if ( v->runstate.state >= RUNSTATE_blocked )
>>>>>                vcpu_runstate_change(v, RUNSTATE_runnable,
>>>>> NOW());
>>>>> -        sched_wake(vcpu_scheduler(v), v->sched_unit);
>>>>> +        sched_wake(vcpu_scheduler(v), unit);
>>>>
>>>> Is this correct / necessary when the unit is not asleep as a
>>>> whole?
>>>> After all the corresponding sched_sleep() further up is called
>>>> conditionally only.
>>>
>>> Oh, indeed. Will change that.
>>
>> It turned out this is not so easy as it seemed.
>>
>> I encountered dom0 boot hangs with making the call conditional, even
>> when running in cpu scheduling mode. I guess the reason is that a
>> vcpu
>> can call do_poll() which will try to put itself to sleep and in some
>> cases call vcpu_wake() in case the condition already changed. In that
>> case we need the sched_wake() call even if the unit is still running.
>>
> TBH, I think it is ok for this call to be unconditional. Indeed it
> looks a bit weird when you compare this to the sched_sleep() calls in
> vcpu_sleep_nosync_locked(), as they are conditional, but I think a
> comment explaining why this has to be the case would be enough.
> 
> E.g., something like what the changelog already says, in
> vcpu_sleep_nosync_locked(), and maybe something like what you said
> here, in vcpu_wake().

Okay, will add comments.
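
One possible wording for such a comment, reflecting the do_poll()
scenario described above (a sketch only, not necessarily the text of the
next version):

        /*
         * Call sched_wake() unconditionally: a vcpu may block and wake
         * itself (e.g. via do_poll()) while its unit keeps running, so the
         * scheduler has to be notified even though the unit as a whole was
         * never asleep.
         */
        sched_wake(vcpu_scheduler(v), unit);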


Juergen

Patch

diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index d53c60b966..1e793617ec 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -724,8 +724,10 @@  void sched_destroy_domain(struct domain *d)
     }
 }
 
-void vcpu_sleep_nosync_locked(struct vcpu *v)
+static void vcpu_sleep_nosync_locked(struct vcpu *v)
 {
+    struct sched_unit *unit = v->sched_unit;
+
     ASSERT(spin_is_locked(get_sched_res(v->processor)->schedule_lock));
 
     if ( likely(!vcpu_runnable(v)) )
@@ -733,7 +735,14 @@  void vcpu_sleep_nosync_locked(struct vcpu *v)
         if ( v->runstate.state == RUNSTATE_runnable )
             vcpu_runstate_change(v, RUNSTATE_offline, NOW());
 
-        sched_sleep(vcpu_scheduler(v), v->sched_unit);
+        if ( likely(!unit_runnable(unit)) )
+            sched_sleep(vcpu_scheduler(v), unit);
+        else if ( unit_running(unit) > 1 && v->is_running &&
+                  !v->force_context_switch )
+        {
+            v->force_context_switch = true;
+            cpu_raise_softirq(v->processor, SCHED_SLAVE_SOFTIRQ);
+        }
     }
 }
 
@@ -765,16 +774,22 @@  void vcpu_wake(struct vcpu *v)
 {
     unsigned long flags;
     spinlock_t *lock;
+    struct sched_unit *unit = v->sched_unit;
 
     TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
 
-    lock = unit_schedule_lock_irqsave(v->sched_unit, &flags);
+    lock = unit_schedule_lock_irqsave(unit, &flags);
 
     if ( likely(vcpu_runnable(v)) )
     {
         if ( v->runstate.state >= RUNSTATE_blocked )
             vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
-        sched_wake(vcpu_scheduler(v), v->sched_unit);
+        sched_wake(vcpu_scheduler(v), unit);
+        if ( unit->is_running && !v->is_running && !v->force_context_switch )
+        {
+            v->force_context_switch = true;
+            cpu_raise_softirq(v->processor, SCHED_SLAVE_SOFTIRQ);
+        }
     }
     else if ( !(v->pause_flags & VPF_blocked) )
     {
@@ -782,7 +797,7 @@  void vcpu_wake(struct vcpu *v)
             vcpu_runstate_change(v, RUNSTATE_offline, NOW());
     }
 
-    unit_schedule_unlock_irqrestore(lock, flags, v->sched_unit);
+    unit_schedule_unlock_irqrestore(lock, flags, unit);
 }
 
 void vcpu_unblock(struct vcpu *v)
@@ -1998,6 +2013,62 @@  static void sched_context_switch(struct vcpu *vprev, struct vcpu *vnext,
     context_switch(vprev, vnext);
 }
 
+/*
+ * Force a context switch of a single vcpu of an unit.
+ * Might be called either if a vcpu of an already running unit is woken up
+ * or if a vcpu of a running unit is put asleep with other vcpus of the same
+ * unit still running.
+ */
+static struct vcpu *sched_force_context_switch(struct vcpu *vprev,
+                                               struct vcpu *v,
+                                               int cpu, s_time_t now)
+{
+    v->force_context_switch = false;
+
+    if ( vcpu_runnable(v) == v->is_running )
+        return NULL;
+
+    if ( vcpu_runnable(v) )
+    {
+        if ( is_idle_vcpu(vprev) )
+        {
+            vcpu_runstate_change(vprev, RUNSTATE_runnable, now);
+            vprev->sched_unit = get_sched_res(cpu)->sched_unit_idle;
+        }
+        vcpu_runstate_change(v, RUNSTATE_running, now);
+    }
+    else
+    {
+        /* Make sure not to switch last vcpu of an unit away. */
+        if ( unit_running(v->sched_unit) == 1 )
+            return NULL;
+
+        v->new_state = vcpu_runstate_blocked(v);
+        vcpu_runstate_change(v, v->new_state, now);
+        v = sched_unit2vcpu_cpu(vprev->sched_unit, cpu);
+        if ( v != vprev )
+        {
+            if ( is_idle_vcpu(vprev) )
+            {
+                vcpu_runstate_change(vprev, RUNSTATE_runnable, now);
+                vprev->sched_unit = get_sched_res(cpu)->sched_unit_idle;
+            }
+            else
+            {
+                v->sched_unit = vprev->sched_unit;
+                vcpu_runstate_change(v, RUNSTATE_running, now);
+            }
+        }
+    }
+
+    v->is_running = 1;
+
+    /* Make sure not to loose another slave call. */
+    raise_softirq(SCHED_SLAVE_SOFTIRQ);
+
+    return v;
+}
+
 /*
  * Rendezvous before taking a scheduling decision.
  * Called with schedule lock held, so all accesses to the rendezvous counter
@@ -2013,6 +2084,7 @@  static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
                                                    s_time_t now)
 {
     struct sched_unit *next;
+    struct vcpu *v;
 
     if ( !--prev->rendezvous_in_cnt )
     {
@@ -2021,8 +2093,28 @@  static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
         return next;
     }
 
+    v = unit2vcpu_cpu(prev, cpu);
     while ( prev->rendezvous_in_cnt )
     {
+        if ( v && v->force_context_switch )
+        {
+            struct vcpu *vprev = current;
+
+            v = sched_force_context_switch(vprev, v, cpu, now);
+
+            if ( v )
+            {
+                /* We'll come back another time, so adjust rendezvous_in_cnt. */
+                prev->rendezvous_in_cnt++;
+                atomic_set(&prev->rendezvous_out_cnt, 0);
+
+                pcpu_schedule_unlock_irq(*lock, cpu);
+
+                sched_context_switch(vprev, v, false, now);
+            }
+
+            v = unit2vcpu_cpu(prev, cpu);
+        }
         /*
          * Coming from idle might need to do tasklet work.
          * In order to avoid deadlocks we can't do that here, but have to
@@ -2055,10 +2147,11 @@  static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
 
 static void sched_slave(void)
 {
-    struct vcpu          *vprev = current;
+    struct vcpu          *v, *vprev = current;
     struct sched_unit    *prev = vprev->sched_unit, *next;
     s_time_t              now;
     spinlock_t           *lock;
+    bool                  do_softirq = false;
     unsigned int          cpu = smp_processor_id();
 
     ASSERT_NOT_IN_ATOMIC();
@@ -2067,9 +2160,29 @@  static void sched_slave(void)
 
     now = NOW();
 
+    v = unit2vcpu_cpu(prev, cpu);
+    if ( v && v->force_context_switch )
+    {
+        v = sched_force_context_switch(vprev, v, cpu, now);
+
+        if ( v )
+        {
+            pcpu_schedule_unlock_irq(lock, cpu);
+
+            sched_context_switch(vprev, v, false, now);
+        }
+
+        do_softirq = true;
+    }
+
     if ( !prev->rendezvous_in_cnt )
     {
         pcpu_schedule_unlock_irq(lock, cpu);
+
+        /* Check for failed forced context switch. */
+        if ( do_softirq )
+            raise_softirq(SCHEDULE_SOFTIRQ);
+
         return;
     }
 
diff --git a/xen/include/xen/sched-if.h b/xen/include/xen/sched-if.h
index 2929154d35..655eb3af32 100644
--- a/xen/include/xen/sched-if.h
+++ b/xen/include/xen/sched-if.h
@@ -100,6 +100,11 @@  static inline bool unit_runnable(const struct sched_unit *unit)
     return false;
 }
 
+static inline int vcpu_runstate_blocked(struct vcpu *v)
+{
+    return (v->pause_flags & VPF_blocked) ? RUNSTATE_blocked : RUNSTATE_offline;
+}
+
 static inline bool unit_runnable_state(const struct sched_unit *unit)
 {
     struct vcpu *v;
@@ -112,9 +117,7 @@  static inline bool unit_runnable_state(const struct sched_unit *unit)
     {
         runnable = vcpu_runnable(v);
 
-        v->new_state = runnable ? RUNSTATE_running
-                                : (v->pause_flags & VPF_blocked)
-                                  ? RUNSTATE_blocked : RUNSTATE_offline;
+        v->new_state = runnable ? RUNSTATE_running : vcpu_runstate_blocked(v);
 
         if ( runnable )
             ret = true;
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 144d353447..f276ec9398 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -186,6 +186,8 @@  struct vcpu
     bool             is_running;
     /* VCPU should wake fast (do not deep sleep the CPU). */
     bool             is_urgent;
+    /* VCPU must context_switch without scheduling unit. */
+    bool             force_context_switch;
 
 #ifdef VCPU_TRAP_LAST
 #define VCPU_TRAP_NONE    0