
[v3,24/47] xen: switch from for_each_vcpu() to for_each_sched_unit()

Message ID 20190914085251.18816-25-jgross@suse.com (mailing list archive)
State Superseded
Series xen: add core scheduling support

Commit Message

Jürgen Groß Sept. 14, 2019, 8:52 a.m. UTC
Where appropriate, switch from for_each_vcpu() to for_each_sched_unit()
in order to prepare for core scheduling.

As it is beneficial here already, and will certainly be in the future, add
a unit_scheduler() helper and let vcpu_scheduler() use it.

Signed-off-by: Juergen Gross <jgross@suse.com>
---
V2:
- handle affinity_broken correctly (Jan Beulich)
- add unit_scheduler() (Jan Beulich)
V3:
- add const (Jan Beulich)
- add TODOs for missing multiple vcpu per unit support (Jan Beulich)
---
 xen/common/domain.c   |   9 ++-
 xen/common/schedule.c | 158 +++++++++++++++++++++++++++++++-------------------
 2 files changed, 103 insertions(+), 64 deletions(-)

Comments

Jan Beulich Sept. 20, 2019, 3:05 p.m. UTC | #1
On 14.09.2019 10:52, Juergen Gross wrote:
> @@ -508,25 +515,27 @@ int sched_move_domain(struct domain *d, struct cpupool *c)
>      if ( IS_ERR(domdata) )
>          return PTR_ERR(domdata);
>  
> -    vcpu_priv = xzalloc_array(void *, d->max_vcpus);
> -    if ( vcpu_priv == NULL )
> +    /* TODO: fix array size with multiple vcpus per unit. */
> +    unit_priv = xzalloc_array(void *, d->max_vcpus);
> +    if ( unit_priv == NULL )
>      {
>          sched_free_domdata(c->sched, domdata);
>          return -ENOMEM;
>      }
>  
> -    for_each_vcpu ( d, v )
> +    unit_idx = 0;
> +    for_each_sched_unit ( d, unit )
>      {
> -        vcpu_priv[v->vcpu_id] = sched_alloc_vdata(c->sched, v->sched_unit,
> -                                                  domdata);
> -        if ( vcpu_priv[v->vcpu_id] == NULL )
> +        unit_priv[unit_idx] = sched_alloc_vdata(c->sched, unit, domdata);
> +        if ( unit_priv[unit_idx] == NULL )
>          {
> -            for_each_vcpu ( d, v )
> -                xfree(vcpu_priv[v->vcpu_id]);
> -            xfree(vcpu_priv);
> +            for ( unit_idx = 0; unit_priv[unit_idx]; unit_idx++ )
> +                sched_free_vdata(c->sched, unit_priv[unit_idx]);

This is an unexpected change from xfree() to sched_free_vdata(). If
it really is correct, it should be mentioned in the description. I
can see why this might be better from an abstract pov, but it's
questionable whether arinc653's update_schedule_vcpus() really wants
calling at this point (perhaps it does, as a653sched_alloc_vdata()
also calls it).

Josh, Robert: Besides this immediate aspect I also wonder whether
said call is correct to make outside of a sched_priv->lock'ed
region, when both other instances occur inside of one (and in one
case immediately before an unlock, i.e. if the lock wasn't needed
the two steps could well be re-ordered).

Finally, at this point, shouldn't the functions and hooks (already)
be named {alloc,free}_udata()?

> @@ -896,18 +929,22 @@ void restore_vcpu_affinity(struct domain *d)
>                      cpupool_domain_cpumask(d));
>          if ( cpumask_empty(cpumask_scratch_cpu(cpu)) )
>          {
> -            if ( v->affinity_broken )
> +            if ( sched_check_affinity_broken(unit) )
>              {
> -                sched_set_affinity(v, unit->cpu_hard_affinity_saved, NULL);
> -                v->affinity_broken = 0;
> +                /* Affinity settings of one vcpu are for the complete unit. */
> +                sched_set_affinity(unit->vcpu_list,
> +                                   unit->cpu_hard_affinity_saved, NULL);

Yet despite the comment the function gets passed a struct vcpu *,
and this doesn't look to change by the end of the series. Is there
a reason for this?

> @@ -950,17 +986,19 @@ int cpu_disable_scheduler(unsigned int cpu)
>  
>      for_each_domain_in_cpupool ( d, c )
>      {
> -        for_each_vcpu ( d, v )
> +        struct sched_unit *unit;
> +
> +        for_each_sched_unit ( d, unit )
>          {
>              unsigned long flags;
> -            struct sched_unit *unit = v->sched_unit;
>              spinlock_t *lock = unit_schedule_lock_irqsave(unit, &flags);
>  
>              cpumask_and(&online_affinity, unit->cpu_hard_affinity, c->cpu_valid);
>              if ( cpumask_empty(&online_affinity) &&
>                   cpumask_test_cpu(cpu, unit->cpu_hard_affinity) )
>              {
> -                if ( v->affinity_broken )
> +                /* TODO: multiple vcpus per unit. */
> +                if ( unit->vcpu_list->affinity_broken )

Why not sched_check_affinity_broken(unit)? Quite possibly this would
make the TODO item unnecessary?

> @@ -968,14 +1006,15 @@ int cpu_disable_scheduler(unsigned int cpu)
>                      break;
>                  }
>  
> -                printk(XENLOG_DEBUG "Breaking affinity for %pv\n", v);
> +                printk(XENLOG_DEBUG "Breaking affinity for %pv\n",
> +                       unit->vcpu_list);
>  
> -                sched_set_affinity(v, &cpumask_all, NULL);
> +                sched_set_affinity(unit->vcpu_list, &cpumask_all, NULL);
>              }
>  
> -            if ( v->processor != cpu )
> +            if ( sched_unit_cpu(unit) != sched_get_resource_cpu(cpu) )

Didn't you agree that this can be had cheaper? Quite likely my v2
review remark was on a different instance, but the pattern ought
to be relatively simple to find in the entire series (and by the
end of the series there's one other instance in schedule.c ...

> @@ -988,17 +1027,18 @@ int cpu_disable_scheduler(unsigned int cpu)
>               *  * the scheduler will always find a suitable solution, or
>               *    things would have failed before getting in here.
>               */
> -            vcpu_migrate_start(v);
> +            /* TODO: multiple vcpus per unit. */
> +            vcpu_migrate_start(unit->vcpu_list);
>              unit_schedule_unlock_irqrestore(lock, flags, unit);
>  
> -            vcpu_migrate_finish(v);
> +            vcpu_migrate_finish(unit->vcpu_list);
>  
>              /*
>               * The only caveat, in this case, is that if a vcpu active in
>               * the hypervisor isn't migratable. In this case, the caller
>               * should try again after releasing and reaquiring all locks.
>               */
> -            if ( v->processor == cpu )
> +            if ( sched_unit_cpu(unit) == sched_get_resource_cpu(cpu) )

... here; I didn't check other files).

> @@ -1009,8 +1049,8 @@ int cpu_disable_scheduler(unsigned int cpu)
>  static int cpu_disable_scheduler_check(unsigned int cpu)
>  {
>      struct domain *d;
> -    struct vcpu *v;
>      struct cpupool *c;
> +    struct vcpu *v;

Unnecessary change?

Jan
Jürgen Groß Sept. 24, 2019, 12:13 p.m. UTC | #2
On 20.09.19 17:05, Jan Beulich wrote:
> On 14.09.2019 10:52, Juergen Gross wrote:
>> @@ -508,25 +515,27 @@ int sched_move_domain(struct domain *d, struct cpupool *c)
>>       if ( IS_ERR(domdata) )
>>           return PTR_ERR(domdata);
>>   
>> -    vcpu_priv = xzalloc_array(void *, d->max_vcpus);
>> -    if ( vcpu_priv == NULL )
>> +    /* TODO: fix array size with multiple vcpus per unit. */
>> +    unit_priv = xzalloc_array(void *, d->max_vcpus);
>> +    if ( unit_priv == NULL )
>>       {
>>           sched_free_domdata(c->sched, domdata);
>>           return -ENOMEM;
>>       }
>>   
>> -    for_each_vcpu ( d, v )
>> +    unit_idx = 0;
>> +    for_each_sched_unit ( d, unit )
>>       {
>> -        vcpu_priv[v->vcpu_id] = sched_alloc_vdata(c->sched, v->sched_unit,
>> -                                                  domdata);
>> -        if ( vcpu_priv[v->vcpu_id] == NULL )
>> +        unit_priv[unit_idx] = sched_alloc_vdata(c->sched, unit, domdata);
>> +        if ( unit_priv[unit_idx] == NULL )
>>           {
>> -            for_each_vcpu ( d, v )
>> -                xfree(vcpu_priv[v->vcpu_id]);
>> -            xfree(vcpu_priv);
>> +            for ( unit_idx = 0; unit_priv[unit_idx]; unit_idx++ )
>> +                sched_free_vdata(c->sched, unit_priv[unit_idx]);
> 
> This is an unexpected change from xfree() to sched_free_vdata(). If
> it really is correct, it should be mentioned in the description. I
> can see why this might be better from an abstract pov, but it's
> questionable whether arinc653's update_schedule_vcpus() really wants
> calling at this point (perhaps it does, as a653sched_alloc_vdata()
> also calls it).

Yes, I should put this change into its own patch at the beginning
of the series (or outside of it), as it is really a bug fix.

The data allocated via sched_alloc_vdata() is put into a list by the
arinc scheduler, so just xfree()ing it is clearly wrong.

> 
> Josh, Robert: Besides this immediate aspect I also wonder whether
> said call is correct to make outside of a sched_priv->lock'ed
> region, when both other instances occur inside of one (and in one
> case immediately before an unlock, i.e. if the lock wasn't needed
> the two steps could well be re-ordered).

I guess a653sched_free_vdata() is missing the required locking.
I'll add that in the bugfix.
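
Something along these lines, perhaps (untested sketch; the type, field and
helper names are from my memory of sched_arinc653.c and may need adjusting
at this point of the series):

static void
a653sched_free_vdata(const struct scheduler *ops, void *priv)
{
    a653sched_priv_t *sched_priv = SCHED_PRIV(ops);
    arinc653_vcpu_t *av = priv;
    unsigned long flags;

    if ( av == NULL )
        return;

    /*
     * The vdata sits on sched_priv's vcpu list, so unlinking it and
     * refreshing the schedule must not race with the other list users.
     */
    spin_lock_irqsave(&sched_priv->lock, flags);

    if ( !is_idle_vcpu(av->vc) )
        list_del(&av->list);

    xfree(av);
    update_schedule_vcpus(ops);

    spin_unlock_irqrestore(&sched_priv->lock, flags);
}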

> Finally, at this point, shouldn't the functions and hooks (already)
> be named {alloc,free}_udata()?

Indeed they should.

> 
>> @@ -896,18 +929,22 @@ void restore_vcpu_affinity(struct domain *d)
>>                       cpupool_domain_cpumask(d));
>>           if ( cpumask_empty(cpumask_scratch_cpu(cpu)) )
>>           {
>> -            if ( v->affinity_broken )
>> +            if ( sched_check_affinity_broken(unit) )
>>               {
>> -                sched_set_affinity(v, unit->cpu_hard_affinity_saved, NULL);
>> -                v->affinity_broken = 0;
>> +                /* Affinity settings of one vcpu are for the complete unit. */
>> +                sched_set_affinity(unit->vcpu_list,
>> +                                   unit->cpu_hard_affinity_saved, NULL);
> 
> Yet despite the comment the function gets passed a struct vcpu *,
> and this doesn't look to change by the end of the series. Is there
> a reason for this?

Yes. sched_set_affinity() is used from outside of schedule.c (by
dom0_build.c).

> 
>> @@ -950,17 +986,19 @@ int cpu_disable_scheduler(unsigned int cpu)
>>   
>>       for_each_domain_in_cpupool ( d, c )
>>       {
>> -        for_each_vcpu ( d, v )
>> +        struct sched_unit *unit;
>> +
>> +        for_each_sched_unit ( d, unit )
>>           {
>>               unsigned long flags;
>> -            struct sched_unit *unit = v->sched_unit;
>>               spinlock_t *lock = unit_schedule_lock_irqsave(unit, &flags);
>>   
>>               cpumask_and(&online_affinity, unit->cpu_hard_affinity, c->cpu_valid);
>>               if ( cpumask_empty(&online_affinity) &&
>>                    cpumask_test_cpu(cpu, unit->cpu_hard_affinity) )
>>               {
>> -                if ( v->affinity_broken )
>> +                /* TODO: multiple vcpus per unit. */
>> +                if ( unit->vcpu_list->affinity_broken )
> 
> Why not sched_check_affinity_broken(unit)? Quite possibly this would
> make the TODO item unnecessary?

Oh, indeed.

> 
>> @@ -968,14 +1006,15 @@ int cpu_disable_scheduler(unsigned int cpu)
>>                       break;
>>                   }
>>   
>> -                printk(XENLOG_DEBUG "Breaking affinity for %pv\n", v);
>> +                printk(XENLOG_DEBUG "Breaking affinity for %pv\n",
>> +                       unit->vcpu_list);
>>   
>> -                sched_set_affinity(v, &cpumask_all, NULL);
>> +                sched_set_affinity(unit->vcpu_list, &cpumask_all, NULL);
>>               }
>>   
>> -            if ( v->processor != cpu )
>> +            if ( sched_unit_cpu(unit) != sched_get_resource_cpu(cpu) )
> 
> Didn't you agree that this can be had cheaper? Quite likely my v2
> review remark was on a different instance, but the pattern ought
> to be relatively simple to find in the entire series (and by the
> end of the series there's one other instance in schedule.c ...

Will check again. Thanks for the reminder.
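
Probably the cheaper variant is to compare the resources directly instead
of going through master_cpu on both sides, i.e. something like (untested):

            if ( unit->res != get_sched_res(cpu) )
            {
                /* The unit is not on this cpu, so we can move on. */
                unit_schedule_unlock_irqrestore(lock, flags, unit);
                continue;
            }

(and similarly for the -EAGAIN check further down).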

> 
>> @@ -988,17 +1027,18 @@ int cpu_disable_scheduler(unsigned int cpu)
>>                *  * the scheduler will always find a suitable solution, or
>>                *    things would have failed before getting in here.
>>                */
>> -            vcpu_migrate_start(v);
>> +            /* TODO: multiple vcpus per unit. */
>> +            vcpu_migrate_start(unit->vcpu_list);
>>               unit_schedule_unlock_irqrestore(lock, flags, unit);
>>   
>> -            vcpu_migrate_finish(v);
>> +            vcpu_migrate_finish(unit->vcpu_list);
>>   
>>               /*
>>                * The only caveat, in this case, is that if a vcpu active in
>>                * the hypervisor isn't migratable. In this case, the caller
>>                * should try again after releasing and reaquiring all locks.
>>                */
>> -            if ( v->processor == cpu )
>> +            if ( sched_unit_cpu(unit) == sched_get_resource_cpu(cpu) )
> 
> ... here; I didn't check other files).
> 
>> @@ -1009,8 +1049,8 @@ int cpu_disable_scheduler(unsigned int cpu)
>>   static int cpu_disable_scheduler_check(unsigned int cpu)
>>   {
>>       struct domain *d;
>> -    struct vcpu *v;
>>       struct cpupool *c;
>> +    struct vcpu *v;
> 
> Unnecessary change?

Yes.


Juergen
Jan Beulich Sept. 24, 2019, 12:31 p.m. UTC | #3
On 24.09.2019 14:13, Jürgen Groß  wrote:
> On 20.09.19 17:05, Jan Beulich wrote:
>> On 14.09.2019 10:52, Juergen Gross wrote:
>>> @@ -896,18 +929,22 @@ void restore_vcpu_affinity(struct domain *d)
>>>                       cpupool_domain_cpumask(d));
>>>           if ( cpumask_empty(cpumask_scratch_cpu(cpu)) )
>>>           {
>>> -            if ( v->affinity_broken )
>>> +            if ( sched_check_affinity_broken(unit) )
>>>               {
>>> -                sched_set_affinity(v, unit->cpu_hard_affinity_saved, NULL);
>>> -                v->affinity_broken = 0;
>>> +                /* Affinity settings of one vcpu are for the complete unit. */
>>> +                sched_set_affinity(unit->vcpu_list,
>>> +                                   unit->cpu_hard_affinity_saved, NULL);
>>
>> Yet despite the comment the function gets passed a struct vcpu *,
>> and this doesn't look to change by the end of the series. Is there
>> a reason for this?
> 
> Yes. sched_set_affinity() is used from outside of schedule.c (by
> dom0_build.c).

How about changing the call there then, rather than having confusing
code here?

Jan
Jürgen Groß Sept. 24, 2019, 3 p.m. UTC | #4
On 24.09.19 14:31, Jan Beulich wrote:
> On 24.09.2019 14:13, Jürgen Groß  wrote:
>> On 20.09.19 17:05, Jan Beulich wrote:
>>> On 14.09.2019 10:52, Juergen Gross wrote:
>>>> @@ -896,18 +929,22 @@ void restore_vcpu_affinity(struct domain *d)
>>>>                        cpupool_domain_cpumask(d));
>>>>            if ( cpumask_empty(cpumask_scratch_cpu(cpu)) )
>>>>            {
>>>> -            if ( v->affinity_broken )
>>>> +            if ( sched_check_affinity_broken(unit) )
>>>>                {
>>>> -                sched_set_affinity(v, unit->cpu_hard_affinity_saved, NULL);
>>>> -                v->affinity_broken = 0;
>>>> +                /* Affinity settings of one vcpu are for the complete unit. */
>>>> +                sched_set_affinity(unit->vcpu_list,
>>>> +                                   unit->cpu_hard_affinity_saved, NULL);
>>>
>>> Yet despite the comment the function gets passed a struct vcpu *,
>>> and this doesn't look to change by the end of the series. Is there
>>> a reason for this?
>>
>> Yes. sched_set_affinity() is used from outside of schedule.c (by
>> dom0_build.c).
> 
> How about changing the call there then, rather than having confusing
> code here?

I'm not sure that would be better.

What about dropping dom0_setup_vcpu() by calling vcpu_create() instead
and doing the pinning via a call to a new function in schedule.c after
having created all vcpus? In fact we could even do a common function
creating all but vcpu[0], doing the pinning and updating the node
affinity (rough sketch below). Probably this would want to be part of
patch 20.
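
Just to illustrate the idea, a rough and untested sketch (the function
name, the exact vcpu_create() signature and the affinity mask are
placeholders to be settled as part of patch 20):

void __init sched_setup_dom0_vcpus(struct domain *d)
{
    unsigned int i;
    struct sched_unit *unit;

    /* vcpu[0] was created by the caller; create the remaining vcpus here. */
    for ( i = 1; i < d->max_vcpus; i++ )
        vcpu_create(d, i);

    for_each_sched_unit ( d, unit )
    {
        /*
         * Pinning is a per-unit property, so whatever dom0_setup_vcpu()
         * did per vcpu would be applied once per unit here; cpumask_all
         * is just a stand-in for the real dom0 affinity mask.
         */
        sched_set_affinity(unit->vcpu_list, &cpumask_all, &cpumask_all);
    }

    domain_update_node_affinity(d);
}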


Juergen
Jan Beulich Sept. 24, 2019, 3:04 p.m. UTC | #5
On 24.09.2019 17:00, Jürgen Groß wrote:
> On 24.09.19 14:31, Jan Beulich wrote:
>> On 24.09.2019 14:13, Jürgen Groß  wrote:
>>> On 20.09.19 17:05, Jan Beulich wrote:
>>>> On 14.09.2019 10:52, Juergen Gross wrote:
>>>>> @@ -896,18 +929,22 @@ void restore_vcpu_affinity(struct domain *d)
>>>>>                        cpupool_domain_cpumask(d));
>>>>>            if ( cpumask_empty(cpumask_scratch_cpu(cpu)) )
>>>>>            {
>>>>> -            if ( v->affinity_broken )
>>>>> +            if ( sched_check_affinity_broken(unit) )
>>>>>                {
>>>>> -                sched_set_affinity(v, unit->cpu_hard_affinity_saved, NULL);
>>>>> -                v->affinity_broken = 0;
>>>>> +                /* Affinity settings of one vcpu are for the complete unit. */
>>>>> +                sched_set_affinity(unit->vcpu_list,
>>>>> +                                   unit->cpu_hard_affinity_saved, NULL);
>>>>
>>>> Yet despite the comment the function gets passed a struct vcpu *,
>>>> and this doesn't look to change by the end of the series. Is there
>>>> a reason for this?
>>>
>>> Yes. sched_set_affinity() is used from outside of schedule.c (by
>>> dom0_build.c).
>>
>> How about changing the call there then, rather than having confusing
>> code here?
> 
> I'm not sure that would be better.
> 
> What about dropping dom0_setup_vcpu() by calling vcpu_create() instead
> and doing the pinning via a call to a new function in schedule.c after
> having created all vcpus? In fact we could even do a common function
> creating all but vcpu[0], doing the pinning and the updating the node
> affinity. Probably this would want to be part of patch 20.

Sounds reasonable at first glance.

Jan

Patch

diff --git a/xen/common/domain.c b/xen/common/domain.c
index a3e23f2ee7..7a1be85be9 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -554,7 +554,7 @@  void domain_update_node_affinity(struct domain *d)
     cpumask_var_t dom_cpumask, dom_cpumask_soft;
     cpumask_t *dom_affinity;
     const cpumask_t *online;
-    struct vcpu *v;
+    struct sched_unit *unit;
     unsigned int cpu;
 
     /* Do we have vcpus already? If not, no need to update node-affinity. */
@@ -587,12 +587,11 @@  void domain_update_node_affinity(struct domain *d)
          * and the full mask of where it would prefer to run (the union of
          * the soft affinity of all its various vcpus). Let's build them.
          */
-        for_each_vcpu ( d, v )
+        for_each_sched_unit ( d, unit )
         {
-            cpumask_or(dom_cpumask, dom_cpumask,
-                       v->sched_unit->cpu_hard_affinity);
+            cpumask_or(dom_cpumask, dom_cpumask, unit->cpu_hard_affinity);
             cpumask_or(dom_cpumask_soft, dom_cpumask_soft,
-                       v->sched_unit->cpu_soft_affinity);
+                       unit->cpu_soft_affinity);
         }
         /* Filter out non-online cpus */
         cpumask_and(dom_cpumask, dom_cpumask, online);
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 9c41b2dd4d..7b37461db9 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -150,26 +150,32 @@  static inline struct scheduler *dom_scheduler(const struct domain *d)
     return &ops;
 }
 
-static inline struct scheduler *vcpu_scheduler(const struct vcpu *v)
+static inline struct scheduler *unit_scheduler(const struct sched_unit *unit)
 {
-    struct domain *d = v->domain;
+    struct domain *d = unit->domain;
 
     if ( likely(d->cpupool != NULL) )
         return d->cpupool->sched;
 
     /*
-     * If d->cpupool is NULL, this is a vCPU of the idle domain. And this
+     * If d->cpupool is NULL, this is a unit of the idle domain. And this
      * case is special because the idle domain does not really belong to
      * a cpupool and, hence, doesn't really have a scheduler). In fact, its
-     * vCPUs (may) run on pCPUs which are in different pools, with different
+     * units (may) run on pCPUs which are in different pools, with different
      * schedulers.
      *
      * What we want, in this case, is the scheduler of the pCPU where this
-     * particular idle vCPU is running. And, since v->processor never changes
-     * for idle vCPUs, it is safe to use it, with no locks, to figure that out.
+     * particular idle unit is running. And, since unit->res never changes
+     * for idle units, it is safe to use it, with no locks, to figure that out.
      */
+
     ASSERT(is_idle_domain(d));
-    return per_cpu(scheduler, v->processor);
+    return per_cpu(scheduler, unit->res->master_cpu);
+}
+
+static inline struct scheduler *vcpu_scheduler(const struct vcpu *v)
+{
+    return unit_scheduler(v->sched_unit);
 }
 #define VCPU2ONLINE(_v) cpupool_domain_cpumask((_v)->domain)
 
@@ -491,10 +497,11 @@  static void sched_move_irqs(struct sched_unit *unit)
 int sched_move_domain(struct domain *d, struct cpupool *c)
 {
     struct vcpu *v;
-    unsigned int new_p;
-    void **vcpu_priv;
+    struct sched_unit *unit;
+    unsigned int new_p, unit_idx;
+    void **unit_priv;
     void *domdata;
-    void *vcpudata;
+    void *unitdata;
     struct scheduler *old_ops;
     void *old_domdata;
 
@@ -508,25 +515,27 @@  int sched_move_domain(struct domain *d, struct cpupool *c)
     if ( IS_ERR(domdata) )
         return PTR_ERR(domdata);
 
-    vcpu_priv = xzalloc_array(void *, d->max_vcpus);
-    if ( vcpu_priv == NULL )
+    /* TODO: fix array size with multiple vcpus per unit. */
+    unit_priv = xzalloc_array(void *, d->max_vcpus);
+    if ( unit_priv == NULL )
     {
         sched_free_domdata(c->sched, domdata);
         return -ENOMEM;
     }
 
-    for_each_vcpu ( d, v )
+    unit_idx = 0;
+    for_each_sched_unit ( d, unit )
     {
-        vcpu_priv[v->vcpu_id] = sched_alloc_vdata(c->sched, v->sched_unit,
-                                                  domdata);
-        if ( vcpu_priv[v->vcpu_id] == NULL )
+        unit_priv[unit_idx] = sched_alloc_vdata(c->sched, unit, domdata);
+        if ( unit_priv[unit_idx] == NULL )
         {
-            for_each_vcpu ( d, v )
-                xfree(vcpu_priv[v->vcpu_id]);
-            xfree(vcpu_priv);
+            for ( unit_idx = 0; unit_priv[unit_idx]; unit_idx++ )
+                sched_free_vdata(c->sched, unit_priv[unit_idx]);
+            xfree(unit_priv);
             sched_free_domdata(c->sched, domdata);
             return -ENOMEM;
         }
+        unit_idx++;
     }
 
     domain_pause(d);
@@ -534,30 +543,36 @@  int sched_move_domain(struct domain *d, struct cpupool *c)
     old_ops = dom_scheduler(d);
     old_domdata = d->sched_priv;
 
-    for_each_vcpu ( d, v )
+    for_each_sched_unit ( d, unit )
     {
-        sched_remove_unit(old_ops, v->sched_unit);
+        sched_remove_unit(old_ops, unit);
     }
 
     d->cpupool = c;
     d->sched_priv = domdata;
 
     new_p = cpumask_first(c->cpu_valid);
-    for_each_vcpu ( d, v )
+    unit_idx = 0;
+    for_each_sched_unit ( d, unit )
     {
         spinlock_t *lock;
+        unsigned int unit_p = new_p;
 
-        vcpudata = v->sched_unit->priv;
+        unitdata = unit->priv;
 
-        migrate_timer(&v->periodic_timer, new_p);
-        migrate_timer(&v->singleshot_timer, new_p);
-        migrate_timer(&v->poll_timer, new_p);
+        for_each_sched_unit_vcpu ( unit, v )
+        {
+            migrate_timer(&v->periodic_timer, new_p);
+            migrate_timer(&v->singleshot_timer, new_p);
+            migrate_timer(&v->poll_timer, new_p);
+            new_p = cpumask_cycle(new_p, c->cpu_valid);
+        }
 
-        lock = unit_schedule_lock_irq(v->sched_unit);
+        lock = unit_schedule_lock_irq(unit);
 
-        sched_set_affinity(v, &cpumask_all, &cpumask_all);
+        sched_set_affinity(unit->vcpu_list, &cpumask_all, &cpumask_all);
 
-        sched_set_res(v->sched_unit, get_sched_res(new_p));
+        sched_set_res(unit, get_sched_res(unit_p));
         /*
          * With v->processor modified we must not
          * - make any further changes assuming we hold the scheduler lock,
@@ -565,15 +580,15 @@  int sched_move_domain(struct domain *d, struct cpupool *c)
          */
         spin_unlock_irq(lock);
 
-        v->sched_unit->priv = vcpu_priv[v->vcpu_id];
+        unit->priv = unit_priv[unit_idx];
         if ( !d->is_dying )
-            sched_move_irqs(v->sched_unit);
+            sched_move_irqs(unit);
 
-        new_p = cpumask_cycle(new_p, c->cpu_valid);
+        sched_insert_unit(c->sched, unit);
 
-        sched_insert_unit(c->sched, v->sched_unit);
+        sched_free_vdata(old_ops, unitdata);
 
-        sched_free_vdata(old_ops, vcpudata);
+        unit_idx++;
     }
 
     domain_update_node_affinity(d);
@@ -582,7 +597,7 @@  int sched_move_domain(struct domain *d, struct cpupool *c)
 
     sched_free_domdata(old_ops, old_domdata);
 
-    xfree(vcpu_priv);
+    xfree(unit_priv);
 
     return 0;
 }
@@ -866,18 +881,36 @@  static void vcpu_migrate_finish(struct vcpu *v)
     vcpu_wake(v);
 }
 
+static bool sched_check_affinity_broken(const struct sched_unit *unit)
+{
+    const struct vcpu *v;
+
+    for_each_sched_unit_vcpu ( unit, v )
+        if ( v->affinity_broken )
+            return true;
+
+    return false;
+}
+
+static void sched_reset_affinity_broken(struct sched_unit *unit)
+{
+    struct vcpu *v;
+
+    for_each_sched_unit_vcpu ( unit, v )
+        v->affinity_broken = false;
+}
+
 void restore_vcpu_affinity(struct domain *d)
 {
     unsigned int cpu = smp_processor_id();
-    struct vcpu *v;
+    struct sched_unit *unit;
 
     ASSERT(system_state == SYS_STATE_resume);
 
-    for_each_vcpu ( d, v )
+    for_each_sched_unit ( d, unit )
     {
         spinlock_t *lock;
-        unsigned int old_cpu = v->processor;
-        struct sched_unit *unit = v->sched_unit;
+        unsigned int old_cpu = sched_unit_cpu(unit);
         struct sched_resource *res;
 
         ASSERT(!unit_runnable(unit));
@@ -896,18 +929,22 @@  void restore_vcpu_affinity(struct domain *d)
                     cpupool_domain_cpumask(d));
         if ( cpumask_empty(cpumask_scratch_cpu(cpu)) )
         {
-            if ( v->affinity_broken )
+            if ( sched_check_affinity_broken(unit) )
             {
-                sched_set_affinity(v, unit->cpu_hard_affinity_saved, NULL);
-                v->affinity_broken = 0;
+                /* Affinity settings of one vcpu are for the complete unit. */
+                sched_set_affinity(unit->vcpu_list,
+                                   unit->cpu_hard_affinity_saved, NULL);
+                sched_reset_affinity_broken(unit);
                 cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
                             cpupool_domain_cpumask(d));
             }
 
             if ( cpumask_empty(cpumask_scratch_cpu(cpu)) )
             {
-                printk(XENLOG_DEBUG "Breaking affinity for %pv\n", v);
-                sched_set_affinity(v, &cpumask_all, NULL);
+                /* Affinity settings of one vcpu are for the complete unit. */
+                printk(XENLOG_DEBUG "Breaking affinity for %pv\n",
+                       unit->vcpu_list);
+                sched_set_affinity(unit->vcpu_list, &cpumask_all, NULL);
                 cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
                             cpupool_domain_cpumask(d));
             }
@@ -920,12 +957,12 @@  void restore_vcpu_affinity(struct domain *d)
 
         /* v->processor might have changed, so reacquire the lock. */
         lock = unit_schedule_lock_irq(unit);
-        res = sched_pick_resource(vcpu_scheduler(v), unit);
+        res = sched_pick_resource(unit_scheduler(unit), unit);
         sched_set_res(unit, res);
         spin_unlock_irq(lock);
 
-        if ( old_cpu != v->processor )
-            sched_move_irqs(v->sched_unit);
+        if ( old_cpu != sched_unit_cpu(unit) )
+            sched_move_irqs(unit);
     }
 
     domain_update_node_affinity(d);
@@ -939,7 +976,6 @@  void restore_vcpu_affinity(struct domain *d)
 int cpu_disable_scheduler(unsigned int cpu)
 {
     struct domain *d;
-    struct vcpu *v;
     struct cpupool *c;
     cpumask_t online_affinity;
     int ret = 0;
@@ -950,17 +986,19 @@  int cpu_disable_scheduler(unsigned int cpu)
 
     for_each_domain_in_cpupool ( d, c )
     {
-        for_each_vcpu ( d, v )
+        struct sched_unit *unit;
+
+        for_each_sched_unit ( d, unit )
         {
             unsigned long flags;
-            struct sched_unit *unit = v->sched_unit;
             spinlock_t *lock = unit_schedule_lock_irqsave(unit, &flags);
 
             cpumask_and(&online_affinity, unit->cpu_hard_affinity, c->cpu_valid);
             if ( cpumask_empty(&online_affinity) &&
                  cpumask_test_cpu(cpu, unit->cpu_hard_affinity) )
             {
-                if ( v->affinity_broken )
+                /* TODO: multiple vcpus per unit. */
+                if ( unit->vcpu_list->affinity_broken )
                 {
                     /* The vcpu is temporarily pinned, can't move it. */
                     unit_schedule_unlock_irqrestore(lock, flags, unit);
@@ -968,14 +1006,15 @@  int cpu_disable_scheduler(unsigned int cpu)
                     break;
                 }
 
-                printk(XENLOG_DEBUG "Breaking affinity for %pv\n", v);
+                printk(XENLOG_DEBUG "Breaking affinity for %pv\n",
+                       unit->vcpu_list);
 
-                sched_set_affinity(v, &cpumask_all, NULL);
+                sched_set_affinity(unit->vcpu_list, &cpumask_all, NULL);
             }
 
-            if ( v->processor != cpu )
+            if ( sched_unit_cpu(unit) != sched_get_resource_cpu(cpu) )
             {
-                /* The vcpu is not on this cpu, so we can move on. */
+                /* The unit is not on this cpu, so we can move on. */
                 unit_schedule_unlock_irqrestore(lock, flags, unit);
                 continue;
             }
@@ -988,17 +1027,18 @@  int cpu_disable_scheduler(unsigned int cpu)
              *  * the scheduler will always find a suitable solution, or
              *    things would have failed before getting in here.
              */
-            vcpu_migrate_start(v);
+            /* TODO: multiple vcpus per unit. */
+            vcpu_migrate_start(unit->vcpu_list);
             unit_schedule_unlock_irqrestore(lock, flags, unit);
 
-            vcpu_migrate_finish(v);
+            vcpu_migrate_finish(unit->vcpu_list);
 
             /*
              * The only caveat, in this case, is that if a vcpu active in
              * the hypervisor isn't migratable. In this case, the caller
              * should try again after releasing and reaquiring all locks.
              */
-            if ( v->processor == cpu )
+            if ( sched_unit_cpu(unit) == sched_get_resource_cpu(cpu) )
                 ret = -EAGAIN;
         }
     }
@@ -1009,8 +1049,8 @@  int cpu_disable_scheduler(unsigned int cpu)
 static int cpu_disable_scheduler_check(unsigned int cpu)
 {
     struct domain *d;
-    struct vcpu *v;
     struct cpupool *c;
+    struct vcpu *v;
 
     c = per_cpu(cpupool, cpu);
     if ( c == NULL )