diff mbox series

[v3,32/47] xen/sched: support allocating multiple vcpus into one sched unit

Message ID 20190914085251.18816-33-jgross@suse.com (mailing list archive)
State Superseded
Headers show
Series xen: add core scheduling support | expand

Commit Message

Jürgen Groß Sept. 14, 2019, 8:52 a.m. UTC
With a scheduling granularity greater than 1, multiple vcpus share the
same struct sched_unit. Support that.

Setting the initial processor must be done carefully: we can't use
sched_set_res() as that relies on for_each_sched_unit_vcpu() which in
turn needs the vcpu already as a member of the domain's vcpu linked
list, which isn't the case.

Signed-off-by: Juergen Gross <jgross@suse.com>
---
 xen/common/schedule.c | 86 ++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 65 insertions(+), 21 deletions(-)

Comments

Jan Beulich Sept. 24, 2019, 9:46 a.m. UTC | #1
On 14.09.2019 10:52, Juergen Gross wrote:
> @@ -366,18 +380,38 @@ static void sched_free_unit(struct sched_unit *unit)
>      xfree(unit);
>  }
>  
> +static void sched_unit_add_vcpu(struct sched_unit *unit, struct vcpu *v)
> +{
> +    v->sched_unit = unit;
> +    if ( !unit->vcpu_list || unit->vcpu_list->vcpu_id > v->vcpu_id )

Is the right side needed? Aren't vCPU-s created in increasing order
of their IDs, and aren't we relying on this elsewhere too?

> +    {
> +        unit->vcpu_list = v;
> +        unit->unit_id = v->vcpu_id;

This makes for a pretty strange set of IDs (non-successive), and
explains why patch 24 uses a local "unit_idx" instead of switching
from v->vcpu_id as array index to unit->unit_id. Is there a reason
you don't divide by the granularity here, eliminating the division
done e.g. ...

> +    }
> +    unit->runstate_cnt[v->runstate.state]++;
> +}
> +
>  static struct sched_unit *sched_alloc_unit(struct vcpu *v)
>  {
>      struct sched_unit *unit, **prev_unit;
>      struct domain *d = v->domain;
>  
> +    for_each_sched_unit ( d, unit )
> +        if ( unit->vcpu_list->vcpu_id / sched_granularity ==

... here. (I also don't see why you don't use unit->unit_id here.)

> @@ -622,9 +659,16 @@ void sched_destroy_vcpu(struct vcpu *v)
>      kill_timer(&v->poll_timer);
>      if ( test_and_clear_bool(v->is_urgent) )
>          atomic_dec(&per_cpu(sched_urgent_count, v->processor));
> -    sched_remove_unit(vcpu_scheduler(v), unit);
> -    sched_free_vdata(vcpu_scheduler(v), unit->priv);
> -    sched_free_unit(unit);
> +    /*
> +     * Vcpus are being destroyed top-down. So being the first vcpu of an unit
> +     * is the same as being the only one.
> +     */
> +    if ( unit->vcpu_list == v )

Interestingly here you rely on there being a certain order.

Jan
Jürgen Groß Sept. 24, 2019, 10:06 a.m. UTC | #2
On 24.09.19 11:46, Jan Beulich wrote:
> On 14.09.2019 10:52, Juergen Gross wrote:
>> @@ -366,18 +380,38 @@ static void sched_free_unit(struct sched_unit *unit)
>>       xfree(unit);
>>   }
>>   
>> +static void sched_unit_add_vcpu(struct sched_unit *unit, struct vcpu *v)
>> +{
>> +    v->sched_unit = unit;
>> +    if ( !unit->vcpu_list || unit->vcpu_list->vcpu_id > v->vcpu_id )
> 
> Is the right side needed? Aren't vCPU-s created in increasing order
> of their IDs, and aren't we relying on this elsewhere too?

Idle vcpus are rather special and they require the second test.

> 
>> +    {
>> +        unit->vcpu_list = v;
>> +        unit->unit_id = v->vcpu_id;
> 
> This makes for a pretty strange set of IDs (non-successive), and
> explains why patch 24 uses a local "unit_idx" instead of switching
> from v->vcpu_id as array index to unit->unit_id. Is there a reason
> you don't divide by the granularity here, eliminating the division
> done e.g. ...

Cpus not in a cpupool are in single-vcpu units, so in order not to have
completely weird unit-ids after having moved cpus a lot in and
out of cpupools, keeping the current scheme is the only one I could
think of.

> 
>> +    }
>> +    unit->runstate_cnt[v->runstate.state]++;
>> +}
>> +
>>   static struct sched_unit *sched_alloc_unit(struct vcpu *v)
>>   {
>>       struct sched_unit *unit, **prev_unit;
>>       struct domain *d = v->domain;
>>   
>> +    for_each_sched_unit ( d, unit )
>> +        if ( unit->vcpu_list->vcpu_id / sched_granularity ==
> 
> ... here. (I also don't see why you don't use unit->unit_id here.)
> 
>> @@ -622,9 +659,16 @@ void sched_destroy_vcpu(struct vcpu *v)
>>       kill_timer(&v->poll_timer);
>>       if ( test_and_clear_bool(v->is_urgent) )
>>           atomic_dec(&per_cpu(sched_urgent_count, v->processor));
>> -    sched_remove_unit(vcpu_scheduler(v), unit);
>> -    sched_free_vdata(vcpu_scheduler(v), unit->priv);
>> -    sched_free_unit(unit);
>> +    /*
>> +     * Vcpus are being destroyed top-down. So being the first vcpu of an unit
>> +     * is the same as being the only one.
>> +     */
>> +    if ( unit->vcpu_list == v )
> 
> Interestingly here you rely on there being a certain order.

Yes. That is working because idle vcpus are never destroyed.


Juergen
Jan Beulich Sept. 24, 2019, 10:13 a.m. UTC | #3
On 24.09.2019 12:06, Jürgen Groß wrote:
> On 24.09.19 11:46, Jan Beulich wrote:
>> On 14.09.2019 10:52, Juergen Gross wrote:
>>> @@ -366,18 +380,38 @@ static void sched_free_unit(struct sched_unit *unit)
>>>       xfree(unit);
>>>   }
>>>   
>>> +static void sched_unit_add_vcpu(struct sched_unit *unit, struct vcpu *v)
>>> +{
>>> +    v->sched_unit = unit;
>>> +    if ( !unit->vcpu_list || unit->vcpu_list->vcpu_id > v->vcpu_id )
>>
>> Is the right side needed? Aren't vCPU-s created in increasing order
>> of their IDs, and aren't we relying on this elsewhere too?
> 
> Idle vcpus are rather special and they require the second test.

How about a code comment to this effect?

>>> +    {
>>> +        unit->vcpu_list = v;
>>> +        unit->unit_id = v->vcpu_id;
>>
>> This makes for a pretty strange set of IDs (non-successive), and
>> explains why patch 24 uses a local "unit_idx" instead of switching
>> from v->vcpu_id as array index to unit->unit_id. Is there a reason
>> you don't divide by the granularity here, eliminating the division
>> done e.g. ...
> 
> Cpus not in a cpupool are in single-vcpu units, so in order for not
> having completely weird unit-ids after having move cpus a lot in and
> out of cpupools keeping the current scheme is the only one I could
> think of.

And how about extending the description to include this?

>>> +    }
>>> +    unit->runstate_cnt[v->runstate.state]++;
>>> +}
>>> +
>>>   static struct sched_unit *sched_alloc_unit(struct vcpu *v)
>>>   {
>>>       struct sched_unit *unit, **prev_unit;
>>>       struct domain *d = v->domain;
>>>   
>>> +    for_each_sched_unit ( d, unit )
>>> +        if ( unit->vcpu_list->vcpu_id / sched_granularity ==
>>
>> ... here. (I also don't see why you don't use unit->unit_id here.)

And is there a reason not to use unit->unit_id here then, which
is slightly cheaper to access?

Jan
Jürgen Groß Sept. 24, 2019, 3:13 p.m. UTC | #4
On 24.09.19 12:13, Jan Beulich wrote:
> On 24.09.2019 12:06, Jürgen Groß wrote:
>> On 24.09.19 11:46, Jan Beulich wrote:
>>> On 14.09.2019 10:52, Juergen Gross wrote:
>>>> @@ -366,18 +380,38 @@ static void sched_free_unit(struct sched_unit *unit)
>>>>        xfree(unit);
>>>>    }
>>>>    
>>>> +static void sched_unit_add_vcpu(struct sched_unit *unit, struct vcpu *v)
>>>> +{
>>>> +    v->sched_unit = unit;
>>>> +    if ( !unit->vcpu_list || unit->vcpu_list->vcpu_id > v->vcpu_id )
>>>
>>> Is the right side needed? Aren't vCPU-s created in increasing order
>>> of their IDs, and aren't we relying on this elsewhere too?
>>
>> Idle vcpus are rather special and they require the second test.
> 
> How about a code comment to this effect?

Okay.

> 
>>>> +    {
>>>> +        unit->vcpu_list = v;
>>>> +        unit->unit_id = v->vcpu_id;
>>>
>>> This makes for a pretty strange set of IDs (non-successive), and
>>> explains why patch 24 uses a local "unit_idx" instead of switching
>>> from v->vcpu_id as array index to unit->unit_id. Is there a reason
>>> you don't divide by the granularity here, eliminating the division
>>> done e.g. ...
>>
>> Cpus not in a cpupool are in single-vcpu units, so in order for not
>> having completely weird unit-ids after having move cpus a lot in and
>> out of cpupools keeping the current scheme is the only one I could
>> think of.
> 
> And how about extending the description to include this?

Okay.

> 
>>>> +    }
>>>> +    unit->runstate_cnt[v->runstate.state]++;
>>>> +}
>>>> +
>>>>    static struct sched_unit *sched_alloc_unit(struct vcpu *v)
>>>>    {
>>>>        struct sched_unit *unit, **prev_unit;
>>>>        struct domain *d = v->domain;
>>>>    
>>>> +    for_each_sched_unit ( d, unit )
>>>> +        if ( unit->vcpu_list->vcpu_id / sched_granularity ==
>>>
>>> ... here. (I also don't see why you don't use unit->unit_id here.)
> 
> And is there a reason not to use unit->unit_id here then, which
> is slightly cheaper to access?

Right, will change.


Juergen
diff mbox series

Patch

diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 87660c6978..5e34008ca8 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -338,10 +338,26 @@  static void sched_spin_unlock_double(spinlock_t *lock1, spinlock_t *lock2,
     spin_unlock_irqrestore(lock1, flags);
 }
 
-static void sched_free_unit(struct sched_unit *unit)
+static void sched_free_unit(struct sched_unit *unit, struct vcpu *v)
 {
     struct sched_unit *prev_unit;
     struct domain *d = unit->domain;
+    struct vcpu *vunit;
+    unsigned int cnt = 0;
+
+    /* Don't count to be released vcpu, might be not in vcpu list yet. */
+    for_each_sched_unit_vcpu ( unit, vunit )
+        if ( vunit != v )
+            cnt++;
+
+    v->sched_unit = NULL;
+    unit->runstate_cnt[v->runstate.state]--;
+
+    if ( cnt )
+        return;
+
+    if ( unit->vcpu_list == v )
+        unit->vcpu_list = v->next_in_list;
 
     if ( d->sched_unit_list == unit )
         d->sched_unit_list = unit->next_in_list;
@@ -357,8 +373,6 @@  static void sched_free_unit(struct sched_unit *unit)
         }
     }
 
-    unit->vcpu_list->sched_unit = NULL;
-
     free_cpumask_var(unit->cpu_hard_affinity);
     free_cpumask_var(unit->cpu_hard_affinity_saved);
     free_cpumask_var(unit->cpu_soft_affinity);
@@ -366,18 +380,38 @@  static void sched_free_unit(struct sched_unit *unit)
     xfree(unit);
 }
 
+static void sched_unit_add_vcpu(struct sched_unit *unit, struct vcpu *v)
+{
+    v->sched_unit = unit;
+    if ( !unit->vcpu_list || unit->vcpu_list->vcpu_id > v->vcpu_id )
+    {
+        unit->vcpu_list = v;
+        unit->unit_id = v->vcpu_id;
+    }
+    unit->runstate_cnt[v->runstate.state]++;
+}
+
 static struct sched_unit *sched_alloc_unit(struct vcpu *v)
 {
     struct sched_unit *unit, **prev_unit;
     struct domain *d = v->domain;
 
+    for_each_sched_unit ( d, unit )
+        if ( unit->vcpu_list->vcpu_id / sched_granularity ==
+             v->vcpu_id / sched_granularity )
+            break;
+
+    if ( unit )
+    {
+        sched_unit_add_vcpu(unit, v);
+        return unit;
+    }
+
     if ( (unit = xzalloc(struct sched_unit)) == NULL )
         return NULL;
 
-    unit->vcpu_list = v;
-    unit->unit_id = v->vcpu_id;
     unit->domain = d;
-    unit->runstate_cnt[v->runstate.state]++;
+    sched_unit_add_vcpu(unit, v);
 
     for ( prev_unit = &d->sched_unit_list; *prev_unit;
           prev_unit = &(*prev_unit)->next_in_list )
@@ -393,12 +427,10 @@  static struct sched_unit *sched_alloc_unit(struct vcpu *v)
          !zalloc_cpumask_var(&unit->cpu_soft_affinity) )
         goto fail;
 
-    v->sched_unit = unit;
-
     return unit;
 
  fail:
-    sched_free_unit(unit);
+    sched_free_unit(unit, v);
     return NULL;
 }
 
@@ -448,21 +480,26 @@  int sched_init_vcpu(struct vcpu *v)
     else
         processor = sched_select_initial_cpu(v);
 
-    sched_set_res(unit, get_sched_res(processor));
-
     /* Initialise the per-vcpu timers. */
     spin_lock_init(&v->periodic_timer_lock);
-    init_timer(&v->periodic_timer, vcpu_periodic_timer_fn,
-               v, v->processor);
-    init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn,
-               v, v->processor);
-    init_timer(&v->poll_timer, poll_timer_fn,
-               v, v->processor);
+    init_timer(&v->periodic_timer, vcpu_periodic_timer_fn, v, processor);
+    init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn, v, processor);
+    init_timer(&v->poll_timer, poll_timer_fn, v, processor);
+
+    /* If this is not the first vcpu of the unit we are done. */
+    if ( unit->priv != NULL )
+    {
+        v->processor = processor;
+        return 0;
+    }
+
+    /* The first vcpu of an unit can be set via sched_set_res(). */
+    sched_set_res(unit, get_sched_res(processor));
 
     unit->priv = sched_alloc_vdata(dom_scheduler(d), unit, d->sched_priv);
     if ( unit->priv == NULL )
     {
-        sched_free_unit(unit);
+        sched_free_unit(unit, v);
         return 1;
     }
 
@@ -622,9 +659,16 @@  void sched_destroy_vcpu(struct vcpu *v)
     kill_timer(&v->poll_timer);
     if ( test_and_clear_bool(v->is_urgent) )
         atomic_dec(&per_cpu(sched_urgent_count, v->processor));
-    sched_remove_unit(vcpu_scheduler(v), unit);
-    sched_free_vdata(vcpu_scheduler(v), unit->priv);
-    sched_free_unit(unit);
+    /*
+     * Vcpus are being destroyed top-down. So being the first vcpu of an unit
+     * is the same as being the only one.
+     */
+    if ( unit->vcpu_list == v )
+    {
+        sched_remove_unit(vcpu_scheduler(v), unit);
+        sched_free_vdata(vcpu_scheduler(v), unit->priv);
+        sched_free_unit(unit, v);
+    }
 }
 
 int sched_init_domain(struct domain *d, int poolid)