
[v3,30/47] xen/sched: add support for multiple vcpus per sched unit where missing

Message ID 20190914085251.18816-31-jgross@suse.com (mailing list archive)
State Superseded
Series xen: add core scheduling support

Commit Message

Jürgen Groß Sept. 14, 2019, 8:52 a.m. UTC
In several places support for multiple vcpus per sched unit is still
missing. Add that missing support (with the exception of initial
allocation) and the missing helpers for it.

Signed-off-by: Juergen Gross <jgross@suse.com>
---
RFC V2:
- fix vcpu_runstate_helper()
V1:
- add special handling for idle unit in unit_runnable() and
  unit_runnable_state()
V2:
- handle affinity_broken correctly (Jan Beulich)
V3:
- type for cpu ->unsigned int (Jan Beulich)
---
 xen/common/domain.c        |  5 +++-
 xen/common/schedule.c      | 37 +++++++++++++-------------
 xen/include/xen/sched-if.h | 65 +++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 79 insertions(+), 28 deletions(-)

Comments

Jan Beulich Sept. 23, 2019, 3:41 p.m. UTC | #1
On 14.09.2019 10:52, Juergen Gross wrote:
> @@ -266,15 +267,16 @@ static inline void vcpu_runstate_change(
>  static inline void sched_unit_runstate_change(struct sched_unit *unit,
>      bool running, s_time_t new_entry_time)
>  {
> -    struct vcpu *v = unit->vcpu_list;
> +    struct vcpu *v;
>  
> -    if ( running )
> -        vcpu_runstate_change(v, v->new_state, new_entry_time);
> -    else
> -        vcpu_runstate_change(v,
> -            ((v->pause_flags & VPF_blocked) ? RUNSTATE_blocked :
> -             (vcpu_runnable(v) ? RUNSTATE_runnable : RUNSTATE_offline)),
> -            new_entry_time);
> +    for_each_sched_unit_vcpu ( unit, v )
> +        if ( running )
> +            vcpu_runstate_change(v, v->new_state, new_entry_time);
> +        else
> +            vcpu_runstate_change(v,
> +                ((v->pause_flags & VPF_blocked) ? RUNSTATE_blocked :
> +                 (vcpu_runnable(v) ? RUNSTATE_runnable : RUNSTATE_offline)),
> +                new_entry_time);
>  }

As mentioned on v2 already, I'm having some difficulty seeing why a
function like this one (and some of the sched-if.h changes here)
couldn't be introduced with this loop you add now right away.

Seeing this change I'm also puzzled why ->new_state is used only in
case "running" is true. Is there anything speaking against setting
that field uniformly, and simply consuming it here in all cases?

> @@ -1031,10 +1033,9 @@ int cpu_disable_scheduler(unsigned int cpu)
>              if ( cpumask_empty(&online_affinity) &&
>                   cpumask_test_cpu(cpu, unit->cpu_hard_affinity) )
>              {
> -                /* TODO: multiple vcpus per unit. */
> -                if ( unit->vcpu_list->affinity_broken )
> +                if ( sched_check_affinity_broken(unit) )
>                  {
> -                    /* The vcpu is temporarily pinned, can't move it. */
> +                    /* The unit is temporarily pinned, can't move it. */
>                      unit_schedule_unlock_irqrestore(lock, flags, unit);

Along these lines, wouldn't this change (and further related ones)
belong into the patch introducing sched_check_affinity_broken()?

> @@ -1851,7 +1852,7 @@ void sched_context_switched(struct vcpu *vprev, struct vcpu *vnext)
>              while ( atomic_read(&next->rendezvous_out_cnt) )
>                  cpu_relax();
>      }
> -    else if ( vprev != vnext )
> +    else if ( vprev != vnext && sched_granularity == 1 )
>          context_saved(vprev);
>  }

Would you mind helping me with understanding why this call is
needed with a granularity of 1 only?

> --- a/xen/include/xen/sched-if.h
> +++ b/xen/include/xen/sched-if.h
> @@ -68,12 +68,32 @@ static inline bool is_idle_unit(const struct sched_unit *unit)
>  
>  static inline bool is_unit_online(const struct sched_unit *unit)
>  {
> -    return is_vcpu_online(unit->vcpu_list);
> +    struct vcpu *v;

const?

> +    for_each_sched_unit_vcpu ( unit, v )
> +        if ( is_vcpu_online(v) )
> +            return true;
> +
> +    return false;
> +}
> +
> +static inline unsigned int unit_running(const struct sched_unit *unit)
> +{
> +    return unit->runstate_cnt[RUNSTATE_running];
>  }

Is there really going to be a user needing the return value to be a
count, not just a boolean?

>  static inline bool unit_runnable(const struct sched_unit *unit)
>  {
> -    return vcpu_runnable(unit->vcpu_list);
> +    struct vcpu *v;

const?

> +    if ( is_idle_unit(unit) )
> +        return true;
> +
> +    for_each_sched_unit_vcpu ( unit, v )
> +        if ( vcpu_runnable(v) )
> +            return true;

Isn't the loop going to yield true anyway for idle units? If so, is
there a particular reason for the special casing of idle units up
front here?

Jan
Jürgen Groß Sept. 24, 2019, 2:41 p.m. UTC | #2
On 23.09.19 17:41, Jan Beulich wrote:
> On 14.09.2019 10:52, Juergen Gross wrote:
>> @@ -266,15 +267,16 @@ static inline void vcpu_runstate_change(
>>   static inline void sched_unit_runstate_change(struct sched_unit *unit,
>>       bool running, s_time_t new_entry_time)
>>   {
>> -    struct vcpu *v = unit->vcpu_list;
>> +    struct vcpu *v;
>>   
>> -    if ( running )
>> -        vcpu_runstate_change(v, v->new_state, new_entry_time);
>> -    else
>> -        vcpu_runstate_change(v,
>> -            ((v->pause_flags & VPF_blocked) ? RUNSTATE_blocked :
>> -             (vcpu_runnable(v) ? RUNSTATE_runnable : RUNSTATE_offline)),
>> -            new_entry_time);
>> +    for_each_sched_unit_vcpu ( unit, v )
>> +        if ( running )
>> +            vcpu_runstate_change(v, v->new_state, new_entry_time);
>> +        else
>> +            vcpu_runstate_change(v,
>> +                ((v->pause_flags & VPF_blocked) ? RUNSTATE_blocked :
>> +                 (vcpu_runnable(v) ? RUNSTATE_runnable : RUNSTATE_offline)),
>> +                new_entry_time);
>>   }
> 
> As mentioned on v2 already, I'm having some difficulty seeing why a
> function like this one (and some of the sched-if.h changes here)
> couldn't be introduced with this loop you add now right away.

I'll have a look (as promised before).

> 
> Seeing this change I'm also puzzled why ->new_state is used only in
> case "running" is true. Is there anything speaking against setting
> that field uniformly, and simply consuming it here in all cases?

There are multiple places where a non-running vcpu changes state,
while a vcpu is put into "running" only by scheduling it. So setting
new_state would need to be done in multiple places, in just the same
way I'm doing the state change here.
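
For illustration, here is the derived state from the "else" branch
above, pulled out as a standalone helper (a sketch only - this helper
doesn't exist in the patch):

    /* State of a vcpu that is not running: derived from its current
     * flags at the time of the change, not stored beforehand. */
    static inline int vcpu_stopped_runstate(const struct vcpu *v)
    {
        return (v->pause_flags & VPF_blocked)
               ? RUNSTATE_blocked
               : (vcpu_runnable(v) ? RUNSTATE_runnable
                                   : RUNSTATE_offline);
    }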

> 
>> @@ -1031,10 +1033,9 @@ int cpu_disable_scheduler(unsigned int cpu)
>>               if ( cpumask_empty(&online_affinity) &&
>>                    cpumask_test_cpu(cpu, unit->cpu_hard_affinity) )
>>               {
>> -                /* TODO: multiple vcpus per unit. */
>> -                if ( unit->vcpu_list->affinity_broken )
>> +                if ( sched_check_affinity_broken(unit) )
>>                   {
>> -                    /* The vcpu is temporarily pinned, can't move it. */
>> +                    /* The unit is temporarily pinned, can't move it. */
>>                       unit_schedule_unlock_irqrestore(lock, flags, unit);
> 
> Along these lines, wouldn't this change (and further related ones)
> belong into the patch introducing sched_check_affinity_broken()?

I already agreed on that.

> 
>> @@ -1851,7 +1852,7 @@ void sched_context_switched(struct vcpu *vprev, struct vcpu *vnext)
>>               while ( atomic_read(&next->rendezvous_out_cnt) )
>>                   cpu_relax();
>>       }
>> -    else if ( vprev != vnext )
>> +    else if ( vprev != vnext && sched_granularity == 1 )
>>           context_saved(vprev);
>>   }
> 
> Would you mind helping me with understanding why this call is
> needed with a granularity of 1 only?

Otherwise it is done just a few lines up (granularity 1 will result
in rendezvous_out_cnt being zero).
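
For reference, a condensed sketch of that code path (simplified and
reconstructed, not a verbatim quote of the series):

    void sched_context_switched(struct vcpu *vprev, struct vcpu *vnext)
    {
        struct sched_unit *next = vnext->sched_unit;

        if ( atomic_read(&next->rendezvous_out_cnt) )
        {
            /* Granularity > 1: the last cpu leaving the rendezvous
             * calls context_saved(), all others just wait. */
            if ( atomic_dec_return(&next->rendezvous_out_cnt) == 1 )
            {
                if ( vprev != vnext )
                    context_saved(vprev);
                atomic_dec(&next->rendezvous_out_cnt);
            }
            else
                while ( atomic_read(&next->rendezvous_out_cnt) )
                    cpu_relax();
        }
        else if ( vprev != vnext && sched_granularity == 1 )
            context_saved(vprev);
    }

With granularity 1 the rendezvous counters are never armed, so the
final "else if" is the only place where context_saved() can run.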

> 
>> --- a/xen/include/xen/sched-if.h
>> +++ b/xen/include/xen/sched-if.h
>> @@ -68,12 +68,32 @@ static inline bool is_idle_unit(const struct sched_unit *unit)
>>   
>>   static inline bool is_unit_online(const struct sched_unit *unit)
>>   {
>> -    return is_vcpu_online(unit->vcpu_list);
>> +    struct vcpu *v;
> 
> const?

Yes.

> 
>> +    for_each_sched_unit_vcpu ( unit, v )
>> +        if ( is_vcpu_online(v) )
>> +            return true;
>> +
>> +    return false;
>> +}
>> +
>> +static inline unsigned int unit_running(const struct sched_unit *unit)
>> +{
>> +    return unit->runstate_cnt[RUNSTATE_running];
>>   }
> 
> Is there really going to be a user needing the return value to be a
> count, not just a boolean?

Yes. See patch 35.

> 
>>   static inline bool unit_runnable(const struct sched_unit *unit)
>>   {
>> -    return vcpu_runnable(unit->vcpu_list);
>> +    struct vcpu *v;
> 
> const?

Yes.

> 
>> +    if ( is_idle_unit(unit) )
>> +        return true;
>> +
>> +    for_each_sched_unit_vcpu ( unit, v )
>> +        if ( vcpu_runnable(v) )
>> +            return true;
> 
> Isn't the loop going to yield true anyway for idle units? If so, is
> there a particular reason for the special casing of idle units up
> front here?

Didn't I explain that before? for_each_sched_unit_vcpu() for an idle
unit might end prematurely when one of the vcpus is running in another
unit (idle_vcpu->sched_unit != idle_unit).
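
For reference, the construct being referred to has roughly this shape
(reconstructed from the "(v)->sched_unit == (i)" condition discussed
further down in the thread, so treat it as a sketch):

    #define for_each_sched_unit_vcpu(i, v)                    \
        for ( (v) = (i)->vcpu_list;                           \
              (v) != NULL && (v)->sched_unit == (i);          \
              (v) = (v)->next_in_list )

An idle vcpu lent out to a "real" unit has its ->sched_unit pointing
at that unit, so iterating over the idle unit stops there and skips
that vcpu and all later ones.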


Juergen
Jan Beulich Sept. 24, 2019, 3 p.m. UTC | #3
On 24.09.2019 16:41, Jürgen Groß wrote:
> On 23.09.19 17:41, Jan Beulich wrote:
>> On 14.09.2019 10:52, Juergen Gross wrote:
>>> @@ -1851,7 +1852,7 @@ void sched_context_switched(struct vcpu *vprev, struct vcpu *vnext)
>>>               while ( atomic_read(&next->rendezvous_out_cnt) )
>>>                   cpu_relax();
>>>       }
>>> -    else if ( vprev != vnext )
>>> +    else if ( vprev != vnext && sched_granularity == 1 )
>>>           context_saved(vprev);
>>>   }
>>
>> Would you mind helping me with understanding why this call is
>> needed with a granularity of 1 only?
> 
> Otherwise it is done just a few lines up (granularity 1 will result
> in rendezvous_out_cnt being zero).

I.e. if rendezvous_out_cnt is zero and vprev != vnext but
sched_granularity > 1 the call isn't needed? Why? At the end of
the series vcpu_context_saved() gets called in all cases; what's
conditional upon granularity being 1 there is the call to
unit_context_saved().

>>> +    if ( is_idle_unit(unit) )
>>> +        return true;
>>> +
>>> +    for_each_sched_unit_vcpu ( unit, v )
>>> +        if ( vcpu_runnable(v) )
>>> +            return true;
>>
>> Isn't the loop going to yield true anyway for idle units? If so, is
>> there a particular reason for the special casing of idle units up
>> front here?
> 
> Didn't I explain that before?

Quite possible; a good hint that a code comment wouldn't hurt.

> for_each_sched_unit_vcpu() for an idle
> unit might end prematurely when one of the vcpus is running in another
> unit (idle_vcpu->sched_unit != idle_unit).

Oh, that (v)->sched_unit == (i) in the construct is clearly unexpected.
Is this really still needed by the end of the series? I realize that
_some_ check is needed, but could this perhaps be arranged in a way
that you'd still hit all vCPU-s when using it on an idle unit, no
matter whether they're in use as a substitute in a "real" unit?

As to that construct - why is the parameter named "i" rather than "u"?
And why "e" in for_each_sched_unit()?

Jan
Jürgen Groß Sept. 24, 2019, 3:09 p.m. UTC | #4
On 24.09.19 17:00, Jan Beulich wrote:
> On 24.09.2019 16:41, Jürgen Groß wrote:
>> On 23.09.19 17:41, Jan Beulich wrote:
>>> On 14.09.2019 10:52, Juergen Gross wrote:
>>>> @@ -1851,7 +1852,7 @@ void sched_context_switched(struct vcpu *vprev, struct vcpu *vnext)
>>>>                while ( atomic_read(&next->rendezvous_out_cnt) )
>>>>                    cpu_relax();
>>>>        }
>>>> -    else if ( vprev != vnext )
>>>> +    else if ( vprev != vnext && sched_granularity == 1 )
>>>>            context_saved(vprev);
>>>>    }
>>>
>>> Would you mind helping me with understanding why this call is
>>> needed with a granularity of 1 only?
>>
>> Otherwise it is done just a few lines up (granularity 1 will result
>> in rendezvous_out_cnt being zero).
> 
> I.e. if rendezvous_out_cnt is zero and vprev != vnext but
> sched_granularity > 1 the call isn't needed? Why? At the end of

I can add an ASSERT() here. This should never happen.

> the series vcpu_context_saved() gets called in all cases; what's
> conditional upon granularity being 1 there is the call to
> unit_context_saved().

vcpu_context_saved() at the end of the series is testing vprev != vnext.
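
For context, a sketch of that end-of-series shape (assumed here, not
quoted from the series):

    static void vcpu_context_saved(struct vcpu *vprev, struct vcpu *vnext)
    {
        /* Clear the running flag only after the context is saved. */
        smp_wmb();

        if ( vprev != vnext )
            vprev->is_running = 0;
    }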

> 
>>>> +    if ( is_idle_unit(unit) )
>>>> +        return true;
>>>> +
>>>> +    for_each_sched_unit_vcpu ( unit, v )
>>>> +        if ( vcpu_runnable(v) )
>>>> +            return true;
>>>
>>> Isn't the loop going to yield true anyway for idle units? If so, is
>>> there a particular reason for the special casing of idle units up
>>> front here?
>>
>> Didn't I explain that before?
> 
> Quite possible; a good hint that a code comment wouldn't hurt.

Okay.

> 
>> for_each_sched_unit_vcpu() for an idle
>> unit might end prematurely when one of the vcpus is running in another
>> unit (idle_vcpu->sched_unit != idle_unit).
> 
> Oh, that (v)->sched_unit == (i) in the construct is clearly unexpected.
> Is this really still needed by the end of the series? I realize that
> _some_ check is needed, but could this perhaps be arranged in a way
> that you'd still hit all vCPU-s when using it on an idle unit, no
> matter whether they're in use as a substitute in a "real" unit?

I could do that by having another linked list in struct vcpu. This way
I can avoid it.

> As to that construct - why is the parameter named "i" rather than "u"?
> And why "e" in for_each_sched_unit()?

"i" like "item" (somehow this survived the big rename). Can change it.
"e" like "element". I think this is another relic. Can change it, too.


Juergen
Jan Beulich Sept. 24, 2019, 3:22 p.m. UTC | #5
On 24.09.2019 17:09, Jürgen Groß wrote:
> On 24.09.19 17:00, Jan Beulich wrote:
>> On 24.09.2019 16:41, Jürgen Groß wrote:
>>> On 23.09.19 17:41, Jan Beulich wrote:
>>>> On 14.09.2019 10:52, Juergen Gross wrote:
>>>>> @@ -1851,7 +1852,7 @@ void sched_context_switched(struct vcpu *vprev, struct vcpu *vnext)
>>>>>                while ( atomic_read(&next->rendezvous_out_cnt) )
>>>>>                    cpu_relax();
>>>>>        }
>>>>> -    else if ( vprev != vnext )
>>>>> +    else if ( vprev != vnext && sched_granularity == 1 )
>>>>>            context_saved(vprev);
>>>>>    }
>>>>
>>>> Would you mind helping me with understanding why this call is
>>>> needed with a granularity of 1 only?
>>>
>>> Otherwise it is done just a few lines up (granularity 1 will result
>>> in rendezvous_out_cnt being zero).
>>
>> I.e. if rendezvous_out_cnt is zero and vprev != vnext but
>> sched_granularity > 1 the call isn't needed? Why? At the end of
> 
> I can add an ASSERT() here. This should never happen.

Ah yes, this would address my question.

>> the series vcpu_context_saved() gets called in all cases; what's
>> conditional upon granularity being 1 there is the call to
>> unit_context_saved().
> 
> vcpu_context_saved() at the end of the series is testing vprev != vnext.

But that part of the conditional was not in question.

>>> for_each_sched_unit_vcpu() for an idle
>>> unit might end prematurely when one of the vcpus is running in another
>>> unit (idle_vcpu->sched_unit != idle_unit).
>>
>> Oh, that (v)->sched_unit == (i) in the construct is clearly unexpected.
>> Is this really still needed by the end of the series? I realize that
>> _some_ check is needed, but could this perhaps be arranged in a way
>> that you'd still hit all vCPU-s when using it on an idle unit, no
>> matter whether they're in use as a substitute in a "real" unit?
> 
> I could do that by having another linked list in struct vcpu. This way
> I can avoid it.

Oh, no, not another list just for this purpose. I was rather thinking
of e.g. a comparison of IDs.

>> As to that construct - why is the parameter named "i" rather than "u"?
>> And why "e" in for_each_sched_unit()?
> 
> "i" like "item" (somehow this survived the big rename). Can change it.
> "e" like "element". I think this is another relic. Can change it, too.

I'd appreciate this, as it would get this more in line with the other
similar macros.

Jan
Jürgen Groß Sept. 25, 2019, 12:40 p.m. UTC | #6
On 24.09.19 17:22, Jan Beulich wrote:
> On 24.09.2019 17:09, Jürgen Groß wrote:
>> On 24.09.19 17:00, Jan Beulich wrote:
>>> On 24.09.2019 16:41, Jürgen Groß wrote:
>>>> for_each_sched_unit_vcpu() for an idle
>>>> unit might end prematurely when one of the vcpus is running in another
>>>> unit (idle_vcpu->sched_unit != idle_unit).
>>>
>>> Oh, that (v)->sched_unit == (i) in the construct is clearly unexpected.
>>> Is this really still needed by the end of the series? I realize that
>>> _some_ check is needed, but could this perhaps be arranged in a way
>>> that you'd still hit all vCPU-s when using it on an idle unit, no
>>> matter whether they're in use as a substitute in a "real" unit?
>>
>> I could do that by having another linked list in struct vcpu. This way
>> I can avoid it.
> 
> Oh, no, not another list just for this purpose. I was rather thinking
> of e.g. a comparison of IDs.

That would result either in something like:

(v)->vcpu_id < (u)->unit_id + (u)->res->cpupool->granularity

requiring struct sched_resource to be made public, as keyhandler.c
needs for_each_sched_unit_vcpu(), plus being quite expensive, or:

!(u)->next_in_list || (v)->vcpu_id < (u)->next_in_list->unit_id

which seems to be more expensive than the current variant, too.
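
Spelled out as a full iterator, that second variant would look roughly
like this (a sketch only):

    #define for_each_sched_unit_vcpu(u, v)                             \
        for ( (v) = (u)->vcpu_list;                                    \
              (v) != NULL &&                                           \
              (!(u)->next_in_list ||                                   \
               (v)->vcpu_id < (u)->next_in_list->unit_id);             \
              (v) = (v)->next_in_list )

i.e. a vcpu belongs to the unit as long as there is no next unit in
the list, or its vcpu_id is below the id of that next unit.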


Juergen
Jan Beulich Sept. 25, 2019, 1:07 p.m. UTC | #7
On 25.09.2019 14:40, Jürgen Groß  wrote:
> On 24.09.19 17:22, Jan Beulich wrote:
>> On 24.09.2019 17:09, Jürgen Groß wrote:
>>> On 24.09.19 17:00, Jan Beulich wrote:
>>>> On 24.09.2019 16:41, Jürgen Groß wrote:
>>>>> for_each_sched_unit_vcpu() for an idle
>>>>> unit might end prematurely when one of the vcpus is running in another
>>>>> unit (idle_vcpu->sched_unit != idle_unit).
>>>>
>>>> Oh, that (v)->sched_unit == (i) in the construct is clearly unexpected.
>>>> Is this really still needed by the end of the series? I realize that
>>>> _some_ check is needed, but could this perhaps be arranged in a way
>>>> that you'd still hit all vCPU-s when using it on an idle unit, no
>>>> matter whether they're in use as a substitute in a "real" unit?
>>>
>>> I could do that by having another linked list in struct vcpu. This way
>>> I can avoid it.
>>
>> Oh, no, not another list just for this purpose. I was rather thinking
>> of e.g. a comparison of IDs.
> 
> That would result either in something like:
> 
> (v)->vcpu_id < (u)->unit_id + (u)->res->cpupool->granularity
> 
> requiring struct sched_resource to be made public, as keyhandler.c
> needs for_each_sched_unit_vcpu(), plus being quite expensive,

I agree this is not a good option.

> or:
> 
> !(u)->next_in_list || (v)->vcpu_id < (u)->next_in_list->unit_id
> 
> which seems to be more expensive than the current variant, too.

It's not that much more expensive, and it eliminates unexpected
(as I would call it) behavior, so I think I'd go this route. Let's
see if others, in particular Dario or George, have an opinion
either way.

Jan
Dario Faggioli Sept. 26, 2019, 1:53 p.m. UTC | #8
On Wed, 2019-09-25 at 15:07 +0200, Jan Beulich wrote:
> On 25.09.2019 14:40, Jürgen Groß  wrote:
> > On 24.09.19 17:22, Jan Beulich wrote:
> > > On 24.09.2019 17:09, Jürgen Groß wrote:
> > > > On 24.09.19 17:00, Jan Beulich wrote:
> > > > > On 24.09.2019 16:41, Jürgen Groß wrote:
> > > > > > for_each_sched_unit_vcpu() for an idle
> > > > > > unit might end prematurely when one of the vcpus is running
> > > > > > in another
> > > > > > unit (idle_vcpu->sched_unit != idle_unit).
> > > > > 
> > > > > Oh, that (v)->sched_unit == (i) in the construct is clearly
> > > > > unexpected.
> > > > > Is this really still needed by the end of the series? I
> > > > > realize that
> > > > > _some_ check is needed, but could this perhaps be arranged in
> > > > > a way
> > > > > that you'd still hit all vCPU-s when using it on an idle
> > > > > unit, no
> > > > > matter whether they're in use as a substitute in a "real"
> > > > > unit?
> > > > 
> > > > I could do that by having another linked list in struct vcpu.
> > > > This way
> > > > I can avoid it.
> > > 
> > > Oh, no, not another list just for this purpose. I was rather
> > > thinking
> > > of e.g. a comparison of IDs.
> > 
> > That would result either in something like:
> > 
> > (v)->vcpu_id < (u)->unit_id + (u)->res->cpupool->granularity
> > 
> > requiring struct sched_resource to be made public, as keyhandler.c
> > needs for_each_sched_unit_vcpu(), plus being quite expensive,
> 
> I agree this is not a good option.
> 
> > or:
> > 
> > !(u)->next_in_list || (v)->vcpu_id < (u)->next_in_list->unit_id
> > 
> > which seems to be more expensive than the current variant, too.
> 
> It's not that much more expensive, and it eliminates unexpected
> (as I would call it) behavior, so I think I'd go this route. 
>
So, I honestly like the way it's currently done in Juergen's patches.

However, I'm not sure I understand what issue Jan thinks it has, and
in what sense the code/behavior is regarded as "unexpected".

Can you help me see the problem? Maybe, if I realize it, I'd change my
preference...

Thanks and Regards
Jan Beulich Sept. 26, 2019, 2:24 p.m. UTC | #9
On 26.09.2019 15:53, Dario Faggioli wrote:
> On Wed, 2019-09-25 at 15:07 +0200, Jan Beulich wrote:
>> On 25.09.2019 14:40, Jürgen Groß  wrote:
>>> On 24.09.19 17:22, Jan Beulich wrote:
>>>> On 24.09.2019 17:09, Jürgen Groß wrote:
>>>>> On 24.09.19 17:00, Jan Beulich wrote:
>>>>>> On 24.09.2019 16:41, Jürgen Groß wrote:
>>>>>>> for_each_sched_unit_vcpu() for an idle
>>>>>>> unit might end prematurely when one of the vcpus is running
>>>>>>> in another
>>>>>>> unit (idle_vcpu->sched_unit != idle_unit).
>>>>>>
>>>>>> Oh, that (v)->sched_unit == (i) in the construct is clearly
>>>>>> unexpected.
>>>>>> Is this really still needed by the end of the series? I
>>>>>> realize that
>>>>>> _some_ check is needed, but could this perhaps be arranged in
>>>>>> a way
>>>>>> that you'd still hit all vCPU-s when using it on an idle
>>>>>> unit, no
>>>>>> matter whether they're in use as a substitute in a "real"
>>>>>> unit?
>>>>>
>>>>> I could do that by having another linked list in struct vcpu.
>>>>> This way
>>>>> I can avoid it.
>>>>
>>>> Oh, no, not another list just for this purpose. I was rather
>>>> thinking
>>>> of e.g. a comparison of IDs.
>>>
>>> That would result either in something like:
>>>
>>> (v)->vcpu_id < (u)->unit_id + (u)->res->cpupool->granularity
>>>
>>> requiring struct sched_resource to be made public, as keyhandler.c
>>> needs for_each_sched_unit_vcpu(), plus being quite expensive,
>>
>> I agree this is not a good option.
>>
>>> or:
>>>
>>> !(u)->next_in_list || (v)->vcpu_id < (u)->next_in_list->unit_id
>>>
>>> which seems to be more expensive than the current variant, too.
>>
>> It's not that much more expensive, and it eliminates unexpected
>> (as I would call it) behavior, so I think I'd go this route. 
>>
> So, I honestly like the way it's currently done in Juergen's patches.
> 
> However, I'm not sure I understand what issue Jan thinks it has, and
> in what sense the code/behavior is regarded as "unexpected".
> 
> Can you help me see the problem? Maybe, if I realize it, I'd change my
> preference...

The unexpected / surprising behavior is described at the top (i.e.
still visible in context), but I'll quote it again here:

"for_each_sched_unit_vcpu() for an idle unit might end premature
 when one of the vcpus is running in another unit
 (idle_vcpu->sched_unit != idle_unit)"

This started out with me asking about an apparently (but as
Jürgen has clarified not truly) unnecessary special casing of
idle vCPU-s/units/domain ahead of a use of this construct.

Jan
Jürgen Groß Sept. 26, 2019, 2:40 p.m. UTC | #10
On 26.09.19 15:53, Dario Faggioli wrote:
> On Wed, 2019-09-25 at 15:07 +0200, Jan Beulich wrote:
>> On 25.09.2019 14:40, Jürgen Groß  wrote:
>>> On 24.09.19 17:22, Jan Beulich wrote:
>>>> On 24.09.2019 17:09, Jürgen Groß wrote:
>>>>> On 24.09.19 17:00, Jan Beulich wrote:
>>>>>> On 24.09.2019 16:41, Jürgen Groß wrote:
>>>>>>> for_each_sched_unit_vcpu() for an idle
>>>>>>> unit might end prematurely when one of the vcpus is running
>>>>>>> in another
>>>>>>> unit (idle_vcpu->sched_unit != idle_unit).
>>>>>>
>>>>>> Oh, that (v)->sched_unit == (i) in the construct is clearly
>>>>>> unexpected.
>>>>>> Is this really still needed by the end of the series? I
>>>>>> realize that
>>>>>> _some_ check is needed, but could this perhaps be arranged in
>>>>>> a way
>>>>>> that you'd still hit all vCPU-s when using it on an idle
>>>>>> unit, no
>>>>>> matter whether they're in use as a substitute in a "real"
>>>>>> unit?
>>>>>
>>>>> I could do that by having another linked list in struct vcpu.
>>>>> This way
>>>>> I can avoid it.
>>>>
>>>> Oh, no, not another list just for this purpose. I was rather
>>>> thinking
>>>> of e.g. a comparison of IDs.
>>>
>>> That would result either in something like:
>>>
>>> (v)->vcpu_id < (u)->unit_id + (u)->res->cpupool->granularity
>>>
>>> requiring struct sched_resource to be made public, as keyhandler.c
>>> needs for_each_sched_unit_vcpu(), plus being quite expensive,
>>
>> I agree this is not a good option.
>>
>>> or:
>>>
>>> !(u)->next_in_list || (v)->vcpu_id < (u)->next_in_list->unit_id
>>>
>>> which seems to be more expensive than the current variant, too.
>>
>> It's not that much more expensive, and it eliminates unexpected
>> (as I would call it) behavior, so I think I'd go this route.
>>
> So, I honestly like the way it's currently done in Juergen's patches.
> 
> However, I'm not sure I understand what issue Jan thinks it has, and
> in what sense the code/behavior is regarded as "unexpected".
> 
> Can you help me see the problem? Maybe, if I realize it, I'd change my
> preference...

I have meanwhile changed it, and I think the new solution removes a
latent problem. Otherwise one would have to be very careful not to use
for_each_sched_unit_vcpu() for idle units, as this might occasionally
yield wrong results.
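
To make the latent problem concrete, a hypothetical dump loop over an
idle unit (dump_vcpu() is made up for the example):

    /* Core with two threads; idle vcpu 1 currently substitutes in a
     * guest unit, i.e. idle_vcpu[1]->sched_unit != idle_unit. */
    for_each_sched_unit_vcpu ( idle_unit, v )   /* old condition */
        dump_vcpu(v);   /* visits idle vcpu 0 only: the loop ends as
                         * soon as it reaches idle vcpu 1 */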


Juergen
Dario Faggioli Sept. 26, 2019, 4:41 p.m. UTC | #11
On Thu, 2019-09-26 at 16:40 +0200, Jürgen Groß wrote:
> On 26.09.19 15:53, Dario Faggioli wrote:
> > However, I'm not sure I understand what issue Jan thinks it has,
> > and in what sense the code/behavior is regarded as "unexpected".
> > 
> > Can you help me see the problem? Maybe, if I realize it, I'd change
> > my
> > preference...
> 
> I have meanwhile changed it, and I think the new solution removes a
> latent problem. 
>
Ok, I'll comment directly on the new shape of it then, I guess.

Regards

Patch

diff --git a/xen/common/domain.c b/xen/common/domain.c
index fa4023936b..ea6aee3858 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -1259,7 +1259,10 @@  int vcpu_reset(struct vcpu *v)
     v->async_exception_mask = 0;
     memset(v->async_exception_state, 0, sizeof(v->async_exception_state));
 #endif
-    v->affinity_broken = 0;
+    if ( v->affinity_broken & VCPU_AFFINITY_OVERRIDE )
+        vcpu_temporary_affinity(v, NR_CPUS, VCPU_AFFINITY_OVERRIDE);
+    if ( v->affinity_broken & VCPU_AFFINITY_WAIT )
+        vcpu_temporary_affinity(v, NR_CPUS, VCPU_AFFINITY_WAIT);
     clear_bit(_VPF_blocked, &v->pause_flags);
     clear_bit(_VPF_in_reset, &v->pause_flags);
 
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 03bcf796ae..a79065c826 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -243,8 +243,9 @@  static inline void vcpu_runstate_change(
     s_time_t delta;
     struct sched_unit *unit = v->sched_unit;
 
-    ASSERT(v->runstate.state != new_state);
     ASSERT(spin_is_locked(get_sched_res(v->processor)->schedule_lock));
+    if ( v->runstate.state == new_state )
+        return;
 
     vcpu_urgent_count_update(v);
 
@@ -266,15 +267,16 @@  static inline void vcpu_runstate_change(
 static inline void sched_unit_runstate_change(struct sched_unit *unit,
     bool running, s_time_t new_entry_time)
 {
-    struct vcpu *v = unit->vcpu_list;
+    struct vcpu *v;
 
-    if ( running )
-        vcpu_runstate_change(v, v->new_state, new_entry_time);
-    else
-        vcpu_runstate_change(v,
-            ((v->pause_flags & VPF_blocked) ? RUNSTATE_blocked :
-             (vcpu_runnable(v) ? RUNSTATE_runnable : RUNSTATE_offline)),
-            new_entry_time);
+    for_each_sched_unit_vcpu ( unit, v )
+        if ( running )
+            vcpu_runstate_change(v, v->new_state, new_entry_time);
+        else
+            vcpu_runstate_change(v,
+                ((v->pause_flags & VPF_blocked) ? RUNSTATE_blocked :
+                 (vcpu_runnable(v) ? RUNSTATE_runnable : RUNSTATE_offline)),
+                new_entry_time);
 }
 
 void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
@@ -1031,10 +1033,9 @@  int cpu_disable_scheduler(unsigned int cpu)
             if ( cpumask_empty(&online_affinity) &&
                  cpumask_test_cpu(cpu, unit->cpu_hard_affinity) )
             {
-                /* TODO: multiple vcpus per unit. */
-                if ( unit->vcpu_list->affinity_broken )
+                if ( sched_check_affinity_broken(unit) )
                 {
-                    /* The vcpu is temporarily pinned, can't move it. */
+                    /* The unit is temporarily pinned, can't move it. */
                     unit_schedule_unlock_irqrestore(lock, flags, unit);
                     ret = -EADDRINUSE;
                     break;
@@ -1392,17 +1393,17 @@  int vcpu_temporary_affinity(struct vcpu *v, unsigned int cpu, uint8_t reason)
             ret = 0;
             v->affinity_broken &= ~reason;
         }
-        if ( !ret && !v->affinity_broken )
+        if ( !ret && !sched_check_affinity_broken(unit) )
             sched_set_affinity(v, unit->cpu_hard_affinity_saved, NULL);
     }
     else if ( cpu < nr_cpu_ids )
     {
         if ( (v->affinity_broken & reason) ||
-             (v->affinity_broken && v->processor != cpu) )
+             (sched_check_affinity_broken(unit) && v->processor != cpu) )
             ret = -EBUSY;
         else if ( cpumask_test_cpu(cpu, VCPU2ONLINE(v)) )
         {
-            if ( !v->affinity_broken )
+            if ( !sched_check_affinity_broken(unit) )
             {
                 cpumask_copy(unit->cpu_hard_affinity_saved,
                              unit->cpu_hard_affinity);
@@ -1722,14 +1723,14 @@  static void sched_switch_units(struct sched_resource *sd,
              (next->vcpu_list->runstate.state == RUNSTATE_runnable) ?
              (now - next->state_entry_time) : 0, prev->next_time);
 
-    ASSERT(prev->vcpu_list->runstate.state == RUNSTATE_running);
+    ASSERT(unit_running(prev));
 
     TRACE_4D(TRC_SCHED_SWITCH, prev->domain->domain_id, prev->unit_id,
              next->domain->domain_id, next->unit_id);
 
     sched_unit_runstate_change(prev, false, now);
 
-    ASSERT(next->vcpu_list->runstate.state != RUNSTATE_running);
+    ASSERT(!unit_running(next));
     sched_unit_runstate_change(next, true, now);
 
     /*
@@ -1851,7 +1852,7 @@  void sched_context_switched(struct vcpu *vprev, struct vcpu *vnext)
             while ( atomic_read(&next->rendezvous_out_cnt) )
                 cpu_relax();
     }
-    else if ( vprev != vnext )
+    else if ( vprev != vnext && sched_granularity == 1 )
         context_saved(vprev);
 }
 
diff --git a/xen/include/xen/sched-if.h b/xen/include/xen/sched-if.h
index 25ba6f25c9..6a4dbac935 100644
--- a/xen/include/xen/sched-if.h
+++ b/xen/include/xen/sched-if.h
@@ -68,12 +68,32 @@  static inline bool is_idle_unit(const struct sched_unit *unit)
 
 static inline bool is_unit_online(const struct sched_unit *unit)
 {
-    return is_vcpu_online(unit->vcpu_list);
+    struct vcpu *v;
+
+    for_each_sched_unit_vcpu ( unit, v )
+        if ( is_vcpu_online(v) )
+            return true;
+
+    return false;
+}
+
+static inline unsigned int unit_running(const struct sched_unit *unit)
+{
+    return unit->runstate_cnt[RUNSTATE_running];
 }
 
 static inline bool unit_runnable(const struct sched_unit *unit)
 {
-    return vcpu_runnable(unit->vcpu_list);
+    struct vcpu *v;
+
+    if ( is_idle_unit(unit) )
+        return true;
+
+    for_each_sched_unit_vcpu ( unit, v )
+        if ( vcpu_runnable(v) )
+            return true;
+
+    return false;
 }
 
 static inline bool unit_runnable_state(const struct sched_unit *unit)
@@ -102,7 +122,16 @@  static inline bool unit_runnable_state(const struct sched_unit *unit)
 static inline void sched_set_res(struct sched_unit *unit,
                                  struct sched_resource *res)
 {
-    unit->vcpu_list->processor = res->master_cpu;
+    unsigned int cpu = cpumask_first(res->cpus);
+    struct vcpu *v;
+
+    for_each_sched_unit_vcpu ( unit, v )
+    {
+        ASSERT(cpu < nr_cpu_ids);
+        v->processor = cpu;
+        cpu = cpumask_next(cpu, res->cpus);
+    }
+
     unit->res = res;
 }
 
@@ -114,25 +143,37 @@  static inline unsigned int sched_unit_cpu(const struct sched_unit *unit)
 static inline void sched_set_pause_flags(struct sched_unit *unit,
                                          unsigned int bit)
 {
-    __set_bit(bit, &unit->vcpu_list->pause_flags);
+    struct vcpu *v;
+
+    for_each_sched_unit_vcpu ( unit, v )
+        __set_bit(bit, &v->pause_flags);
 }
 
 static inline void sched_clear_pause_flags(struct sched_unit *unit,
                                            unsigned int bit)
 {
-    __clear_bit(bit, &unit->vcpu_list->pause_flags);
+    struct vcpu *v;
+
+    for_each_sched_unit_vcpu ( unit, v )
+        __clear_bit(bit, &v->pause_flags);
 }
 
 static inline void sched_set_pause_flags_atomic(struct sched_unit *unit,
                                                 unsigned int bit)
 {
-    set_bit(bit, &unit->vcpu_list->pause_flags);
+    struct vcpu *v;
+
+    for_each_sched_unit_vcpu ( unit, v )
+        set_bit(bit, &v->pause_flags);
 }
 
 static inline void sched_clear_pause_flags_atomic(struct sched_unit *unit,
                                                   unsigned int bit)
 {
-    clear_bit(bit, &unit->vcpu_list->pause_flags);
+    struct vcpu *v;
+
+    for_each_sched_unit_vcpu ( unit, v )
+        clear_bit(bit, &v->pause_flags);
 }
 
 static inline struct sched_unit *sched_idle_unit(unsigned int cpu)
@@ -458,12 +499,18 @@  static inline int sched_adjust_cpupool(const struct scheduler *s,
 
 static inline void sched_unit_pause_nosync(struct sched_unit *unit)
 {
-    vcpu_pause_nosync(unit->vcpu_list);
+    struct vcpu *v;
+
+    for_each_sched_unit_vcpu ( unit, v )
+        vcpu_pause_nosync(v);
 }
 
 static inline void sched_unit_unpause(struct sched_unit *unit)
 {
-    vcpu_unpause(unit->vcpu_list);
+    struct vcpu *v;
+
+    for_each_sched_unit_vcpu ( unit, v )
+        vcpu_unpause(v);
 }
 
 #define REGISTER_SCHEDULER(x) static const struct scheduler *x##_entry \