diff mbox series

[v2,30/48] xen/sched: introduce unit_runnable_state()

Message ID 20190809145833.1020-31-jgross@suse.com (mailing list archive)
State Superseded
Headers show
Series xen: add core scheduling support | expand

Commit Message

Jürgen Groß Aug. 9, 2019, 2:58 p.m. UTC
Today the vcpu runstate of a newly scheduled vcpu is always set to
"running" even if at that time vcpu_runnable() is already returning
false due to a race (e.g. with pausing the vcpu).

With core scheduling this can no longer work, as not all vcpus of a
schedule unit will necessarily be "running" when the unit is scheduled.
So the vcpu's new runstate has to be selected at the same time as the
runnability of the related schedule unit is probed.

For this purpose introduce a new helper unit_runnable_state() which
will save the new runstate of all tested vcpus in a new field of the
vcpu struct.

Signed-off-by: Juergen Gross <jgross@suse.com>
---
RFC V2: new patch
---
 xen/common/domain.c         |  1 +
 xen/common/sched_arinc653.c |  2 +-
 xen/common/sched_credit.c   | 49 ++++++++++++++++++++++++---------------------
 xen/common/sched_credit2.c  |  7 ++++---
 xen/common/sched_null.c     |  3 ++-
 xen/common/sched_rt.c       |  8 +++++++-
 xen/common/schedule.c       |  2 +-
 xen/include/xen/sched-if.h  | 14 +++++++++++++
 xen/include/xen/sched.h     |  1 +
 9 files changed, 57 insertions(+), 30 deletions(-)

Comments

Jan Beulich Sept. 11, 2019, 10:30 a.m. UTC | #1
On 09.08.2019 16:58, Juergen Gross wrote:
> --- a/xen/common/schedule.c
> +++ b/xen/common/schedule.c
> @@ -266,7 +266,7 @@ static inline void sched_unit_runstate_change(struct sched_unit *unit,
>      struct vcpu *v = unit->vcpu_list;
>  
>      if ( running )
> -        vcpu_runstate_change(v, RUNSTATE_running, new_entry_time);
> +        vcpu_runstate_change(v, v->new_state, new_entry_time);

Strictly speaking this is wrong when there's no actual state
change, as the state entry time then shouldn't change. Quite
possibly this would be merely a cosmetic issue though.

> --- a/xen/include/xen/sched-if.h
> +++ b/xen/include/xen/sched-if.h
> @@ -75,6 +75,20 @@ static inline bool unit_runnable(const struct sched_unit *unit)
>      return vcpu_runnable(unit->vcpu_list);
>  }
>  
> +static inline bool unit_runnable_state(const struct sched_unit *unit)
> +{
> +    struct vcpu *v;
> +    bool runnable;
> +
> +    v = unit->vcpu_list;
> +    runnable = vcpu_runnable(v);
> +
> +    v->new_state = runnable ? RUNSTATE_running
> +                            : (v->pause_flags & VPF_blocked)
> +                              ? RUNSTATE_blocked : RUNSTATE_offline;
> +    return runnable;
> +}

Especially for understanding the (correctness of the) credit1
changes it would be rather helpful if once again this function
actually iterated over all vCPU-s right away (even if there's
only one per unit right now), to see how their varying states
get combined.

> --- a/xen/include/xen/sched.h
> +++ b/xen/include/xen/sched.h
> @@ -174,6 +174,7 @@ struct vcpu
>          XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat;
>      } runstate_guest; /* guest address */
>  #endif
> +    int              new_state;

I realize its counterpart (wrongly) is plain int in the public
interface - I think it should be unsigned int here and uint32_t
there. I'm pondering whether to do a swipe across all public
headers to replace all uses of plain int (and alike) with
fixed width types.

Jan
Dario Faggioli Sept. 12, 2019, 10:22 a.m. UTC | #2
On Wed, 2019-09-11 at 12:30 +0200, Jan Beulich wrote:
> On 09.08.2019 16:58, Juergen Gross wrote:
> > 
> > --- a/xen/include/xen/sched-if.h
> > +++ b/xen/include/xen/sched-if.h
> > @@ -75,6 +75,20 @@ static inline bool unit_runnable(const struct
> > sched_unit *unit)
> >      return vcpu_runnable(unit->vcpu_list);
> >  }
> >  
> > +static inline bool unit_runnable_state(const struct sched_unit
> > *unit)
> > +{
> > +    struct vcpu *v;
> > +    bool runnable;
> > +
> > +    v = unit->vcpu_list;
> > +    runnable = vcpu_runnable(v);
> > +
> > +    v->new_state = runnable ? RUNSTATE_running
> > +                            : (v->pause_flags & VPF_blocked)
> > +                              ? RUNSTATE_blocked :
> > RUNSTATE_offline;
> > +    return runnable;
> > +}
> 
> Especially for understanding the (correctness of the) credit1
> changes it would be rather helpful if once again this function
> actually iterated over all vCPU-s right away (even if there's
> only one per unit right now), to see how their varying states
> get combined.
> 
Yep, I agree.

Regards
Dario Faggioli Sept. 12, 2019, 10:24 a.m. UTC | #3
On Fri, 2019-08-09 at 16:58 +0200, Juergen Gross wrote:
> Today the vcpu runstate of a new scheduled vcpu is always set to
> "running" even if at that time vcpu_runnable() is already returning
> false due to a race (e.g. with pausing the vcpu).
> 
> With core scheduling this can no longer work as not all vcpus of a
> schedule unit have to be "running" when being scheduled. So the
> vcpu's
> new runstate has to be selected at the same time as the runnability
> of
> the related schedule unit is probed.
> 
So, you're saying that this patch is meant at fixing a race. A race
that, if I understood correctly, it's not there/it's not problematic
right now, but will be when we have more than 1 vcpu in a unit and we
enable core scheduling. Is this the case?

If yes:
- this very patch, at this point in the series, is basically
introducing no functional (or at least behavioral) change, is this
right too?
- can you provide some more detail about the race. When/how it occurs
and how the changes done in credit and rt fix it?

Thanks and Regards
Jürgen Groß Sept. 13, 2019, 2:07 p.m. UTC | #4
On 11.09.19 12:30, Jan Beulich wrote:
> On 09.08.2019 16:58, Juergen Gross wrote:
>> --- a/xen/common/schedule.c
>> +++ b/xen/common/schedule.c
>> @@ -266,7 +266,7 @@ static inline void sched_unit_runstate_change(struct sched_unit *unit,
>>       struct vcpu *v = unit->vcpu_list;
>>   
>>       if ( running )
>> -        vcpu_runstate_change(v, RUNSTATE_running, new_entry_time);
>> +        vcpu_runstate_change(v, v->new_state, new_entry_time);
> 
> Strictly speaking this is wrong when there's no actual state
> change, as the state entry time then shouldn't change. Quite
> possibly this would be merely a cosmetic issue though.

This will be changed in vcpu_runstate_change() with patch 31 when this
situation is actually possible. With only one vcpu in a unit the state
will always change here, while after the next patch
vcpu_runstate_change() will return early in case the state isn't
changing.

> 
>> --- a/xen/include/xen/sched-if.h
>> +++ b/xen/include/xen/sched-if.h
>> @@ -75,6 +75,20 @@ static inline bool unit_runnable(const struct sched_unit *unit)
>>       return vcpu_runnable(unit->vcpu_list);
>>   }
>>   
>> +static inline bool unit_runnable_state(const struct sched_unit *unit)
>> +{
>> +    struct vcpu *v;
>> +    bool runnable;
>> +
>> +    v = unit->vcpu_list;
>> +    runnable = vcpu_runnable(v);
>> +
>> +    v->new_state = runnable ? RUNSTATE_running
>> +                            : (v->pause_flags & VPF_blocked)
>> +                              ? RUNSTATE_blocked : RUNSTATE_offline;
>> +    return runnable;
>> +}
> 
> Especially for understanding the (correctness of the) credit1
> changes it would be rather helpful if once again this function
> actually iterated over all vCPU-s right away (even if there's
> only one per unit right now), to see how their varying states
> get combined.

Okay, will move it.

> 
>> --- a/xen/include/xen/sched.h
>> +++ b/xen/include/xen/sched.h
>> @@ -174,6 +174,7 @@ struct vcpu
>>           XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat;
>>       } runstate_guest; /* guest address */
>>   #endif
>> +    int              new_state;
> 
> I realize its counterpart (wrongly) is plain int in the public
> interface - I think it should be unsigned int here and uint32_t
> there. I'm pondering whether to do a swipe across all public
> headers to replace all uses of plain int (and alike) with
> fixed width types.

The list for cleanups is becoming longer...

So are you fine with me not changing anything in this regard right now?


Juergen
Jürgen Groß Sept. 13, 2019, 2:14 p.m. UTC | #5
On 12.09.19 12:24, Dario Faggioli wrote:
> On Fri, 2019-08-09 at 16:58 +0200, Juergen Gross wrote:
>> Today the vcpu runstate of a new scheduled vcpu is always set to
>> "running" even if at that time vcpu_runnable() is already returning
>> false due to a race (e.g. with pausing the vcpu).
>>
>> With core scheduling this can no longer work as not all vcpus of a
>> schedule unit have to be "running" when being scheduled. So the
>> vcpu's
>> new runstate has to be selected at the same time as the runnability
>> of
>> the related schedule unit is probed.
>>
> So, you're saying that this patch is meant at fixing a race. A race
> that, if I understood correctly, it's not there/it's not problematic
> right now, but will be when we have more than 1 vcpu in a unit and we
> enable core scheduling. Is this the case?

Correct.

> 
> If yes:
> - this very patch, at this point in the series, is basically
> introducing no functional (or at least behavioral) change, is this
> right too?

Yes.

> - can you provide some more detail about the race. When/how it occurs
> and how the changes done in credit and rt fix it?

The problem is that with more than one vcpu in a unit you can no longer
tell the new run_state from the mere fact that the unit has been
selected to run next. So testing e.g. vcpu_runnable() after selecting
a unit for running could result in all vcpus not being runnable any
longer (e.g. when a domain was paused via domain_pause_nosync()
after selecting the next unit, but before setting any vcpu's run_state).

In order to avoid that problem the new run_state of the vcpus of a unit
needs to be determined when doing the decision which unit to select.


Juergen
Jan Beulich Sept. 13, 2019, 2:44 p.m. UTC | #6
On 13.09.2019 16:07, Juergen Gross wrote:
> On 11.09.19 12:30, Jan Beulich wrote:
>> On 09.08.2019 16:58, Juergen Gross wrote:
>>> --- a/xen/include/xen/sched.h
>>> +++ b/xen/include/xen/sched.h
>>> @@ -174,6 +174,7 @@ struct vcpu
>>>           XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat;
>>>       } runstate_guest; /* guest address */
>>>   #endif
>>> +    int              new_state;
>>
>> I realize its counterpart (wrongly) is plain int in the public
>> interface - I think it should be unsigned int here and uint32_t
>> there. I'm pondering whether to do a swipe across all public
>> headers to replace all uses of plain int (and alike) with
>> fixed width types.
> 
> The list for cleanups is becoming longer...
> 
> So are you fine with me not changing anything in this regard right now?

Well, it wouldn't be bad if the new field was of appropriate type.
But given the state of things elsewhere I wouldn't insist.

Jan
Jürgen Groß Sept. 13, 2019, 3:23 p.m. UTC | #7
On 13.09.19 16:44, Jan Beulich wrote:
> On 13.09.2019 16:07, Juergen Gross wrote:
>> On 11.09.19 12:30, Jan Beulich wrote:
>>> On 09.08.2019 16:58, Juergen Gross wrote:
>>>> --- a/xen/include/xen/sched.h
>>>> +++ b/xen/include/xen/sched.h
>>>> @@ -174,6 +174,7 @@ struct vcpu
>>>>            XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat;
>>>>        } runstate_guest; /* guest address */
>>>>    #endif
>>>> +    int              new_state;
>>>
>>> I realize its counterpart (wrongly) is plain int in the public
>>> interface - I think it should be unsigned int here and uint32_t
>>> there. I'm pondering whether to do a swipe across all public
>>> headers to replace all uses of plain int (and alike) with
>>> fixed width types.
>>
>> The list for cleanups is becoming longer...
>>
>> So are you fine with me not changing anything in this regard right now?
> 
> Well, it wouldn't be bad if the new field was of appropriate type.

Okay, changed it.


Juergen
diff mbox series

Patch

diff --git a/xen/common/domain.c b/xen/common/domain.c
index 863b7cae35..3797f954f5 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -157,6 +157,7 @@  struct vcpu *vcpu_create(struct domain *d, unsigned int vcpu_id)
     if ( is_idle_domain(d) )
     {
         v->runstate.state = RUNSTATE_running;
+        v->new_state = RUNSTATE_running;
     }
     else
     {
diff --git a/xen/common/sched_arinc653.c b/xen/common/sched_arinc653.c
index 34efcc07c9..759c42ab73 100644
--- a/xen/common/sched_arinc653.c
+++ b/xen/common/sched_arinc653.c
@@ -557,7 +557,7 @@  a653sched_do_schedule(
     if ( !((new_task != NULL)
            && (AUNIT(new_task) != NULL)
            && AUNIT(new_task)->awake
-           && unit_runnable(new_task)) )
+           && unit_runnable_state(new_task)) )
         new_task = IDLETASK(cpu);
     BUG_ON(new_task == NULL);
 
diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c
index f1675fd52e..aeb659577e 100644
--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -1894,7 +1894,7 @@  static void csched_schedule(
     if ( !test_bit(CSCHED_FLAG_UNIT_YIELD, &scurr->flags)
          && !tasklet_work_scheduled
          && prv->ratelimit
-         && unit_runnable(unit)
+         && unit_runnable_state(unit)
          && !is_idle_unit(unit)
          && runtime < prv->ratelimit )
     {
@@ -1939,33 +1939,36 @@  static void csched_schedule(
         dec_nr_runnable(sched_cpu);
     }
 
-    snext = __runq_elem(runq->next);
-
-    /* Tasklet work (which runs in idle UNIT context) overrides all else. */
-    if ( tasklet_work_scheduled )
-    {
-        TRACE_0D(TRC_CSCHED_SCHED_TASKLET);
-        snext = CSCHED_UNIT(sched_idle_unit(sched_cpu));
-        snext->pri = CSCHED_PRI_TS_BOOST;
-    }
-
     /*
      * Clear YIELD flag before scheduling out
      */
     clear_bit(CSCHED_FLAG_UNIT_YIELD, &scurr->flags);
 
-    /*
-     * SMP Load balance:
-     *
-     * If the next highest priority local runnable UNIT has already eaten
-     * through its credits, look on other PCPUs to see if we have more
-     * urgent work... If not, csched_load_balance() will return snext, but
-     * already removed from the runq.
-     */
-    if ( snext->pri > CSCHED_PRI_TS_OVER )
-        __runq_remove(snext);
-    else
-        snext = csched_load_balance(prv, sched_cpu, snext, &migrated);
+    do {
+        snext = __runq_elem(runq->next);
+
+        /* Tasklet work (which runs in idle UNIT context) overrides all else. */
+        if ( tasklet_work_scheduled )
+        {
+            TRACE_0D(TRC_CSCHED_SCHED_TASKLET);
+            snext = CSCHED_UNIT(sched_idle_unit(sched_cpu));
+            snext->pri = CSCHED_PRI_TS_BOOST;
+        }
+
+        /*
+         * SMP Load balance:
+         *
+         * If the next highest priority local runnable UNIT has already eaten
+         * through its credits, look on other PCPUs to see if we have more
+         * urgent work... If not, csched_load_balance() will return snext, but
+         * already removed from the runq.
+         */
+        if ( snext->pri > CSCHED_PRI_TS_OVER )
+            __runq_remove(snext);
+        else
+            snext = csched_load_balance(prv, sched_cpu, snext, &migrated);
+
+    } while ( !unit_runnable_state(snext->unit) );
 
     /*
      * Update idlers mask if necessary. When we're idling, other CPUs
diff --git a/xen/common/sched_credit2.c b/xen/common/sched_credit2.c
index 98ef48d6f4..af06d02056 100644
--- a/xen/common/sched_credit2.c
+++ b/xen/common/sched_credit2.c
@@ -3289,7 +3289,7 @@  runq_candidate(struct csched2_runqueue_data *rqd,
      * In fact, it may be the case that scurr is about to spin, and there's
      * no point forcing it to do so until rate limiting expires.
      */
-    if ( !yield && prv->ratelimit_us && unit_runnable(scurr->unit) &&
+    if ( !yield && prv->ratelimit_us && unit_runnable_state(scurr->unit) &&
          (now - scurr->unit->state_entry_time) < MICROSECS(prv->ratelimit_us) )
     {
         if ( unlikely(tb_init_done) )
@@ -3343,7 +3343,7 @@  runq_candidate(struct csched2_runqueue_data *rqd,
      *
      * Of course, we also default to idle also if scurr is not runnable.
      */
-    if ( unit_runnable(scurr->unit) && !soft_aff_preempt )
+    if ( unit_runnable_state(scurr->unit) && !soft_aff_preempt )
         snext = scurr;
     else
         snext = csched2_unit(sched_idle_unit(cpu));
@@ -3403,7 +3403,8 @@  runq_candidate(struct csched2_runqueue_data *rqd,
          * some budget, then choose it.
          */
         if ( (yield || svc->credit > snext->credit) &&
-             (!has_cap(svc) || unit_grab_budget(svc)) )
+             (!has_cap(svc) || unit_grab_budget(svc)) &&
+             unit_runnable_state(svc->unit) )
             snext = svc;
 
         /* In any case, if we got this far, break. */
diff --git a/xen/common/sched_null.c b/xen/common/sched_null.c
index 397edcbc83..755d3e448c 100644
--- a/xen/common/sched_null.c
+++ b/xen/common/sched_null.c
@@ -864,7 +864,8 @@  static void null_schedule(const struct scheduler *ops, struct sched_unit *prev,
             cpumask_set_cpu(cpu, &prv->cpus_free);
     }
 
-    if ( unlikely(prev->next_task == NULL || !unit_runnable(prev->next_task)) )
+    if ( unlikely(prev->next_task == NULL ||
+                  !unit_runnable_state(prev->next_task)) )
         prev->next_task = sched_idle_unit(sched_cpu);
 
     NULL_UNIT_CHECK(prev->next_task);
diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c
index fcbfa528f4..205ff13c09 100644
--- a/xen/common/sched_rt.c
+++ b/xen/common/sched_rt.c
@@ -1093,12 +1093,18 @@  rt_schedule(const struct scheduler *ops, struct sched_unit *currunit,
     else
     {
         snext = runq_pick(ops, cpumask_of(sched_cpu));
+
         if ( snext == NULL )
             snext = rt_unit(sched_idle_unit(sched_cpu));
+        else if ( !unit_runnable_state(snext->unit) )
+        {
+            q_remove(snext);
+            snext = rt_unit(sched_idle_unit(sched_cpu));
+        }
 
         /* if scurr has higher priority and budget, still pick scurr */
         if ( !is_idle_unit(currunit) &&
-             unit_runnable(currunit) &&
+             unit_runnable_state(currunit) &&
              scurr->cur_budget > 0 &&
              ( is_idle_unit(snext->unit) ||
                compare_unit_priority(scurr, snext) > 0 ) )
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index 0160c30465..2ecb76e3b9 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -266,7 +266,7 @@  static inline void sched_unit_runstate_change(struct sched_unit *unit,
     struct vcpu *v = unit->vcpu_list;
 
     if ( running )
-        vcpu_runstate_change(v, RUNSTATE_running, new_entry_time);
+        vcpu_runstate_change(v, v->new_state, new_entry_time);
     else
         vcpu_runstate_change(v,
             ((v->pause_flags & VPF_blocked) ? RUNSTATE_blocked :
diff --git a/xen/include/xen/sched-if.h b/xen/include/xen/sched-if.h
index f810a9fba1..aa896f49ef 100644
--- a/xen/include/xen/sched-if.h
+++ b/xen/include/xen/sched-if.h
@@ -75,6 +75,20 @@  static inline bool unit_runnable(const struct sched_unit *unit)
     return vcpu_runnable(unit->vcpu_list);
 }
 
+static inline bool unit_runnable_state(const struct sched_unit *unit)
+{
+    struct vcpu *v;
+    bool runnable;
+
+    v = unit->vcpu_list;
+    runnable = vcpu_runnable(v);
+
+    v->new_state = runnable ? RUNSTATE_running
+                            : (v->pause_flags & VPF_blocked)
+                              ? RUNSTATE_blocked : RUNSTATE_offline;
+    return runnable;
+}
+
 static inline void sched_set_res(struct sched_unit *unit,
                                  struct sched_resource *res)
 {
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 8f4e389b3b..7585bd81a2 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -174,6 +174,7 @@  struct vcpu
         XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat;
     } runstate_guest; /* guest address */
 #endif
+    int              new_state;
 
     /* Has the FPU been initialised? */
     bool             fpu_initialised;