diff mbox

[v3,3/7] xen: credit: consider tickled pCPUs as busy.

Message ID 149158420588.32558.740383499165076337.stgit@Solace.fritz.box (mailing list archive)
State New, archived
Headers show

Commit Message

Dario Faggioli April 7, 2017, 4:56 p.m. UTC
Currently, it can happen that __runq_tickle(),
running on pCPU 2 because vCPU x woke up, decides
to tickle pCPU 3, because it's idle. Just after
that, but before pCPU 3 manages to schedule and
pick up x, either __runq_tickel() or
__csched_cpu_pick(), running on pCPU 6, sees that
idle pCPUs are 0, 1 and also 3, and for whatever
reason it also chooses 3 for waking up (or
migrating) vCPU y.

When pCPU 3 goes through the scheduler, it will
pick up, say, vCPU x, and y will sit in its
runqueue, even if there are idle pCPUs.

Alleviate this by marking a pCPU to be idle
right away when tickling it (like, e.g., it happens
in Credit2).

Note that this does not eliminate the race. That
is not possible without introducing proper locking
for the cpumasks the scheduler uses. It significantly
reduces the window during which it can happen, though.

Introduce proper locking for the cpumask can, in
theory, be done, and may be investigated in future.
It is a significant amount of work to do it properly
(e.g., avoiding deadlock), and it is likely to adversely
affect scalability, and so it may be a path it is just
not worth following.

Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
---
Cc: George Dunlap <george.dunlap@eu.citrix.com>
Cc: Anshul Makkar <anshul.makkar@citrix.com>
---
Changes from v2:
* add comments to better explain the meaning of the added ASSERT()-s.
---
 xen/common/sched_credit.c |   32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

Comments

George Dunlap April 7, 2017, 5:12 p.m. UTC | #1
On Fri, Apr 7, 2017 at 5:56 PM, Dario Faggioli
<dario.faggioli@citrix.com> wrote:
> Currently, it can happen that __runq_tickle(),
> running on pCPU 2 because vCPU x woke up, decides
> to tickle pCPU 3, because it's idle. Just after
> that, but before pCPU 3 manages to schedule and
> pick up x, either __runq_tickel() or
> __csched_cpu_pick(), running on pCPU 6, sees that
> idle pCPUs are 0, 1 and also 3, and for whatever
> reason it also chooses 3 for waking up (or
> migrating) vCPU y.
>
> When pCPU 3 goes through the scheduler, it will
> pick up, say, vCPU x, and y will sit in its
> runqueue, even if there are idle pCPUs.
>
> Alleviate this by marking a pCPU to be idle
> right away when tickling it (like, e.g., it happens
> in Credit2).
>
> Note that this does not eliminate the race. That
> is not possible without introducing proper locking
> for the cpumasks the scheduler uses. It significantly
> reduces the window during which it can happen, though.
>
> Introduce proper locking for the cpumask can, in
> theory, be done, and may be investigated in future.
> It is a significant amount of work to do it properly
> (e.g., avoiding deadlock), and it is likely to adversely
> affect scalability, and so it may be a path it is just
> not worth following.
>
> Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
> ---
> Cc: George Dunlap <george.dunlap@eu.citrix.com>
> Cc: Anshul Makkar <anshul.makkar@citrix.com>
> ---
> Changes from v2:
> * add comments to better explain the meaning of the added ASSERT()-s.

Thanks.

Reviewed-by: George Dunlap <george.dunlap@citrix.com>

> ---
>  xen/common/sched_credit.c |   32 +++++++++++++++++++++++++++++---
>  1 file changed, 29 insertions(+), 3 deletions(-)
>
> diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c
> index 59ed4ca..7b4ea02 100644
> --- a/xen/common/sched_credit.c
> +++ b/xen/common/sched_credit.c
> @@ -489,7 +489,17 @@ static inline void __runq_tickle(struct csched_vcpu *new)
>                  __trace_var(TRC_CSCHED_TICKLE, 1, sizeof(cpu), &cpu);
>          }
>
> -        /* Send scheduler interrupts to designated CPUs */
> +        /*
> +         * Mark the designated CPUs as busy and send them all the scheduler
> +         * interrupt. We need the for_each_cpu for dealing with the
> +         * !opt_tickle_one_idle case. We must use cpumask_clear_cpu() and
> +         * can't use cpumask_andnot(), because prv->idlers needs atomic access.
> +         *
> +         * In the default (and most common) case, when opt_rickle_one_idle is
> +         * true, the loop does only one step, and only one bit is cleared.
> +         */
> +        for_each_cpu(cpu, &mask)
> +            cpumask_clear_cpu(cpu, prv->idlers);
>          cpumask_raise_softirq(&mask, SCHEDULE_SOFTIRQ);
>      }
>      else
> @@ -990,6 +1000,13 @@ csched_vcpu_acct(struct csched_private *prv, unsigned int cpu)
>              SCHED_VCPU_STAT_CRANK(svc, migrate_r);
>              SCHED_STAT_CRANK(migrate_running);
>              set_bit(_VPF_migrating, &current->pause_flags);
> +            /*
> +             * As we are about to tickle cpu, we should clear its bit in
> +             * idlers. But, if we are here, it means there is someone running
> +             * on it, and hence the bit must be zero already.
> +             */
> +            ASSERT(!cpumask_test_cpu(cpu,
> +                                     CSCHED_PRIV(per_cpu(scheduler, cpu))->idlers));
>              cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
>          }
>      }
> @@ -1082,13 +1099,22 @@ static void
>  csched_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
>  {
>      struct csched_vcpu * const svc = CSCHED_VCPU(vc);
> +    unsigned int cpu = vc->processor;
>
>      SCHED_STAT_CRANK(vcpu_sleep);
>
>      BUG_ON( is_idle_vcpu(vc) );
>
> -    if ( curr_on_cpu(vc->processor) == vc )
> -        cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
> +    if ( curr_on_cpu(cpu) == vc )
> +    {
> +        /*
> +         * We are about to tickle cpu, so we should clear its bit in idlers.
> +         * But, we are here because vc is going to sleep while running on cpu,
> +         * so the bit must be zero already.
> +         */
> +        ASSERT(!cpumask_test_cpu(cpu, CSCHED_PRIV(per_cpu(scheduler, cpu))->idlers));
> +        cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
> +    }
>      else if ( __vcpu_on_runq(svc) )
>          __runq_remove(svc);
>  }
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> https://lists.xen.org/xen-devel
diff mbox

Patch

diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c
index 59ed4ca..7b4ea02 100644
--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -489,7 +489,17 @@  static inline void __runq_tickle(struct csched_vcpu *new)
                 __trace_var(TRC_CSCHED_TICKLE, 1, sizeof(cpu), &cpu);
         }
 
-        /* Send scheduler interrupts to designated CPUs */
+        /*
+         * Mark the designated CPUs as busy and send them all the scheduler
+         * interrupt. We need the for_each_cpu for dealing with the
+         * !opt_tickle_one_idle case. We must use cpumask_clear_cpu() and
+         * can't use cpumask_andnot(), because prv->idlers needs atomic access.
+         *
+         * In the default (and most common) case, when opt_rickle_one_idle is
+         * true, the loop does only one step, and only one bit is cleared.
+         */
+        for_each_cpu(cpu, &mask)
+            cpumask_clear_cpu(cpu, prv->idlers);
         cpumask_raise_softirq(&mask, SCHEDULE_SOFTIRQ);
     }
     else
@@ -990,6 +1000,13 @@  csched_vcpu_acct(struct csched_private *prv, unsigned int cpu)
             SCHED_VCPU_STAT_CRANK(svc, migrate_r);
             SCHED_STAT_CRANK(migrate_running);
             set_bit(_VPF_migrating, &current->pause_flags);
+            /*
+             * As we are about to tickle cpu, we should clear its bit in
+             * idlers. But, if we are here, it means there is someone running
+             * on it, and hence the bit must be zero already.
+             */
+            ASSERT(!cpumask_test_cpu(cpu,
+                                     CSCHED_PRIV(per_cpu(scheduler, cpu))->idlers));
             cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
         }
     }
@@ -1082,13 +1099,22 @@  static void
 csched_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
 {
     struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+    unsigned int cpu = vc->processor;
 
     SCHED_STAT_CRANK(vcpu_sleep);
 
     BUG_ON( is_idle_vcpu(vc) );
 
-    if ( curr_on_cpu(vc->processor) == vc )
-        cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
+    if ( curr_on_cpu(cpu) == vc )
+    {
+        /*
+         * We are about to tickle cpu, so we should clear its bit in idlers.
+         * But, we are here because vc is going to sleep while running on cpu,
+         * so the bit must be zero already.
+         */
+        ASSERT(!cpumask_test_cpu(cpu, CSCHED_PRIV(per_cpu(scheduler, cpu))->idlers));
+        cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
+    }
     else if ( __vcpu_on_runq(svc) )
         __runq_remove(svc);
 }