
[v3,8/9] KVM-GST: adjust scheduler cpu power

Message ID 1309361388-30163-9-git-send-email-glommer@redhat.com (mailing list archive)
State New, archived

Commit Message

Glauber Costa June 29, 2011, 3:29 p.m. UTC
This is a first proposal for using steal time information
to influence the scheduler. There are a lot of optimizations
and fine-grained adjustments still to be done, but it is working
reasonably well for me so far.

With this patch (and some host pinnings to demonstrate the situation),
two vcpus with very different steal time (say 80 % vs 1 %) will no longer
get an even distribution of processes. This is a situation that can
naturally arise, especially in overcommitted scenarios. Previously, the
guest scheduler would wrongly assume that all cpus have the same ability
to run processes, lowering the overall throughput.

Signed-off-by: Glauber Costa <glommer@redhat.com>
CC: Rik van Riel <riel@redhat.com>
CC: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Avi Kivity <avi@redhat.com>
CC: Anthony Liguori <aliguori@us.ibm.com>
CC: Eric B Munson <emunson@mgebm.net>
---
 arch/x86/Kconfig        |   12 ++++++++++++
 kernel/sched.c          |   44 ++++++++++++++++++++++++++++++++++----------
 kernel/sched_features.h |    4 ++--
 3 files changed, 48 insertions(+), 12 deletions(-)
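
The mechanism behind the discount: the stolen time computed in
update_rq_clock_task() is fed into sched_rt_avg_update(), so the load
balancer treats it like irq time when scaling each cpu's power. Below is
a simplified, self-contained sketch of that scaling, loosely modelled on
the existing scale_rt_power() logic; the function name and constants are
illustrative, not the kernel's exact code.

#include <stdint.h>
#include <stdio.h>

#define SCHED_LOAD_SCALE 1024ULL	/* nominal power of a fully available cpu */

/*
 * Simplified sketch: scale a cpu's power down by the fraction of the
 * averaging period that was not available for running tasks (irq time
 * plus, with this patch, steal time). Not the kernel's actual code.
 */
static uint64_t scaled_cpu_power(uint64_t period_ns, uint64_t nontask_ns)
{
	uint64_t available;

	if (nontask_ns >= period_ns)
		return 0;		/* nothing left for tasks */

	available = period_ns - nontask_ns;
	return (available * SCHED_LOAD_SCALE) / period_ns;
}

int main(void)
{
	/* vcpu with 80% steal vs. vcpu with 1% steal over a 100ms period */
	printf("80%% steal -> power %llu\n",
	       (unsigned long long)scaled_cpu_power(100000000ULL, 80000000ULL));
	printf(" 1%% steal -> power %llu\n",
	       (unsigned long long)scaled_cpu_power(100000000ULL, 1000000ULL));
	return 0;
}

For the 80 % vs 1 % example this yields roughly 204 vs 1013, which is what
makes the load balancer stop spreading tasks evenly across the two vcpus.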

Comments

Eric B Munson June 29, 2011, 9:57 p.m. UTC | #1
On Wed, 29 Jun 2011, Glauber Costa wrote:

> This is a first proposal for using steal time information
> to influence the scheduler. There are a lot of optimizations
> and fine-grained adjustments still to be done, but it is working
> reasonably well for me so far.
> 
> With this patch (and some host pinnings to demonstrate the situation),
> two vcpus with very different steal time (say 80 % vs 1 %) will no longer
> get an even distribution of processes. This is a situation that can
> naturally arise, especially in overcommitted scenarios. Previously, the
> guest scheduler would wrongly assume that all cpus have the same ability
> to run processes, lowering the overall throughput.
> 
> Signed-off-by: Glauber Costa <glommer@redhat.com>
> CC: Rik van Riel <riel@redhat.com>
> CC: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
> CC: Peter Zijlstra <peterz@infradead.org>
> CC: Avi Kivity <avi@redhat.com>
> CC: Anthony Liguori <aliguori@us.ibm.com>
> CC: Eric B Munson <emunson@mgebm.net>

Tested-by: Eric B Munson <emunson@mgebm.net>
Peter Zijlstra June 30, 2011, 9:54 p.m. UTC | #2
On Wed, 2011-06-29 at 11:29 -0400, Glauber Costa wrote:
> +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
> +       if (static_branch((&paravirt_steal_rq_enabled))) {
> +               int is_idle;
> +               u64 st;
> +
> +               is_idle = ((rq->curr != rq->idle) ||
> +                               irq_count() != HARDIRQ_OFFSET);

Now that hurt my brain. If the vcpu is idle, why does it want to run?
How can an idle vcpu ever rack up steal time?

Also, what's that HARDIRQ_OFFSET bit about? It's sorely lacking in
explanation, and the changelog of this patch is about as bad as the last
one.

> +               __touch_steal_time(is_idle, delta, &st);
> +
> +               steal = st * TICK_NSEC;
> +
> +               delta -= steal;
> +       }
> +#endif 
Peter Zijlstra June 30, 2011, 9:54 p.m. UTC | #3
On Wed, 2011-06-29 at 11:29 -0400, Glauber Costa wrote:
> +       return __touch_steal_time(is_idle, UINT_MAX, NULL);

That wants to be ULLONG_MAX, because max_steal is a u64; with UINT_MAX
the comparison:

+               if (steal > max_steal)

can still be true, so the compiler cannot optimize the branch away.
Also, make sure it does indeed optimize that branch away, otherwise
simply duplicate the code. This is on the very hottest scheduler paths;
branches do count.

Same for:

+               if (ticks)

with ticks == NULL, make sure it never emits code for the above call.

Ah, also, all of that requires that you strip the noinline crap you put on
it; if it cannot inline, it cannot assume anything about the input and will
thus have to emit all this nonsense.
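
To illustrate the point being made, here is a minimal sketch with made-up
names rather than the actual kernel helpers: when the wrapper is a static
inline and the limit is the type's maximum value, the compiler can prove
the clamp branch dead and drop it; against UINT_MAX a u64 can still exceed
the limit, and a noinline attribute would hide the constant from the
optimizer altogether.

#include <stdint.h>
#include <limits.h>

typedef uint64_t u64;

/*
 * Illustrative only -- not the kernel's code. Once inlined, a call with
 * max_steal == ULLONG_MAX lets the compiler see that "steal > max_steal"
 * can never be true for a u64, so the branch is deleted. With UINT_MAX
 * the branch must stay.
 */
static inline u64 clamp_steal(u64 steal, u64 max_steal)
{
	if (steal > max_steal)
		steal = max_steal;
	return steal;
}

u64 hot_path(u64 steal)
{
	return clamp_steal(steal, ULLONG_MAX);	/* branch folds away */
}

u64 hot_path_uintmax(u64 steal)
{
	return clamp_steal(steal, UINT_MAX);	/* branch remains */
}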



Patch

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index da34972..b26f312 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -512,6 +512,18 @@  menuconfig PARAVIRT_GUEST
 
 if PARAVIRT_GUEST
 
+config PARAVIRT_TIME_ACCOUNTING
+	bool "Paravirtual steal time accounting"
+	select PARAVIRT
+	default n
+	---help---
+	  Select this option to enable fine granularity task steal time 
+	  accounting. Time spent executing other tasks in parallel with
+	  the current vCPU is discounted from the vCPU power. To account for
+	  that, there can be a small performance impact.
+
+	  If in doubt, say N here.
+
 source "arch/x86/xen/Kconfig"
 
 config KVM_CLOCK
diff --git a/kernel/sched.c b/kernel/sched.c
index c166863..3ef0de9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1975,7 +1975,7 @@  static inline u64 steal_ticks(u64 steal)
  * tell the underlying hypervisor that we grabbed the data, but skip steal time
  * accounting
  */
-static noinline bool touch_steal_time(int is_idle)
+static noinline bool __touch_steal_time(int is_idle, u64 max_steal, u64 *ticks)
 {
 	u64 steal, st = 0;
 
@@ -1985,8 +1985,13 @@  static noinline bool touch_steal_time(int is_idle)
 
 		steal -= this_rq()->prev_steal_time;
 
+		if (steal > max_steal)
+			steal = max_steal;
+
 		st = steal_ticks(steal);
 		this_rq()->prev_steal_time += st * TICK_NSEC;
+		if (ticks)
+			*ticks = st;
 
 		if (is_idle || st == 0)
 			return false;
@@ -1997,10 +2002,16 @@  static noinline bool touch_steal_time(int is_idle)
 	return false;
 }
 
+static inline bool touch_steal_time(int is_idle)
+{
+	return __touch_steal_time(is_idle, UINT_MAX, NULL);
+}
+
 static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
-	s64 irq_delta;
+	s64 irq_delta = 0, steal = 0;
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
 	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
 
 	/*
@@ -2023,12 +2034,30 @@  static void update_rq_clock_task(struct rq *rq, s64 delta)
 
 	rq->prev_irq_time += irq_delta;
 	delta -= irq_delta;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+	if (static_branch((&paravirt_steal_rq_enabled))) {
+		int is_idle;
+		u64 st;
+
+		is_idle = ((rq->curr != rq->idle) ||
+				irq_count() != HARDIRQ_OFFSET);
+
+		__touch_steal_time(is_idle, delta, &st);
+
+		steal = st * TICK_NSEC;
+
+		delta -= steal;
+	}
+#endif
+
 	rq->clock_task += delta;
 
-	if (irq_delta && sched_feat(NONIRQ_POWER))
-		sched_rt_avg_update(rq, irq_delta);
+	if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
+		sched_rt_avg_update(rq, irq_delta + steal);
 }
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
 static int irqtime_account_hi_update(void)
 {
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
@@ -2063,12 +2092,7 @@  static int irqtime_account_si_update(void)
 
 #define sched_clock_irqtime	(0)
 
-static void update_rq_clock_task(struct rq *rq, s64 delta)
-{
-	rq->clock_task += delta;
-}
-
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+#endif
 
 #include "sched_idletask.c"
 #include "sched_fair.c"
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index be40f73..ca3b025 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -61,9 +61,9 @@  SCHED_FEAT(LB_BIAS, 1)
 SCHED_FEAT(OWNER_SPIN, 1)
 
 /*
- * Decrement CPU power based on irq activity
+ * Decrement CPU power based on time not spent running tasks
  */
-SCHED_FEAT(NONIRQ_POWER, 1)
+SCHED_FEAT(NONTASK_POWER, 1)
 
 /*
  * Queue remote wakeups on the target CPU and process them