diff mbox

[RFT,v4,4/7] cpuidle: Return nohz hint from cpuidle_select()

Message ID 15847301.VVmNzm1RNz@aspire.rjw.lan (mailing list archive)
State Superseded, archived
Delegated to: Rafael Wysocki
Headers show

Commit Message

Rafael J. Wysocki March 12, 2018, 9:54 a.m. UTC
From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

Add a new pointer argument to cpuidle_select() and to the ->select
cpuidle governor callback to allow a boolean value indicating
whether or not the tick should be stopped before entering the
selected state to be returned from there.

Make the ladder governor ignore that pointer (to preserve its
current behavior) and make the menu governor return 'false" through
it if:
 (1) the idle exit latency is constrained at 0,
 (2) the selected state is a polling one, or
 (3) the target residency of the selected state is less than the
     length of the tick period and there is at least one deeper
     idle state available within the tick period range.

Since the value returned through the new argument pointer is not
used yet, this change is not expected to alter the functionality of
the code.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpuidle/cpuidle.c          |   10 ++++++++--
 drivers/cpuidle/governors/ladder.c |    3 ++-
 drivers/cpuidle/governors/menu.c   |   32 +++++++++++++++++++++++++++++---
 include/linux/cpuidle.h            |    8 +++++---
 kernel/sched/idle.c                |    4 +++-
 5 files changed, 47 insertions(+), 10 deletions(-)

Comments

Peter Zijlstra March 14, 2018, 12:59 p.m. UTC | #1
On Mon, Mar 12, 2018 at 10:54:18AM +0100, Rafael J. Wysocki wrote:
> @@ -378,6 +384,26 @@ static int menu_select(struct cpuidle_dr
>  	if (idx == -1)
>  		idx = 0; /* No states enabled. Must use 0. */
>  
> +	if (drv->states[idx].flags & CPUIDLE_FLAG_POLLING) {
> +		*nohz_ret = false;
> +	} else if (drv->states[idx].target_residency < TICK_USEC_HZ) {
> +		first_idx = idx;
> +		for (i = idx + 1; i < drv->state_count; i++) {
> +			if (!drv->states[i].disabled &&
> +			    !dev->states_usage[i].disable) {
> +				first_idx = i;
> +				break;
> +			}
		}
> +
> +		/*
> +		 * Do not stop the tick if there is at least one more state
> +		 * within the tick period range that could be used if longer
> +		 * idle duration was predicted.
> +		 */
> +		*nohz_ret = !(first_idx > idx &&
> +			      drv->states[first_idx].target_residency < TICK_USEC_HZ);


Why though? That comment only states what it does, but gives no clue as
to why we're doing this. What was wrong with disabling NOHZ if the
selected state (@idx) has shorter target residency.


> +	}
> +
>  	data->last_state_idx = idx;
>  
>  	return data->last_state_idx;
>
Rafael J. Wysocki March 15, 2018, 12:54 p.m. UTC | #2
On Wednesday, March 14, 2018 1:59:29 PM CET Peter Zijlstra wrote:
> On Mon, Mar 12, 2018 at 10:54:18AM +0100, Rafael J. Wysocki wrote:
> > @@ -378,6 +384,26 @@ static int menu_select(struct cpuidle_dr
> >  	if (idx == -1)
> >  		idx = 0; /* No states enabled. Must use 0. */
> >  
> > +	if (drv->states[idx].flags & CPUIDLE_FLAG_POLLING) {
> > +		*nohz_ret = false;
> > +	} else if (drv->states[idx].target_residency < TICK_USEC_HZ) {
> > +		first_idx = idx;
> > +		for (i = idx + 1; i < drv->state_count; i++) {
> > +			if (!drv->states[i].disabled &&
> > +			    !dev->states_usage[i].disable) {
> > +				first_idx = i;
> > +				break;
> > +			}
> 		}
> > +
> > +		/*
> > +		 * Do not stop the tick if there is at least one more state
> > +		 * within the tick period range that could be used if longer
> > +		 * idle duration was predicted.
> > +		 */
> > +		*nohz_ret = !(first_idx > idx &&
> > +			      drv->states[first_idx].target_residency < TICK_USEC_HZ);
> 
> 
> Why though? That comment only states what it does, but gives no clue as
> to why we're doing this. What was wrong with disabling NOHZ if the
> selected state (@idx) has shorter target residency.
> 
> 
> > +	}
> > +
> >  	data->last_state_idx = idx;
> >  
> >  	return data->last_state_idx;
> > 
> 

I invented this complicated logic because of the concern that using
predicted_us alone may skew the prediction algotithm too much towards
the lower values, so the idea was (roughly) to allow CPUs to be idle
for longer times more aggressively.  That is, to stop the tick even
in some cases in which predicted_us was below the tick period length,
but then again not too often.

It appears to work, but you are right that it is confusing and on
a second thought simpler code is always better as long as it gets the
job done.

I'll rework this and resend the series later today.

Thanks!
diff mbox

Patch

Index: linux-pm/include/linux/cpuidle.h
===================================================================
--- linux-pm.orig/include/linux/cpuidle.h
+++ linux-pm/include/linux/cpuidle.h
@@ -131,7 +131,8 @@  extern bool cpuidle_not_available(struct
 				  struct cpuidle_device *dev);
 
 extern int cpuidle_select(struct cpuidle_driver *drv,
-			  struct cpuidle_device *dev);
+			  struct cpuidle_device *dev,
+			  bool *nohz_ret);
 extern int cpuidle_enter(struct cpuidle_driver *drv,
 			 struct cpuidle_device *dev, int index);
 extern void cpuidle_reflect(struct cpuidle_device *dev, int index);
@@ -163,7 +164,7 @@  static inline bool cpuidle_not_available
 					 struct cpuidle_device *dev)
 {return true; }
 static inline int cpuidle_select(struct cpuidle_driver *drv,
-				 struct cpuidle_device *dev)
+				 struct cpuidle_device *dev, bool *nohz_ret)
 {return -ENODEV; }
 static inline int cpuidle_enter(struct cpuidle_driver *drv,
 				struct cpuidle_device *dev, int index)
@@ -246,7 +247,8 @@  struct cpuidle_governor {
 					struct cpuidle_device *dev);
 
 	int  (*select)		(struct cpuidle_driver *drv,
-					struct cpuidle_device *dev);
+					struct cpuidle_device *dev,
+					bool *nohz_ret);
 	void (*reflect)		(struct cpuidle_device *dev, int index);
 };
 
Index: linux-pm/kernel/sched/idle.c
===================================================================
--- linux-pm.orig/kernel/sched/idle.c
+++ linux-pm/kernel/sched/idle.c
@@ -188,13 +188,15 @@  static void cpuidle_idle_call(void)
 		next_state = cpuidle_find_deepest_state(drv, dev);
 		call_cpuidle(drv, dev, next_state);
 	} else {
+		bool nohz = true;
+
 		tick_nohz_idle_stop_tick();
 		rcu_idle_enter();
 
 		/*
 		 * Ask the cpuidle framework to choose a convenient idle state.
 		 */
-		next_state = cpuidle_select(drv, dev);
+		next_state = cpuidle_select(drv, dev, &nohz);
 		entered_state = call_cpuidle(drv, dev, next_state);
 		/*
 		 * Give the governor an opportunity to reflect on the outcome
Index: linux-pm/drivers/cpuidle/cpuidle.c
===================================================================
--- linux-pm.orig/drivers/cpuidle/cpuidle.c
+++ linux-pm/drivers/cpuidle/cpuidle.c
@@ -263,12 +263,18 @@  int cpuidle_enter_state(struct cpuidle_d
  *
  * @drv: the cpuidle driver
  * @dev: the cpuidle device
+ * @nohz_ret: indication on whether or not to stop the tick
  *
  * Returns the index of the idle state.  The return value must not be negative.
+ *
+ * The memory location pointed to by @nohz_ret is expected to be written the
+ * 'false' boolean value if the scheduler tick should not be stopped before
+ * entering the returned state.
  */
-int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+		   bool *nohz_ret)
 {
-	return cpuidle_curr_governor->select(drv, dev);
+	return cpuidle_curr_governor->select(drv, dev, nohz_ret);
 }
 
 /**
Index: linux-pm/drivers/cpuidle/governors/ladder.c
===================================================================
--- linux-pm.orig/drivers/cpuidle/governors/ladder.c
+++ linux-pm/drivers/cpuidle/governors/ladder.c
@@ -63,9 +63,10 @@  static inline void ladder_do_selection(s
  * ladder_select_state - selects the next state to enter
  * @drv: cpuidle driver
  * @dev: the CPU
+ * @dummy: not used
  */
 static int ladder_select_state(struct cpuidle_driver *drv,
-				struct cpuidle_device *dev)
+			       struct cpuidle_device *dev, bool *dummy)
 {
 	struct ladder_device *ldev = this_cpu_ptr(&ladder_devices);
 	struct device *device = get_cpu_device(dev->cpu);
Index: linux-pm/drivers/cpuidle/governors/menu.c
===================================================================
--- linux-pm.orig/drivers/cpuidle/governors/menu.c
+++ linux-pm/drivers/cpuidle/governors/menu.c
@@ -275,12 +275,16 @@  again:
 	goto again;
 }
 
+#define TICK_USEC_HZ   ((USEC_PER_SEC + HZ/2) / HZ)
+
 /**
  * menu_select - selects the next idle state to enter
  * @drv: cpuidle driver containing state data
  * @dev: the CPU
+ * @nohz_ret: indication on whether or not to stop the tick
  */
-static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+		       bool *nohz_ret)
 {
 	struct menu_device *data = this_cpu_ptr(&menu_devices);
 	struct device *device = get_cpu_device(dev->cpu);
@@ -303,8 +307,10 @@  static int menu_select(struct cpuidle_dr
 		latency_req = resume_latency;
 
 	/* Special case when user has set very strict latency requirement */
-	if (unlikely(latency_req == 0))
+	if (unlikely(latency_req == 0)) {
+		*nohz_ret = false;
 		return 0;
+	}
 
 	/* determine the expected residency time, round up */
 	data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length());
@@ -369,7 +375,7 @@  static int menu_select(struct cpuidle_dr
 			idx = i; /* first enabled state */
 		if (s->target_residency > data->predicted_us)
 			break;
-		if (s->exit_latency > latency_req)
+	        if (s->exit_latency > latency_req)
 			break;
 
 		idx = i;
@@ -378,6 +384,26 @@  static int menu_select(struct cpuidle_dr
 	if (idx == -1)
 		idx = 0; /* No states enabled. Must use 0. */
 
+	if (drv->states[idx].flags & CPUIDLE_FLAG_POLLING) {
+		*nohz_ret = false;
+	} else if (drv->states[idx].target_residency < TICK_USEC_HZ) {
+		first_idx = idx;
+		for (i = idx + 1; i < drv->state_count; i++)
+			if (!drv->states[i].disabled &&
+			    !dev->states_usage[i].disable) {
+				first_idx = i;
+				break;
+			}
+
+		/*
+		 * Do not stop the tick if there is at least one more state
+		 * within the tick period range that could be used if longer
+		 * idle duration was predicted.
+		 */
+		*nohz_ret = !(first_idx > idx &&
+			      drv->states[first_idx].target_residency < TICK_USEC_HZ);
+	}
+
 	data->last_state_idx = idx;
 
 	return data->last_state_idx;