
[v5,5/5] intel_idle: Add S0ix validation

Message ID 20170708000303.21863-5-dbasehore@chromium.org (mailing list archive)
State RFC, archived

Commit Message

Derek Basehore July 8, 2017, 12:03 a.m. UTC
This adds validation of S0ix entry and enables it on Skylake. Using
the new tick_set_freeze_event function, we program the CPU to wake up
X seconds after entering freeze. After X seconds, it will wake the CPU
to check the S0ix residency counters and make sure we entered the
lowest power state for suspend-to-idle.

It exits freeze and reports an error to userspace when the SoC does
not enter S0ix on suspend-to-idle.

One example of a bug that can prevent a Skylake CPU from entering S0ix
(suspend-to-idle) is a leaked reference count to one of the i915 power
wells. The CPU will not be able to enter Package C10 and will
therefore use about 4x as much power for the entire system. The issue
is not specific to the i915 power wells though.

Signed-off-by: Derek Basehore <dbasehore@chromium.org>
---
 drivers/idle/intel_idle.c | 142 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 134 insertions(+), 8 deletions(-)

Comments

kernel test robot July 9, 2017, 7:13 a.m. UTC | #1
Hi Derek,

[auto build test ERROR on pm/linux-next]
[also build test ERROR on next-20170707]
[cannot apply to v4.12]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Derek-Basehore/x86-stub-out-pmc-function/20170709-134714
base:   https://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git linux-next
config: x86_64-randconfig-x015-201728 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All errors (new ones prefixed by >>):

   drivers/idle/intel_idle.c: In function 'intel_idle_freeze_and_check':
>> drivers/idle/intel_idle.c:1022:9: error: implicit declaration of function 'tick_set_freeze_event' [-Werror=implicit-function-declaration]
      ret = tick_set_freeze_event(cpu, ktime_set(slp_s0_seconds, 0));
            ^~~~~~~~~~~~~~~~~~~~~
>> drivers/idle/intel_idle.c:1042:27: error: implicit declaration of function 'tick_clear_freeze_event' [-Werror=implicit-function-declaration]
     if (check_on_this_cpu && tick_clear_freeze_event(cpu))
                              ^~~~~~~~~~~~~~~~~~~~~~~
   Cyclomatic Complexity 2 arch/x86/include/asm/bitops.h:set_bit
   Cyclomatic Complexity 2 arch/x86/include/asm/bitops.h:clear_bit
   Cyclomatic Complexity 1 arch/x86/include/asm/bitops.h:constant_test_bit
   Cyclomatic Complexity 1 arch/x86/include/asm/bitops.h:variable_test_bit
   Cyclomatic Complexity 1 arch/x86/include/asm/current.h:get_current
   Cyclomatic Complexity 1 arch/x86/include/asm/paravirt.h:__cpuid
   Cyclomatic Complexity 1 arch/x86/include/asm/paravirt.h:paravirt_read_msr
   Cyclomatic Complexity 1 arch/x86/include/asm/paravirt.h:paravirt_write_msr
   Cyclomatic Complexity 1 arch/x86/include/asm/paravirt.h:wrmsrl
   Cyclomatic Complexity 1 arch/x86/include/asm/paravirt.h:arch_local_save_flags
   Cyclomatic Complexity 1 arch/x86/include/asm/paravirt.h:arch_local_irq_restore
   Cyclomatic Complexity 1 arch/x86/include/asm/paravirt.h:arch_local_irq_disable
   Cyclomatic Complexity 1 arch/x86/include/asm/paravirt.h:arch_local_irq_save
   Cyclomatic Complexity 1 arch/x86/include/asm/special_insns.h:clflush
   Cyclomatic Complexity 1 include/linux/math64.h:div64_u64
   Cyclomatic Complexity 1 arch/x86/include/asm/processor.h:cpuid
   Cyclomatic Complexity 3 arch/x86/include/asm/cpufeature.h:_static_cpu_has
   Cyclomatic Complexity 1 include/linux/thread_info.h:set_ti_thread_flag
   Cyclomatic Complexity 1 include/linux/thread_info.h:clear_ti_thread_flag
   Cyclomatic Complexity 2 include/linux/thread_info.h:test_ti_thread_flag
   Cyclomatic Complexity 1 arch/x86/include/asm/preempt.h:set_preempt_need_resched
   Cyclomatic Complexity 5 arch/x86/include/asm/preempt.h:__preempt_count_add
   Cyclomatic Complexity 5 arch/x86/include/asm/preempt.h:__preempt_count_sub
   Cyclomatic Complexity 1 include/linux/spinlock.h:spinlock_check
   Cyclomatic Complexity 1 include/linux/spinlock.h:spin_unlock_irqrestore
   Cyclomatic Complexity 2 include/linux/ktime.h:ktime_set
   Cyclomatic Complexity 1 include/linux/sched.h:need_resched
   Cyclomatic Complexity 1 include/linux/tick.h:tick_broadcast_enable
   Cyclomatic Complexity 1 include/linux/tick.h:tick_broadcast_disable
   Cyclomatic Complexity 1 include/linux/tick.h:tick_broadcast_enter
   Cyclomatic Complexity 1 include/linux/tick.h:tick_broadcast_exit
   Cyclomatic Complexity 1 arch/x86/include/asm/mmu.h:leave_mm
   Cyclomatic Complexity 1 include/linux/cpuhotplug.h:cpuhp_setup_state
   Cyclomatic Complexity 1 include/linux/suspend.h:register_pm_notifier
   Cyclomatic Complexity 1 include/linux/suspend.h:unregister_pm_notifier
   Cyclomatic Complexity 1 include/linux/sched/idle.h:__current_set_polling
   Cyclomatic Complexity 1 include/linux/sched/idle.h:current_set_polling_and_test
   Cyclomatic Complexity 1 include/linux/sched/idle.h:__current_clr_polling
   Cyclomatic Complexity 2 include/linux/sched/idle.h:current_clr_polling
   Cyclomatic Complexity 1 arch/x86/include/asm/mwait.h:__monitor
   Cyclomatic Complexity 1 arch/x86/include/asm/mwait.h:__mwait
   Cyclomatic Complexity 5 arch/x86/include/asm/mwait.h:mwait_idle_with_hints
   Cyclomatic Complexity 4 drivers/idle/intel_idle.c:intel_idle
   Cyclomatic Complexity 1 drivers/idle/intel_idle.c:intel_idle_freeze
   Cyclomatic Complexity 2 drivers/idle/intel_idle.c:slp_s0_check_prepare
   Cyclomatic Complexity 2 drivers/idle/intel_idle.c:__setup_broadcast_timer
   Cyclomatic Complexity 1 drivers/idle/intel_idle.c:auto_demotion_disable
   Cyclomatic Complexity 1 drivers/idle/intel_idle.c:c1e_promotion_disable
   Cyclomatic Complexity 5 drivers/idle/intel_idle.c:ivt_idle_state_table_update
   Cyclomatic Complexity 2 drivers/idle/intel_idle.c:irtl_2_usec
   Cyclomatic Complexity 6 drivers/idle/intel_idle.c:bxt_idle_state_table_update
   Cyclomatic Complexity 6 drivers/idle/intel_idle.c:sklh_idle_state_table_update
   Cyclomatic Complexity 4 drivers/idle/intel_idle.c:intel_idle_state_table_update
   Cyclomatic Complexity 4 drivers/idle/intel_idle.c:intel_idle_cpu_init
   Cyclomatic Complexity 3 drivers/idle/intel_idle.c:intel_idle_cpu_online
   Cyclomatic Complexity 7 drivers/idle/intel_idle.c:intel_idle_probe
   Cyclomatic Complexity 3 drivers/idle/intel_idle.c:check_slp_s0
   Cyclomatic Complexity 10 drivers/idle/intel_idle.c:intel_idle_freeze_and_check
   Cyclomatic Complexity 10 drivers/idle/intel_idle.c:intel_idle_cpuidle_driver_init
   Cyclomatic Complexity 2 drivers/idle/intel_idle.c:intel_idle_cpuidle_devices_uninit
   Cyclomatic Complexity 9 drivers/idle/intel_idle.c:intel_idle_init
   cc1: some warnings being treated as errors

vim +/tick_set_freeze_event +1022 drivers/idle/intel_idle.c

  1016		spin_lock_irqsave(&slp_s0_check_lock, flags);
  1017		slp_s0_num_cpus++;
  1018		if (slp_s0_seconds &&
  1019		    slp_s0_num_cpus == num_online_cpus() &&
  1020		    !slp_s0_check_inprogress &&
  1021		    !intel_pmc_slp_s0_counter_read(&slp_s0_saved_count)) {
> 1022			ret = tick_set_freeze_event(cpu, ktime_set(slp_s0_seconds, 0));
  1023			if (ret < 0) {
  1024				spin_unlock_irqrestore(&slp_s0_check_lock, flags);
  1025				goto out;
  1026			}
  1027	
  1028			/*
  1029			 * Make sure check_slp_s0 isn't scheduled on another CPU if it
  1030			 * were to leave freeze and enter it again before this CPU
  1031			 * leaves freeze.
  1032			 */
  1033			slp_s0_check_inprogress = true;
  1034			check_on_this_cpu = true;
  1035		}
  1036		spin_unlock_irqrestore(&slp_s0_check_lock, flags);
  1037	
  1038		ret = intel_idle_freeze(dev, drv, index);
  1039		if (ret < 0)
  1040			goto out;
  1041	
> 1042		if (check_on_this_cpu && tick_clear_freeze_event(cpu))
  1043			ret = check_slp_s0(slp_s0_saved_count);
  1044	
  1045	out:

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Rafael J. Wysocki July 10, 2017, 1:33 p.m. UTC | #2
On Friday, July 07, 2017 05:03:03 PM Derek Basehore wrote:
> This adds validation of S0ix entry and enables it on Skylake. Using
> the new tick_set_freeze_event function, we program the CPU to wake up
> X seconds after entering freeze. After X seconds, it will wake the CPU
> to check the S0ix residency counters and make sure we entered the
> lowest power state for suspend-to-idle.
>
> It exits freeze and reports an error to userspace when the SoC does
> not enter S0ix on suspend-to-idle.
>
 
Honestly, I'm totally unsure about this ATM, as it seems to assume that it
doesn't make sense to stay suspended if SLP_S0 residency is not there, but
that totally may not be the case.

First of all, there are systems in which SLP_S0 is related to about 10%-20% of
the total power draw reduction whereas the remaining 80%-90% comes from PC10
alone.  So, if you can get to PC10, being unable to get SLP_S0 residency on top
of that may not be a big deal after all.  Of course, you may argue that 10%-20%
of battery life while suspended is "a lot", but that really depends on the
possible alternatives.

Second, as far as the alternatives go, it may not be rosy, because there are
systems that don't support S3 (or any other ACPI sleep states at all for that
matter) and suspend-to-idle is the only suspend mechanism available there.
On those systems it still may make sense to use it even though it may not
reduce the power draw that much.  And from some experiments, suspend-to-idle
still extends battery life by 100% over runtime idle even if the system is not
able to get to PC10 most of the time.

While I understand the use case, I don't think it is a binary "yes"-"no" thing
and the focus on just SLP_S0 may be misguided.

> One example of a bug that can prevent a Skylake CPU from entering S0ix
> (suspend-to-idle) is a leaked reference count to one of the i915 power

Suspend-to-idle is not S0ix.  Suspend-to-idle is the state the kernel puts the
system into after echoing "freeze" to /sys/power/state and S0ix is a platform
power state that may or may not be entered as a result of that.

> wells. The CPU will not be able to enter Package C10 and will
> therefore use about 4x as much power for the entire system. The issue
> is not specific to the i915 power wells though.

Well, fair enough, but what if the SoC can enter PC10, but without SLP_S0
residency on top of it?

> Signed-off-by: Derek Basehore <dbasehore@chromium.org>
> ---
>  drivers/idle/intel_idle.c | 142 +++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 134 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
> index ebed3f804291..d38621da6e54 100644
> --- a/drivers/idle/intel_idle.c
> +++ b/drivers/idle/intel_idle.c
> @@ -61,10 +61,12 @@
>  #include <linux/notifier.h>
>  #include <linux/cpu.h>
>  #include <linux/moduleparam.h>
> +#include <linux/suspend.h>
>  #include <asm/cpu_device_id.h>
>  #include <asm/intel-family.h>
>  #include <asm/mwait.h>
>  #include <asm/msr.h>
> +#include <asm/pmc_core.h>
>  
>  #define INTEL_IDLE_VERSION "0.4.1"
>  
> @@ -93,12 +95,29 @@ struct idle_cpu {
>  	bool disable_promotion_to_c1e;
>  };
>  
> +/*
> + * The limit for the exponential backoff for the freeze duration. At this point,
> + * power impact is far from measurable. It's about 3uW based on scaling from
> + * waking up 10 times a second.
> + */
> +#define MAX_SLP_S0_SECONDS 1000
> +#define SLP_S0_EXP_BASE 10
> +
> +static bool slp_s0_check;
> +static unsigned int slp_s0_seconds;
> +
> +static DEFINE_SPINLOCK(slp_s0_check_lock);
> +static unsigned int slp_s0_num_cpus;
> +static bool slp_s0_check_inprogress;
> +
>  static const struct idle_cpu *icpu;
>  static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;
>  static int intel_idle(struct cpuidle_device *dev,
>  			struct cpuidle_driver *drv, int index);
>  static int intel_idle_freeze(struct cpuidle_device *dev,
>  			     struct cpuidle_driver *drv, int index);
> +static int intel_idle_freeze_and_check(struct cpuidle_device *dev,
> +				       struct cpuidle_driver *drv, int index);
>  static struct cpuidle_state *cpuidle_state_table;
>  
>  /*
> @@ -597,7 +616,7 @@ static struct cpuidle_state skl_cstates[] = {
>  		.exit_latency = 2,
>  		.target_residency = 2,
>  		.enter = &intel_idle,
> -		.enter_freeze = intel_idle_freeze, },
> +		.enter_freeze = intel_idle_freeze_and_check, },

Why do you do this for anything lower than C6?

>  	{
>  		.name = "C1E",
>  		.desc = "MWAIT 0x01",
> @@ -605,7 +624,7 @@ static struct cpuidle_state skl_cstates[] = {
>  		.exit_latency = 10,
>  		.target_residency = 20,
>  		.enter = &intel_idle,
> -		.enter_freeze = intel_idle_freeze, },
> +		.enter_freeze = intel_idle_freeze_and_check, },
>  	{
>  		.name = "C3",
>  		.desc = "MWAIT 0x10",
> @@ -613,7 +632,7 @@ static struct cpuidle_state skl_cstates[] = {
>  		.exit_latency = 70,
>  		.target_residency = 100,
>  		.enter = &intel_idle,
> -		.enter_freeze = intel_idle_freeze, },
> +		.enter_freeze = intel_idle_freeze_and_check, },
>  	{
>  		.name = "C6",
>  		.desc = "MWAIT 0x20",
> @@ -621,7 +640,7 @@ static struct cpuidle_state skl_cstates[] = {
>  		.exit_latency = 85,
>  		.target_residency = 200,
>  		.enter = &intel_idle,
> -		.enter_freeze = intel_idle_freeze, },
> +		.enter_freeze = intel_idle_freeze_and_check, },
>  	{
>  		.name = "C7s",
>  		.desc = "MWAIT 0x33",
> @@ -629,7 +648,7 @@ static struct cpuidle_state skl_cstates[] = {
>  		.exit_latency = 124,
>  		.target_residency = 800,
>  		.enter = &intel_idle,
> -		.enter_freeze = intel_idle_freeze, },
> +		.enter_freeze = intel_idle_freeze_and_check, },
>  	{
>  		.name = "C8",
>  		.desc = "MWAIT 0x40",
> @@ -637,7 +656,7 @@ static struct cpuidle_state skl_cstates[] = {
>  		.exit_latency = 200,
>  		.target_residency = 800,
>  		.enter = &intel_idle,
> -		.enter_freeze = intel_idle_freeze, },
> +		.enter_freeze = intel_idle_freeze_and_check, },
>  	{
>  		.name = "C9",
>  		.desc = "MWAIT 0x50",
> @@ -645,7 +664,7 @@ static struct cpuidle_state skl_cstates[] = {
>  		.exit_latency = 480,
>  		.target_residency = 5000,
>  		.enter = &intel_idle,
> -		.enter_freeze = intel_idle_freeze, },
> +		.enter_freeze = intel_idle_freeze_and_check, },
>  	{
>  		.name = "C10",
>  		.desc = "MWAIT 0x60",
> @@ -653,7 +672,7 @@ static struct cpuidle_state skl_cstates[] = {
>  		.exit_latency = 890,
>  		.target_residency = 5000,
>  		.enter = &intel_idle,
> -		.enter_freeze = intel_idle_freeze, },
> +		.enter_freeze = intel_idle_freeze_and_check, },
>  	{
>  		.enter = NULL }
>  };
> @@ -940,6 +959,8 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev,
>   * @dev: cpuidle_device
>   * @drv: cpuidle driver
>   * @index: state index
> + *
> + * @return 0 for success, no failure state
>   */
>  static int intel_idle_freeze(struct cpuidle_device *dev,
>  			     struct cpuidle_driver *drv, int index)
> @@ -952,6 +973,101 @@ static int intel_idle_freeze(struct cpuidle_device *dev,
>  	return 0;
>  }
>  
> +static int check_slp_s0(u32 slp_s0_saved_count)
> +{
> +	u32 slp_s0_new_count;
> +
> +	if (intel_pmc_slp_s0_counter_read(&slp_s0_new_count)) {
> +		pr_warn("Unable to read SLP S0 residency counter\n");
> +		return -EIO;
> +	}
> +
> +	if (slp_s0_saved_count == slp_s0_new_count) {
> +		pr_warn("CPU did not enter SLP S0 for suspend-to-idle.\n");
> +		return -EIO;
> +	}
> +
> +	return 0;
> +}
> +
> +/**
> + * intel_idle_freeze_and_check - enters suspend-to-idle and validates the power
> + * state
> + *
> + * This function enters suspend-to-idle with intel_idle_freeze, but also sets up
> + * a timer to check that S0ix (low power state for suspend-to-idle on Intel
> + * CPUs) is properly entered.
> + *
> + * @dev: cpuidle_device
> + * @drv: cpuidle_driver
> + * @index: state index
> + * @return 0 for success, -EERROR if S0ix was not entered.
> + */
> +static int intel_idle_freeze_and_check(struct cpuidle_device *dev,
> +				       struct cpuidle_driver *drv, int index)
> +{
> +	bool check_on_this_cpu = false;
> +	u32 slp_s0_saved_count;
> +	unsigned long flags;
> +	int cpu = smp_processor_id();
> +	int ret;
> +
> +	/* The last CPU to freeze sets up checking SLP S0 assertion. */
> +	spin_lock_irqsave(&slp_s0_check_lock, flags);
> +	slp_s0_num_cpus++;
> +	if (slp_s0_seconds &&
> +	    slp_s0_num_cpus == num_online_cpus() &&
> +	    !slp_s0_check_inprogress &&
> +	    !intel_pmc_slp_s0_counter_read(&slp_s0_saved_count)) {
> +		ret = tick_set_freeze_event(cpu, ktime_set(slp_s0_seconds, 0));
> +		if (ret < 0) {
> +			spin_unlock_irqrestore(&slp_s0_check_lock, flags);
> +			goto out;
> +		}
> +
> +		/*
> +		 * Make sure check_slp_s0 isn't scheduled on another CPU if it
> +		 * were to leave freeze and enter it again before this CPU
> +		 * leaves freeze.
> +		 */
> +		slp_s0_check_inprogress = true;
> +		check_on_this_cpu = true;
> +	}
> +	spin_unlock_irqrestore(&slp_s0_check_lock, flags);
> +
> +	ret = intel_idle_freeze(dev, drv, index);
> +	if (ret < 0)
> +		goto out;
> +
> +	if (check_on_this_cpu && tick_clear_freeze_event(cpu))
> +		ret = check_slp_s0(slp_s0_saved_count);
> +
> +out:
> +	spin_lock_irqsave(&slp_s0_check_lock, flags);
> +	if (check_on_this_cpu) {
> +		slp_s0_check_inprogress = false;
> +		slp_s0_seconds = min_t(unsigned int,
> +				       SLP_S0_EXP_BASE * slp_s0_seconds,
> +				       MAX_SLP_S0_SECONDS);
> +	}
> +	slp_s0_num_cpus--;
> +	spin_unlock_irqrestore(&slp_s0_check_lock, flags);
> +	return ret;
> +}
> +
> +static int slp_s0_check_prepare(struct notifier_block *nb, unsigned long action,
> +				void *data)
> +{
> +	if (action == PM_SUSPEND_PREPARE)
> +		slp_s0_seconds = slp_s0_check ? 1 : 0;
> +
> +	return NOTIFY_DONE;
> +}
> +
> +static struct notifier_block intel_slp_s0_check_nb = {
> +	.notifier_call = slp_s0_check_prepare,
> +};
> +
>  static void __setup_broadcast_timer(bool on)
>  {
>  	if (on)
> @@ -1454,6 +1570,13 @@ static int __init intel_idle_init(void)
>  		goto init_driver_fail;
>  	}
>  
> +	retval = register_pm_notifier(&intel_slp_s0_check_nb);
> +	if (retval) {
> +		free_percpu(intel_idle_cpuidle_devices);
> +		cpuidle_unregister_driver(&intel_idle_driver);
> +		goto pm_nb_fail;
> +	}
> +
>  	if (boot_cpu_has(X86_FEATURE_ARAT))	/* Always Reliable APIC Timer */
>  		lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE;
>  
> @@ -1469,6 +1592,8 @@ static int __init intel_idle_init(void)
>  
>  hp_setup_fail:
>  	intel_idle_cpuidle_devices_uninit();
> +	unregister_pm_notifier(&intel_slp_s0_check_nb);
> +pm_nb_fail:
>  	cpuidle_unregister_driver(&intel_idle_driver);
>  init_driver_fail:
>  	free_percpu(intel_idle_cpuidle_devices);
> @@ -1484,3 +1609,4 @@ device_initcall(intel_idle_init);
>   * is the easiest way (currently) to continue doing that.
>   */
>  module_param(max_cstate, int, 0444);
> +module_param(slp_s0_check, bool, 0644);

This has to be documented somehow.

Also, if it is not set, there is a useless overhead every time
intel_idle_freeze_and_check() is called.  It looks like you could use
a static key or similar to avoid that.

Moreover, the notifier is not necessary then as well.

Thanks,
Rafael
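The static-key guard suggested above might look roughly like the following sketch (the key name and helpers are illustrative, not part of the patch):

#include <linux/jump_label.h>

static DEFINE_STATIC_KEY_FALSE(slp_s0_check_enabled);

/* Could be flipped from the module parameter's set() callback. */
static void slp_s0_check_set(bool on)
{
	if (on)
		static_branch_enable(&slp_s0_check_enabled);
	else
		static_branch_disable(&slp_s0_check_enabled);
}

static int intel_idle_freeze_and_check(struct cpuidle_device *dev,
				       struct cpuidle_driver *drv, int index)
{
	/* Near-zero cost when the check is disabled. */
	if (!static_branch_unlikely(&slp_s0_check_enabled))
		return intel_idle_freeze(dev, drv, index);

	/* ... SLP_S0 bookkeeping and validation as in the patch ... */
	return intel_idle_freeze(dev, drv, index);
}
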
Derek Basehore July 10, 2017, 9:57 p.m. UTC | #3
On Mon, Jul 10, 2017 at 6:33 AM, Rafael J. Wysocki <rjw@rjwysocki.net> wrote:
> On Friday, July 07, 2017 05:03:03 PM Derek Basehore wrote:
>> This adds validation of S0ix entry and enables it on Skylake. Using
>> the new tick_set_freeze_event function, we program the CPU to wake up
>> X seconds after entering freeze. After X seconds, it will wake the CPU
>> to check the S0ix residency counters and make sure we entered the
>> lowest power state for suspend-to-idle.
>>
>> It exits freeze and reports an error to userspace when the SoC does
>> not enter S0ix on suspend-to-idle.
>>
>
> Honestly, I'm totally unsure about this ATM, as it seems to assume that it
> doesn't make sense to stay suspended if SLP_S0 residency is not there, but
> that totally may not be the case.
>
> First of all, there are systems in which SLP_S0 is related to about 10%-20% of
> the total power draw reduction whereas the remaining 80%-90% comes from PC10
> alone.  So, if you can get to PC10, being unable to get SLP_S0 residency on top
> of that may not be a big deal after all.  Of course, you may argue that 10%-20%
> of battery life while suspended is "a lot", but that really depends on the
> possible alternatives.
>

We'd have to track actual PC10 residency instead of checking if it's
the requested state since the SoC can enter a higher power package
cstate even if PC10 is requested. I think this can be done by reading
an msr register, though. Is there an example of how PC10 can be
entered without SLP_S0 getting asserted by the way?

Also, this feature is disabled by default, so it doesn't prevent these
use cases.

> Second, as far as the alternatives go, it may not be rosy, because there are
> systems that don't support S3 (or any other ACPI sleep states at all for that
> matter) and suspend-to-idle is the only suspend mechanism available there.
> On those systems it still may make sense to use it even though it may not
> reduce the power draw that much.  And from some experiments, suspend-to-idle
> still extends battery life by 100% over runtime idle even if the system is not
> able to get to PC10 most of the time.

This is off by default.

>
> While I understand the use case, I don't think it is a binary "yes"-"no" thing
> and the focus on just SLP_S0 may be misguided.

Do you have a preference such as being able to set the level that you
want to validate to? For instance, there could be an option to check
that SLP_S0 is asserted, but there could also be an option to check
for PC9 or PC10 residency. For instance, there could be module
parameters for setting the validated state:

available_suspend_to_idle_states:
"none pc6 pc9 pc10 slp_s0"

max_suspend_to_idle_state:
"none"

Where the default validated state is none, but it can be set to any of
the states in available_suspend_to_idle_states
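
Purely as an illustration of that proposal, the settable knob could be exposed as a module parameter along these lines (the name follows the proposal above; none of this is from the patch):

#include <linux/moduleparam.h>

/* Illustrative sketch only; accepted values: none pc6 pc9 pc10 slp_s0. */
static char max_s2idle_state[8] = "none";
module_param_string(max_suspend_to_idle_state, max_s2idle_state,
		    sizeof(max_s2idle_state), 0644);

The read-only available_suspend_to_idle_states list would more naturally be a sysfs attribute or simply documented as the set of accepted values.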

>
>> One example of a bug that can prevent a Skylake CPU from entering S0ix
>> (suspend-to-idle) is a leaked reference count to one of the i915 power
>
> Suspend-to-idle is not S0ix.  Suspend-to-idle is the state the kernel puts the
> system into after echoing "freeze" to /sys/power/state and S0ix is a platform
> power state that may or may not be entered as a result of that.
>
>> wells. The CPU will not be able to enter Package C10 and will
>> therefore use about 4x as much power for the entire system. The issue
>> is not specific to the i915 power wells though.
>
> Well, fair enough, but what if the SoC can enter PC10, but without SLP_S0
> residency on top of it?
>
>> Signed-off-by: Derek Basehore <dbasehore@chromium.org>
>> ---
>>  drivers/idle/intel_idle.c | 142 +++++++++++++++++++++++++++++++++++++++++++---
>>  1 file changed, 134 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
>> index ebed3f804291..d38621da6e54 100644
>> --- a/drivers/idle/intel_idle.c
>> +++ b/drivers/idle/intel_idle.c
>> @@ -61,10 +61,12 @@
>>  #include <linux/notifier.h>
>>  #include <linux/cpu.h>
>>  #include <linux/moduleparam.h>
>> +#include <linux/suspend.h>
>>  #include <asm/cpu_device_id.h>
>>  #include <asm/intel-family.h>
>>  #include <asm/mwait.h>
>>  #include <asm/msr.h>
>> +#include <asm/pmc_core.h>
>>
>>  #define INTEL_IDLE_VERSION "0.4.1"
>>
>> @@ -93,12 +95,29 @@ struct idle_cpu {
>>       bool disable_promotion_to_c1e;
>>  };
>>
>> +/*
>> + * The limit for the exponential backoff for the freeze duration. At this point,
>> + * power impact is far from measurable. It's about 3uW based on scaling from
>> + * waking up 10 times a second.
>> + */
>> +#define MAX_SLP_S0_SECONDS 1000
>> +#define SLP_S0_EXP_BASE 10
>> +
>> +static bool slp_s0_check;
>> +static unsigned int slp_s0_seconds;
>> +
>> +static DEFINE_SPINLOCK(slp_s0_check_lock);
>> +static unsigned int slp_s0_num_cpus;
>> +static bool slp_s0_check_inprogress;
>> +
>>  static const struct idle_cpu *icpu;
>>  static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;
>>  static int intel_idle(struct cpuidle_device *dev,
>>                       struct cpuidle_driver *drv, int index);
>>  static int intel_idle_freeze(struct cpuidle_device *dev,
>>                            struct cpuidle_driver *drv, int index);
>> +static int intel_idle_freeze_and_check(struct cpuidle_device *dev,
>> +                                    struct cpuidle_driver *drv, int index);
>>  static struct cpuidle_state *cpuidle_state_table;
>>
>>  /*
>> @@ -597,7 +616,7 @@ static struct cpuidle_state skl_cstates[] = {
>>               .exit_latency = 2,
>>               .target_residency = 2,
>>               .enter = &intel_idle,
>> -             .enter_freeze = intel_idle_freeze, },
>> +             .enter_freeze = intel_idle_freeze_and_check, },
>
> Why do you do this for anything lower than C6?

In that case, it's probably best to fail in these cases if the check
is enabled. The CPU can't enter a lower cstate than the hinted one,
correct?

>
>>       {
>>               .name = "C1E",
>>               .desc = "MWAIT 0x01",
>> @@ -605,7 +624,7 @@ static struct cpuidle_state skl_cstates[] = {
>>               .exit_latency = 10,
>>               .target_residency = 20,
>>               .enter = &intel_idle,
>> -             .enter_freeze = intel_idle_freeze, },
>> +             .enter_freeze = intel_idle_freeze_and_check, },
>>       {
>>               .name = "C3",
>>               .desc = "MWAIT 0x10",
>> @@ -613,7 +632,7 @@ static struct cpuidle_state skl_cstates[] = {
>>               .exit_latency = 70,
>>               .target_residency = 100,
>>               .enter = &intel_idle,
>> -             .enter_freeze = intel_idle_freeze, },
>> +             .enter_freeze = intel_idle_freeze_and_check, },
>>       {
>>               .name = "C6",
>>               .desc = "MWAIT 0x20",
>> @@ -621,7 +640,7 @@ static struct cpuidle_state skl_cstates[] = {
>>               .exit_latency = 85,
>>               .target_residency = 200,
>>               .enter = &intel_idle,
>> -             .enter_freeze = intel_idle_freeze, },
>> +             .enter_freeze = intel_idle_freeze_and_check, },
>>       {
>>               .name = "C7s",
>>               .desc = "MWAIT 0x33",
>> @@ -629,7 +648,7 @@ static struct cpuidle_state skl_cstates[] = {
>>               .exit_latency = 124,
>>               .target_residency = 800,
>>               .enter = &intel_idle,
>> -             .enter_freeze = intel_idle_freeze, },
>> +             .enter_freeze = intel_idle_freeze_and_check, },
>>       {
>>               .name = "C8",
>>               .desc = "MWAIT 0x40",
>> @@ -637,7 +656,7 @@ static struct cpuidle_state skl_cstates[] = {
>>               .exit_latency = 200,
>>               .target_residency = 800,
>>               .enter = &intel_idle,
>> -             .enter_freeze = intel_idle_freeze, },
>> +             .enter_freeze = intel_idle_freeze_and_check, },
>>       {
>>               .name = "C9",
>>               .desc = "MWAIT 0x50",
>> @@ -645,7 +664,7 @@ static struct cpuidle_state skl_cstates[] = {
>>               .exit_latency = 480,
>>               .target_residency = 5000,
>>               .enter = &intel_idle,
>> -             .enter_freeze = intel_idle_freeze, },
>> +             .enter_freeze = intel_idle_freeze_and_check, },
>>       {
>>               .name = "C10",
>>               .desc = "MWAIT 0x60",
>> @@ -653,7 +672,7 @@ static struct cpuidle_state skl_cstates[] = {
>>               .exit_latency = 890,
>>               .target_residency = 5000,
>>               .enter = &intel_idle,
>> -             .enter_freeze = intel_idle_freeze, },
>> +             .enter_freeze = intel_idle_freeze_and_check, },
>>       {
>>               .enter = NULL }
>>  };
>> @@ -940,6 +959,8 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev,
>>   * @dev: cpuidle_device
>>   * @drv: cpuidle driver
>>   * @index: state index
>> + *
>> + * @return 0 for success, no failure state
>>   */
>>  static int intel_idle_freeze(struct cpuidle_device *dev,
>>                            struct cpuidle_driver *drv, int index)
>> @@ -952,6 +973,101 @@ static int intel_idle_freeze(struct cpuidle_device *dev,
>>       return 0;
>>  }
>>
>> +static int check_slp_s0(u32 slp_s0_saved_count)
>> +{
>> +     u32 slp_s0_new_count;
>> +
>> +     if (intel_pmc_slp_s0_counter_read(&slp_s0_new_count)) {
>> +             pr_warn("Unable to read SLP S0 residency counter\n");
>> +             return -EIO;
>> +     }
>> +
>> +     if (slp_s0_saved_count == slp_s0_new_count) {
>> +             pr_warn("CPU did not enter SLP S0 for suspend-to-idle.\n");
>> +             return -EIO;
>> +     }
>> +
>> +     return 0;
>> +}
>> +
>> +/**
>> + * intel_idle_freeze_and_check - enters suspend-to-idle and validates the power
>> + * state
>> + *
>> + * This function enters suspend-to-idle with intel_idle_freeze, but also sets up
>> + * a timer to check that S0ix (low power state for suspend-to-idle on Intel
>> + * CPUs) is properly entered.
>> + *
>> + * @dev: cpuidle_device
>> + * @drv: cpuidle_driver
>> + * @index: state index
>> + * @return 0 for success, -EERROR if S0ix was not entered.
>> + */
>> +static int intel_idle_freeze_and_check(struct cpuidle_device *dev,
>> +                                    struct cpuidle_driver *drv, int index)
>> +{
>> +     bool check_on_this_cpu = false;
>> +     u32 slp_s0_saved_count;
>> +     unsigned long flags;
>> +     int cpu = smp_processor_id();
>> +     int ret;
>> +
>> +     /* The last CPU to freeze sets up checking SLP S0 assertion. */
>> +     spin_lock_irqsave(&slp_s0_check_lock, flags);
>> +     slp_s0_num_cpus++;
>> +     if (slp_s0_seconds &&
>> +         slp_s0_num_cpus == num_online_cpus() &&
>> +         !slp_s0_check_inprogress &&
>> +         !intel_pmc_slp_s0_counter_read(&slp_s0_saved_count)) {
>> +             ret = tick_set_freeze_event(cpu, ktime_set(slp_s0_seconds, 0));
>> +             if (ret < 0) {
>> +                     spin_unlock_irqrestore(&slp_s0_check_lock, flags);
>> +                     goto out;
>> +             }
>> +
>> +             /*
>> +              * Make sure check_slp_s0 isn't scheduled on another CPU if it
>> +              * were to leave freeze and enter it again before this CPU
>> +              * leaves freeze.
>> +              */
>> +             slp_s0_check_inprogress = true;
>> +             check_on_this_cpu = true;
>> +     }
>> +     spin_unlock_irqrestore(&slp_s0_check_lock, flags);
>> +
>> +     ret = intel_idle_freeze(dev, drv, index);
>> +     if (ret < 0)
>> +             goto out;
>> +
>> +     if (check_on_this_cpu && tick_clear_freeze_event(cpu))
>> +             ret = check_slp_s0(slp_s0_saved_count);
>> +
>> +out:
>> +     spin_lock_irqsave(&slp_s0_check_lock, flags);
>> +     if (check_on_this_cpu) {
>> +             slp_s0_check_inprogress = false;
>> +             slp_s0_seconds = min_t(unsigned int,
>> +                                    SLP_S0_EXP_BASE * slp_s0_seconds,
>> +                                    MAX_SLP_S0_SECONDS);
>> +     }
>> +     slp_s0_num_cpus--;
>> +     spin_unlock_irqrestore(&slp_s0_check_lock, flags);
>> +     return ret;
>> +}
>> +
>> +static int slp_s0_check_prepare(struct notifier_block *nb, unsigned long action,
>> +                             void *data)
>> +{
>> +     if (action == PM_SUSPEND_PREPARE)
>> +             slp_s0_seconds = slp_s0_check ? 1 : 0;
>> +
>> +     return NOTIFY_DONE;
>> +}
>> +
>> +static struct notifier_block intel_slp_s0_check_nb = {
>> +     .notifier_call = slp_s0_check_prepare,
>> +};
>> +
>>  static void __setup_broadcast_timer(bool on)
>>  {
>>       if (on)
>> @@ -1454,6 +1570,13 @@ static int __init intel_idle_init(void)
>>               goto init_driver_fail;
>>       }
>>
>> +     retval = register_pm_notifier(&intel_slp_s0_check_nb);
>> +     if (retval) {
>> +             free_percpu(intel_idle_cpuidle_devices);
>> +             cpuidle_unregister_driver(&intel_idle_driver);
>> +             goto pm_nb_fail;
>> +     }
>> +
>>       if (boot_cpu_has(X86_FEATURE_ARAT))     /* Always Reliable APIC Timer */
>>               lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE;
>>
>> @@ -1469,6 +1592,8 @@ static int __init intel_idle_init(void)
>>
>>  hp_setup_fail:
>>       intel_idle_cpuidle_devices_uninit();
>> +     unregister_pm_notifier(&intel_slp_s0_check_nb);
>> +pm_nb_fail:
>>       cpuidle_unregister_driver(&intel_idle_driver);
>>  init_driver_fail:
>>       free_percpu(intel_idle_cpuidle_devices);
>> @@ -1484,3 +1609,4 @@ device_initcall(intel_idle_init);
>>   * is the easiest way (currently) to continue doing that.
>>   */
>>  module_param(max_cstate, int, 0444);
>> +module_param(slp_s0_check, bool, 0644);
>
> This has to be documented somehow.
>
> Also, if it is not set, there is a useless overhead every time
> intel_idle_freeze_and_check() is called.  It looks like you could use
> a static key or similar to avoid that.
>
> Moreover, the notifier is not necessary then as well.
>
> Thanks,
> Rafael
>
Rafael J. Wysocki July 10, 2017, 10:09 p.m. UTC | #4
On Monday, July 10, 2017 02:57:48 PM dbasehore . wrote:
> On Mon, Jul 10, 2017 at 6:33 AM, Rafael J. Wysocki <rjw@rjwysocki.net> wrote:
> > On Friday, July 07, 2017 05:03:03 PM Derek Basehore wrote:
> >> This adds validation of S0ix entry and enables it on Skylake. Using
> >> the new tick_set_freeze_event function, we program the CPU to wake up
> >> X seconds after entering freeze. After X seconds, it will wake the CPU
> >> to check the S0ix residency counters and make sure we entered the
> >> lowest power state for suspend-to-idle.
> >>
> >> It exits freeze and reports an error to userspace when the SoC does
> >> not enter S0ix on suspend-to-idle.
> >>
> >
> > Honestly, I'm totally unsure about this ATM, as it seems to assume that it
> > doesn't make sense to stay suspended if SLP_S0 residency is not there, but
> > that totally may not be the case.
> >
> > First of all, there are systems in which SLP_S0 is related to about 10%-20% of
> > the total power draw reduction whereas the remaining 80%-90% comes from PC10
> > alone.  So, if you can get to PC10, being unable to get SLP_S0 residency on top
> > of that may not be a big deal after all.  Of course, you may argue that 10%-20%
> > of battery life while suspended is "a lot", but that really depends on the
> > possible alternatives.
> >
> 
> We'd have to track actual PC10 residency instead of checking if it's
> the requested state since the SoC can enter a higher power package
> cstate even if PC10 is requested.

That's correct, but it should be sufficient to check the PC10 residency
(there's some code to do that in turbostat, for example).
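
A rough sketch of such a PC10 residency check, mirroring the SLP_S0 counter comparison in the patch (MSR_PKG_C10_RESIDENCY is the counter turbostat reads; the helper names here are made up):

#include <linux/errno.h>
#include <asm/msr.h>
#include <asm/msr-index.h>

/* Snapshot the package C10 residency counter; returns 0 on success. */
static int read_pc10_residency(u64 *val)
{
	return rdmsrl_safe(MSR_PKG_C10_RESIDENCY, val);
}

/* Compare against a snapshot taken before entering freeze. */
static int check_pc10(u64 saved_count)
{
	u64 new_count;

	if (read_pc10_residency(&new_count))
		return -EIO;

	/* An unchanged counter means the package never reached PC10. */
	return new_count == saved_count ? -EIO : 0;
}
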

> I think this can be done by reading
> an msr register, though. Is there an example of how PC10 can be
> entered without SLP_S0 getting asserted by the way?

Yes, there is.

PC10 is a power state of the north complex and it can be entered regardless
of the SLP_S0 status which is related to the south complex.

> Also, this feature is disabled by default, so it doesn't prevent these
> use cases.
> 
> > Second, as far as the alternatives go, it may not be rosy, because there are
> > systems that don't support S3 (or any other ACPI sleep states at all for that
> > matter) and suspend-to-idle is the only suspend mechanism available there.
> > On those systems it still may make sense to use it even though it may not
> > reduce the power draw that much.  And from some experiments, suspend-to-idle
> > still extends battery life by 100% over runtime idle even if the system is not
> > able to get to PC10 most of the time.
> 
> This is off by default.

Fair enough, but even so it may not be very useful in general as is.

> >
> > While I understand the use case, I don't think it is a binary "yes"-"no" thing
> > and the focus on just SLP_S0 may be misguided.
> 
> Do you have a preference such as being able to set the level that you
> want to validate to? For instance, there could be an option to check
> that SLP_S0 is asserted, but there could also be an option to check
> for PC9 or PC10 residency. For instance, there could be module
> parameters for setting the validated state:
> 
> available_suspend_to_idle_states:
> "none pc6 pc9 pc10 slp_s0"
> 
> max_suspend_to_idle_state:
> "none"
> 
> Where the default validated state is none, but it can be set to any of
> the states in available_suspend_to_idle_states

In the suspend-to-idle path the driver will always request the deepest state
available (C10 for Skylake) and I would validate the associated package state
by default plus optionally SLP_S0.

> >
> >> One example of a bug that can prevent a Skylake CPU from entering S0ix
> >> (suspend-to-idle) is a leaked reference count to one of the i915 power
> >
> > Suspend-to-idle is not S0ix.  Suspend-to-idle is the state the kernel puts the
> > system into after echoing "freeze" to /sys/power/state and S0ix is a platform
> > power state that may or may not be entered as a result of that.
> >
> >> wells. The CPU will not be able to enter Package C10 and will
> >> therefore use about 4x as much power for the entire system. The issue
> >> is not specific to the i915 power wells though.
> >
> > Well, fair enough, but what if the SoC can enter PC10, but without SLP_S0
> > residency on top of it?
> >
> >> Signed-off-by: Derek Basehore <dbasehore@chromium.org>
> >> ---
> >>  drivers/idle/intel_idle.c | 142 +++++++++++++++++++++++++++++++++++++++++++---
> >>  1 file changed, 134 insertions(+), 8 deletions(-)
> >>
> >> diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
> >> index ebed3f804291..d38621da6e54 100644
> >> --- a/drivers/idle/intel_idle.c
> >> +++ b/drivers/idle/intel_idle.c
> >> @@ -61,10 +61,12 @@
> >>  #include <linux/notifier.h>
> >>  #include <linux/cpu.h>
> >>  #include <linux/moduleparam.h>
> >> +#include <linux/suspend.h>
> >>  #include <asm/cpu_device_id.h>
> >>  #include <asm/intel-family.h>
> >>  #include <asm/mwait.h>
> >>  #include <asm/msr.h>
> >> +#include <asm/pmc_core.h>
> >>
> >>  #define INTEL_IDLE_VERSION "0.4.1"
> >>
> >> @@ -93,12 +95,29 @@ struct idle_cpu {
> >>       bool disable_promotion_to_c1e;
> >>  };
> >>
> >> +/*
> >> + * The limit for the exponential backoff for the freeze duration. At this point,
> >> + * power impact is far from measurable. It's about 3uW based on scaling from
> >> + * waking up 10 times a second.
> >> + */
> >> +#define MAX_SLP_S0_SECONDS 1000
> >> +#define SLP_S0_EXP_BASE 10
> >> +
> >> +static bool slp_s0_check;
> >> +static unsigned int slp_s0_seconds;
> >> +
> >> +static DEFINE_SPINLOCK(slp_s0_check_lock);
> >> +static unsigned int slp_s0_num_cpus;
> >> +static bool slp_s0_check_inprogress;
> >> +
> >>  static const struct idle_cpu *icpu;
> >>  static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;
> >>  static int intel_idle(struct cpuidle_device *dev,
> >>                       struct cpuidle_driver *drv, int index);
> >>  static int intel_idle_freeze(struct cpuidle_device *dev,
> >>                            struct cpuidle_driver *drv, int index);
> >> +static int intel_idle_freeze_and_check(struct cpuidle_device *dev,
> >> +                                    struct cpuidle_driver *drv, int index);
> >>  static struct cpuidle_state *cpuidle_state_table;
> >>
> >>  /*
> >> @@ -597,7 +616,7 @@ static struct cpuidle_state skl_cstates[] = {
> >>               .exit_latency = 2,
> >>               .target_residency = 2,
> >>               .enter = &intel_idle,
> >> -             .enter_freeze = intel_idle_freeze, },
> >> +             .enter_freeze = intel_idle_freeze_and_check, },
> >
> > Why do you do this for anything lower than C6?
> 
> In that case, it's probably best to fail in these cases if the check
> is enabled. The CPU can't enter a lower cstate than the hinted one,
> correct?

Yes, it can, but this is based on the hint and not on the entered state. :-)

There is some gray area related to what if the user disabled the deepest state
via sysfs, but other than that the check only needs to be made in the deepest
state's callback (because that's what will be requested in the suspend-to-idle
path).
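
In code terms that would mean leaving the shallower Skylake states on intel_idle_freeze and pointing only the deepest entry at the checking variant, e.g. for the C10 state quoted in the patch above (fields not shown in the quoted hunk are omitted here):

	{
		.name = "C10",
		.desc = "MWAIT 0x60",
		.exit_latency = 890,
		.target_residency = 5000,
		.enter = &intel_idle,
		.enter_freeze = intel_idle_freeze_and_check, },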

Thanks,
Rafael
Derek Basehore July 10, 2017, 10:24 p.m. UTC | #5
On Mon, Jul 10, 2017 at 3:09 PM, Rafael J. Wysocki <rjw@rjwysocki.net> wrote:
> On Monday, July 10, 2017 02:57:48 PM dbasehore . wrote:
>> On Mon, Jul 10, 2017 at 6:33 AM, Rafael J. Wysocki <rjw@rjwysocki.net> wrote:
>> > On Friday, July 07, 2017 05:03:03 PM Derek Basehore wrote:
>> >> This adds validation of S0ix entry and enables it on Skylake. Using
>> >> the new tick_set_freeze_event function, we program the CPU to wake up
>> >> X seconds after entering freeze. After X seconds, it will wake the CPU
>> >> to check the S0ix residency counters and make sure we entered the
>> >> lowest power state for suspend-to-idle.
>> >>
>> >> It exits freeze and reports an error to userspace when the SoC does
>> >> not enter S0ix on suspend-to-idle.
>> >>
>> >
>> > Honestly, I'm totally unsure about this ATM, as it seems to assume that it
>> > doesn't make sense to stay suspended if SLP_S0 residency is not there, but
>> > that totally may not be the case.
>> >
>> > First of all, there are systems in which SLP_S0 is related to about 10%-20% of
>> > the total power draw reduction whereas the remaining 80%-90% comes from PC10
>> > alone.  So, if you can get to PC10, being unable to get SLP_S0 residency on top
>> > of that may not be a big deal after all.  Of course, you may argue that 10%-20%
>> > of battery life while suspended is "a lot", but that really depends on the
>> > possible alternatives.
>> >
>>
>> We'd have to track actual PC10 residency instead of checking if it's
>> the requested state since the SoC can enter a higher power package
>> cstate even if PC10 is requested.
>
> That's correct, but it should be sufficient to check the PC10 residency
> (there's some code to do that in turbostat, for example).
>
>> I think this can be done by reading
>> an msr register, though. Is there an example of how PC10 can be
>> entered without SLP_S0 getting asserted by the way?
>
> Yes, there is.
>
> PC10 is a power state of the north complex and it can be entered regardless
> of the SLP_S0 status which is related to the south complex.
>
>> Also, this feature is disabled by default, so it doesn't prevent these
>> use cases.
>>
>> > Second, as far as the alternatives go, it may not be rosy, because there are
>> > systems that don't support S3 (or any other ACPI sleep states at all for that
>> > matter) and suspend-to-idle is the only suspend mechanism available there.
>> > On those systems it still may make sense to use it even though it may not
>> > reduce the power draw that much.  And from some experiments, suspend-to-idle
>> > still extends battery life by 100% over runtime idle even if the system is not
>> > able to get to PC10 most of the time.
>>
>> This is off by default.
>
> Fair enough, but even so it may not be very useful in general as is.
>
>> >
>> > While I understand the use case, I don't think it is a binary "yes"-"no" thing
>> > and the focus on just SLP_S0 may be misguided.
>>
>> Do you have a preference such as being able to set the level that you
>> want to validate to? For instance, there could be an option to check
>> that SLP_S0 is asserted, but there could also be an option to check
>> for PC9 or PC10 residency. For instance, there could be module
>> parameters for setting the validated state:
>>
>> available_suspend_to_idle_states:
>> "none pc6 pc9 pc10 slp_s0"
>>
>> max_suspend_to_idle_state:
>> "none"
>>
>> Where the default validated state is none, but it can be set to any of
>> the states in available_suspend_to_idle_states
>
> In the suspend-to-idle path the driver will always request the deepest state
> available (C10 for Skylake) and I would validate the associated package state
> by default plus optionally SLP_S0.

Should package state validation be enabled by default and should the
user be able to disable it?

>
>> >
>> >> One example of a bug that can prevent a Skylake CPU from entering S0ix
>> >> (suspend-to-idle) is a leaked reference count to one of the i915 power
>> >
>> > Suspend-to-idle is not S0ix.  Suspend-to-idle is the state the kernel puts the
>> > system into after echoing "freeze" to /sys/power/state and S0ix is a platform
>> > power state that may or may not be entered as a result of that.
>> >
>> >> wells. The CPU will not be able to enter Package C10 and will
>> >> therefore use about 4x as much power for the entire system. The issue
>> >> is not specific to the i915 power wells though.
>> >
>> > Well, fair enough, but what if the SoC can enter PC10, but without SLP_S0
>> > residency on top of it?
>> >
>> >> Signed-off-by: Derek Basehore <dbasehore@chromium.org>
>> >> ---
>> >>  drivers/idle/intel_idle.c | 142 +++++++++++++++++++++++++++++++++++++++++++---
>> >>  1 file changed, 134 insertions(+), 8 deletions(-)
>> >>
>> >> diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
>> >> index ebed3f804291..d38621da6e54 100644
>> >> --- a/drivers/idle/intel_idle.c
>> >> +++ b/drivers/idle/intel_idle.c
>> >> @@ -61,10 +61,12 @@
>> >>  #include <linux/notifier.h>
>> >>  #include <linux/cpu.h>
>> >>  #include <linux/moduleparam.h>
>> >> +#include <linux/suspend.h>
>> >>  #include <asm/cpu_device_id.h>
>> >>  #include <asm/intel-family.h>
>> >>  #include <asm/mwait.h>
>> >>  #include <asm/msr.h>
>> >> +#include <asm/pmc_core.h>
>> >>
>> >>  #define INTEL_IDLE_VERSION "0.4.1"
>> >>
>> >> @@ -93,12 +95,29 @@ struct idle_cpu {
>> >>       bool disable_promotion_to_c1e;
>> >>  };
>> >>
>> >> +/*
>> >> + * The limit for the exponential backoff for the freeze duration. At this point,
>> >> + * power impact is far from measurable. It's about 3uW based on scaling from
>> >> + * waking up 10 times a second.
>> >> + */
>> >> +#define MAX_SLP_S0_SECONDS 1000
>> >> +#define SLP_S0_EXP_BASE 10
>> >> +
>> >> +static bool slp_s0_check;
>> >> +static unsigned int slp_s0_seconds;
>> >> +
>> >> +static DEFINE_SPINLOCK(slp_s0_check_lock);
>> >> +static unsigned int slp_s0_num_cpus;
>> >> +static bool slp_s0_check_inprogress;
>> >> +
>> >>  static const struct idle_cpu *icpu;
>> >>  static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;
>> >>  static int intel_idle(struct cpuidle_device *dev,
>> >>                       struct cpuidle_driver *drv, int index);
>> >>  static int intel_idle_freeze(struct cpuidle_device *dev,
>> >>                            struct cpuidle_driver *drv, int index);
>> >> +static int intel_idle_freeze_and_check(struct cpuidle_device *dev,
>> >> +                                    struct cpuidle_driver *drv, int index);
>> >>  static struct cpuidle_state *cpuidle_state_table;
>> >>
>> >>  /*
>> >> @@ -597,7 +616,7 @@ static struct cpuidle_state skl_cstates[] = {
>> >>               .exit_latency = 2,
>> >>               .target_residency = 2,
>> >>               .enter = &intel_idle,
>> >> -             .enter_freeze = intel_idle_freeze, },
>> >> +             .enter_freeze = intel_idle_freeze_and_check, },
>> >
>> > Why do you do this for anything lower than C6?
>>
>> In that case, it's probably best to fail in these cases if the check
>> is enabled. The CPU can't enter a lower cstate than the hinted one,
>> correct?
>
> Yes, it can, but this is based on the hint and not on the entered state. :-)
>
> There is some gray area related to what if the user disabled the deepest state
> via sysfs, but other than that the check only needs to be made in the deepest
> state's callback (because that's what will be requested in the suspend-to-idle
> path).
>
> Thanks,
> Rafael
>
Rafael J. Wysocki July 11, 2017, 2:57 p.m. UTC | #6
On Monday, July 10, 2017 03:24:14 PM dbasehore . wrote:
> On Mon, Jul 10, 2017 at 3:09 PM, Rafael J. Wysocki <rjw@rjwysocki.net> wrote:
> > On Monday, July 10, 2017 02:57:48 PM dbasehore . wrote:
> >> On Mon, Jul 10, 2017 at 6:33 AM, Rafael J. Wysocki <rjw@rjwysocki.net> wrote:
> >> > On Friday, July 07, 2017 05:03:03 PM Derek Basehore wrote:
> >> >> This adds validation of S0ix entry and enables it on Skylake. Using
> >> >> the new tick_set_freeze_event function, we program the CPU to wake up
> >> >> X seconds after entering freeze. After X seconds, it will wake the CPU
> >> >> to check the S0ix residency counters and make sure we entered the
> >> >> lowest power state for suspend-to-idle.
> >> >>
> >> >> It exits freeze and reports an error to userspace when the SoC does
> >> >> not enter S0ix on suspend-to-idle.
> >> >>
> >> >
> >> > Honestly, I'm totally unsure about this ATM, as it seems to assume that it
> >> > doesn't make sense to stay suspended if SLP_S0 residency is not there, but
> >> > that totally may not be the case.
> >> >
> >> > First of all, there are systems in which SLP_S0 is related to about 10%-20% of
> >> > the total power draw reduction whereas the remaining 80%-90% comes from PC10
> >> > alone.  So, if you can get to PC10, being unable to get SLP_S0 residency on top
> >> > of that may not be a big deal after all.  Of course, you may argue that 10%-20%
> >> > of battery life while suspended is "a lot", but that really depends on the
> >> > possible alternatives.
> >> >
> >>
> >> We'd have to track actual PC10 residency instead of checking if it's
> >> the requested state since the SoC can enter a higher power package
> >> cstate even if PC10 is requested.
> >
> > That's correct, but it should be sufficient to check the PC10 residency
> > (there's some code to do that in turbostat, for example).
> >
> >> I think this can be done by reading
> >> an msr register, though. Is there an example of how PC10 can be
> >> entered without SLP_S0 getting asserted by the way?
> >
> > Yes, there is.
> >
> > PC10 is a power state of the north complex and it can be entered regardless
> > of the SLP_S0 status which is related to the south complex.
> >
> >> Also, this feature is disabled by default, so it doesn't prevent these
> >> use cases.
> >>
> >> > Second, as far as the alternatives go, it may not be rosy, because there are
> >> > systems that don't support S3 (or any other ACPI sleep states at all for that
> >> > matter) and suspend-to-idle is the only suspend mechanism available there.
> >> > On those systems it still may make sense to use it even though it may not
> >> > reduce the power draw that much.  And from some experiments, suspend-to-idle
> >> > still extends battery life by 100% over runtime idle even if the system is not
> >> > able to get to PC10 most of the time.
> >>
> >> This is off by default.
> >
> > Fair enough, but even so it may not be very useful in general as is.
> >
> >> >
> >> > While I understand the use case, I don't think it is a binary "yes"-"no" thing
> >> > and the focus on just SLP_S0 may be misguided.
> >>
> >> Do you have a preference such as being able to set the level that you
> >> want to validate to? For instance, there could be an option to check
> >> that SLP_S0 is asserted, but there could also be an option to check
> >> for PC9 or PC10 residency. For instance, there could be module
> >> parameters for setting the validated state:
> >>
> >> available_suspend_to_idle_states:
> >> "none pc6 pc9 pc10 slp_s0"
> >>
> >> max_suspend_to_idle_state:
> >> "none"
> >>
> >> Where the default validated state is none, but it can be set to any of
> >> the states in available_suspend_to_idle_states
> >
> > In the suspend-to-idle path the driver will always request the deepest state
> > available (C10 for Skylake) and I would validate the associated package state
> > by default plus optionally SLP_S0.
> 
> Should package state validation be enabled by default and should the
> user be able to disable it?

IMO the whole validation should depend on a command line option (in case
this is a system without S3 and the user has no choice really), but it should
check the package state residency in the first place.

I guess this means I would make it a "bail out if you can't go as deep as X"
with X possibly equal to "the deepest package state" or "SLP_S0".
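
One possible shape for such a command line option, as a sketch only (the parameter name and variable are assumptions):

#include <linux/init.h>
#include <linux/string.h>

/* Illustrative: "", "package", or "slp_s0"; empty means no validation. */
static char s2idle_validate[8];

static int __init s2idle_validate_setup(char *str)
{
	strlcpy(s2idle_validate, str, sizeof(s2idle_validate));
	return 1;
}
__setup("s2idle_validate=", s2idle_validate_setup);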

Thanks,
Rafael
Len Brown July 11, 2017, 3:43 p.m. UTC | #7
I acknowledge the specific need for this check to assure a great
user-experience on specific hardware.

I also concur with the motivation to make mechanisms general and generic so
they can be re-used.

However, it isn't clear to me that this check would be used outside of
some very specific scenarios, and so we may be trying too hard to make it
general, and the code would be simpler if we focus on that. We can always
make it more general when we have more use-cases...
Thomas Gleixner July 12, 2017, 10:16 p.m. UTC | #8
On Fri, 7 Jul 2017, Derek Basehore wrote:
> This adds validation of S0ix entry and enables it on Skylake. Using
> the new tick_set_freeze_event function, we program the CPU to wake up
> X seconds after entering freeze. After X seconds, it will wake the CPU
> to check the S0ix residency counters and make sure we entered the
> lowest power state for suspend-to-idle.
> 
> It exits freeze and reports an error to userspace when the SoC does
> not enter S0ix on suspend-to-idle.
> 
> One example of a bug that can prevent a Skylake CPU from entering S0ix
> (suspend-to-idle) is a leaked reference count to one of the i915 power
> wells. The CPU will not be able to enter Package C10 and will
> therefore use about 4x as much power for the entire system. The issue
> is not specific to the i915 power wells though.

I really have a hard time understanding the usefulness of this.

All I can see so far is detecting that something went wrong. So if this
happens once per day all the user gets is a completely useless line in
dmesg. Even if that message comes more often, it still tells only that
something went wrong without the slightest information about the potential
root cause.

There are more issues with this: If there is a hrtimer scheduled on that
last CPU which enters the idle freeze state and that timer is 10 minutes
away, then the check timer can't be programmed and the system will happily
stay for 10 minutes in some shallow C state without notice. Not really
useful.

You know upfront whether the i915 power wells (or whatever other machinery)
is not powered off to allow the system to enter a specific power state. If
you think hard enough about creating infrastructure which allows you to
register power related facilities and then check them in that idle freeze
enter state, then you get immediate information WHY this happens and not
just the by chance notification about the fact that it happened.
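
What such registration infrastructure could look like, as a very rough sketch (every identifier below is invented for illustration):

#include <linux/list.h>
#include <linux/printk.h>

struct s0ix_constraint {
	struct list_head node;
	const char *name;
	bool (*blocked)(void);	/* true if this facility blocks S0ix */
};

static LIST_HEAD(s0ix_constraints);

static void s0ix_register_constraint(struct s0ix_constraint *c)
{
	list_add_tail(&c->node, &s0ix_constraints);
}

/* Called before entering suspend-to-idle to name the offenders. */
static void s0ix_report_blockers(void)
{
	struct s0ix_constraint *c;

	list_for_each_entry(c, &s0ix_constraints, node)
		if (c->blocked())
			pr_warn("S0ix blocked by %s\n", c->name);
}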

I might be missing something, but aside from the issues I have with the
tick/clockevents abuse, this looks like some half-baked ad hoc debugging
aid of dubious value.

Thanks,

	tglx
Derek Basehore July 12, 2017, 11:14 p.m. UTC | #9
On Wed, Jul 12, 2017 at 3:16 PM, Thomas Gleixner <tglx@linutronix.de> wrote:
> On Fri, 7 Jul 2017, Derek Basehore wrote:
>> This adds validation of S0ix entry and enables it on Skylake. Using
>> the new tick_set_freeze_event function, we program the CPU to wake up
>> X seconds after entering freeze. After X seconds, it will wake the CPU
>> to check the S0ix residency counters and make sure we entered the
>> lowest power state for suspend-to-idle.
>>
>> It exits freeze and reports an error to userspace when the SoC does
>> not enter S0ix on suspend-to-idle.
>>
>> One example of a bug that can prevent a Skylake CPU from entering S0ix
>> (suspend-to-idle) is a leaked reference count to one of the i915 power
>> wells. The CPU will not be able to enter Package C10 and will
>> therefore use about 4x as much power for the entire system. The issue
>> is not specific to the i915 power wells though.
>
> I really have a hard time understanding the usefulness of this.
>
> All I can see so far is detecting that something went wrong. So if this
> happens once per day all the user gets is a completely useless line in
> dmesg. Even if that message comes more often, it still tells only that
> something went wrong without the slightest information about the potential
> root cause.
>
> There are more issues with this: If there is a hrtimer scheduled on that
> last CPU which enters the idle freeze state and that timer is 10 minutes
> away, then the check timer can't be programmed and the system will happily
> stay for 10 minutes in some shallow C state without notice. Not really
> useful.

Are hrtimers not suspended after timekeeping_suspend is called?

>
> You know upfront whether the i915 power wells (or whatever other machinery)
> is not powered off to allow the system to enter a specific power state. If
> you think hard enough about creating infrastructure which allows you to
> register power related facilities and then check them in that idle freeze
> enter state, then you get immediate information WHY this happens and not
> just the by chance notification about the fact that it happened.

It's not always something that can be checked by software. There was
one case where an ordering for powering down audio hardware prevented
proper PC10 entry, but there didn't seem to be any way to check that.
Hardware watchdogs also have the same lack of clarity, but most if not
all desktop and mobile processors ship with one. Overall, this seems
to be the best that can be done at this point in freeze, and we can't
really rely on every part of the system properly validating its state
in its suspend operation.

>
> I might be missing something, but aside from the issues I have with the
> tick/clockevents abuse, this looks like some half-baked ad hoc debugging
> aid of dubious value.
>
> Thanks,
>
>         tglx
>
>
>
>
Derek Basehore July 13, 2017, 1:06 a.m. UTC | #10
On Wed, Jul 12, 2017 at 3:16 PM, Thomas Gleixner <tglx@linutronix.de> wrote:
> On Fri, 7 Jul 2017, Derek Basehore wrote:
>> This adds validation of S0ix entry and enables it on Skylake. Using
>> the new tick_set_freeze_event function, we program the CPU to wake up
>> X seconds after entering freeze. After X seconds, it will wake the CPU
>> to check the S0ix residency counters and make sure we entered the
>> lowest power state for suspend-to-idle.
>>
>> It exits freeze and reports an error to userspace when the SoC does
>> not enter S0ix on suspend-to-idle.
>>
>> One example of a bug that can prevent a Skylake CPU from entering S0ix
>> (suspend-to-idle) is a leaked reference count to one of the i915 power
>> wells. The CPU will not be able to enter Package C10 and will
>> therefore use about 4x as much power for the entire system. The issue
>> is not specific to the i915 power wells though.
>
> I really have a hard time understanding the usefulness of this.
>
> All I can see so far is detecting that something went wrong. So if this
> happens once per day all the user gets is a completely useless line in
> dmesg. Even if that message comes more often, it still tells only that
> something went wrong without the slightest information about the potential
> root cause.
>
> There are more issues with this: If there is a hrtimer scheduled on that
> last CPU which enters the idle freeze state and that timer is 10 minutes
> away, then the check timer can't be programmed and the system will happily
> stay for 10 minutes in some shallow C state without notice. Not really
> useful.
>
> You know upfront whether the i915 power wells (or whatever other machinery)
> is not powered off to allow the system to enter a specific power state. If
> you think hard enough about creating infrastructure which allows you to
> register power related facilities and then check them in that idle freeze
> enter state, then you get immediate information WHY this happens and not
> just the by chance notification about the fact that it happened.
>
> I might be missing something, but aside from the issues I have with the
> tick/clockevents abuse, this looks like some half-baked ad hoc debugging
> aid of dubious value.

This isn't just for debugging. If we just wanted it for debugging, we
could read the SLP S0 counter on resume. This is also to fail
suspend-to-idle quickly so it can be tried again (or just shut down the
machine) when a user closes the machine for the weekend.
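
For illustration, a minimal userspace sketch of that policy, assuming the
failed check is surfaced as an error from the write to /sys/power/state; the
retry count and fallback here are arbitrary:

/* Illustrative only: retry suspend-to-idle once, then power off. */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

static int try_s2idle(void)
{
	int fd, err = 0;
	ssize_t n;

	fd = open("/sys/power/state", O_WRONLY);
	if (fd < 0)
		return -errno;
	/* Blocks until resume; fails if suspend-to-idle could not be entered. */
	n = write(fd, "freeze", strlen("freeze"));
	if (n < 0)
		err = -errno;
	close(fd);
	return err;
}

int main(void)
{
	int i;

	for (i = 0; i < 2; i++) {
		if (try_s2idle() == 0)
			return 0;	/* suspended and resumed normally */
		fprintf(stderr, "suspend-to-idle failed, retrying\n");
	}
	/* Give up and shut down rather than sit in a shallow C state. */
	return system("poweroff");
}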

>
> Thanks,
>
>         tglx
>
>
>
>
Thomas Gleixner July 13, 2017, 5:11 a.m. UTC | #11
On Wed, 12 Jul 2017, dbasehore . wrote:
> On Wed, Jul 12, 2017 at 3:16 PM, Thomas Gleixner <tglx@linutronix.de> wrote:
> > There are more issues with this: If there is a hrtimer scheduled on that
> > last CPU which enters the idle freeze state and that timer is 10 minutes
> > away, then the check timer can't be programmed and the system will happily
> > stay for 10 minutes in some shallow C state without notice. Not really
> > useful.
> 
> Are hrtimers not suspended after timekeeping_suspend is called?

They are. As I said I forgot about the inner workings and that check for
state != shutdown confused me even more, as it just looked like this might
be a valid state.

> > You know upfront whether the i915 power wells (or whatever other machinery)
> > is not powered off to allow the system to enter a specific power state. If
> > you think hard enough about creating infrastructure which allows you to
> > register power related facilities and then check them in that idle freeze
> > enter state, then you get immediate information WHY this happens and not
> > just the by chance notification about the fact that it happened.
> 
> It's not always something that can be checked by software. There was
> one case where an ordering for powering down audio hardware prevented
> proper PC10 entry, but there didn't seem to be any way to check that.
> Hardware watchdogs also have the same lack of clarity, but most if not
> all desktop and mobile processors ship with one. Overall, this seems
> to be the best that can be done at this point in freeze, and we can't
> really rely on every part of the system properly validating its state
> in its suspend operation.

So if I understand correctly, this is the last resort of catching problems
which can't be detected upfront or are caused by a software bug.

I'm fine with that, but please explain and document it properly. The current
explanation is confusing at best.

Thanks,

	tglx
Derek Basehore July 13, 2017, 10:49 p.m. UTC | #12
On Wed, Jul 12, 2017 at 10:11 PM, Thomas Gleixner <tglx@linutronix.de> wrote:
> On Wed, 12 Jul 2017, dbasehore . wrote:
>> On Wed, Jul 12, 2017 at 3:16 PM, Thomas Gleixner <tglx@linutronix.de> wrote:
>> > There are more issues with this: If there is a hrtimer scheduled on that
>> > last CPU which enters the idle freeze state and that timer is 10 minutes
>> > away, then the check timer can't be programmed and the system will happily
>> > stay for 10 minutes in some shallow C state without notice. Not really
>> > useful.
>>
>> Are hrtimers not suspended after timekeeping_suspend is called?
>
> They are. As I said I forgot about the inner workings and that check for
> state != shutdown confused me even more, as it just looked like this might
> be a valid state.

Okay. I'll add a comment to clarify this part.

>
>> > You know upfront whether the i915 power wells (or whatever other machinery)
>> > is not powered off to allow the system to enter a specific power state. If
>> > you think hard enough about creating infrastructure which allows you to
>> > register power related facilities and then check them in that idle freeze
>> > enter state, then you get immediate information WHY this happens and not
>> > just the by chance notification about the fact that it happened.
>>
>> It's not always something that can be checked by software. There was
>> one case where an ordering for powering down audio hardware prevented
>> proper PC10 entry, but there didn't seem to be any way to check that.
>> Hardware watchdogs also have the same lack of clarity, but most if not
>> all desktop and mobile processors ship with one. Overall, this seems
>> to be the best that can be done at this point in freeze, and we can't
>> really rely on every part of the system properly validating its state
>> in its suspend operation.
>
> So if I understand correctly, this is the last resort of catching problems
> which can't be detected upfront or are caused by a software bug.
>
> I'm fine with that, but please explain and document it properly. The current
> explanation is confusing at best.

Will do.

>
> Thanks,
>
>         tglx
diff mbox

Patch

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index ebed3f804291..d38621da6e54 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -61,10 +61,12 @@ 
 #include <linux/notifier.h>
 #include <linux/cpu.h>
 #include <linux/moduleparam.h>
+#include <linux/suspend.h>
 #include <asm/cpu_device_id.h>
 #include <asm/intel-family.h>
 #include <asm/mwait.h>
 #include <asm/msr.h>
+#include <asm/pmc_core.h>
 
 #define INTEL_IDLE_VERSION "0.4.1"
 
@@ -93,12 +95,29 @@  struct idle_cpu {
 	bool disable_promotion_to_c1e;
 };
 
+/*
+ * The limit for the exponential backoff of the freeze duration. At this point,
+ * the power impact is far from measurable. It's about 3uW based on scaling from
+ * waking up 10 times a second.
+ */
+#define MAX_SLP_S0_SECONDS 1000
+#define SLP_S0_EXP_BASE 10
+
+static bool slp_s0_check;
+static unsigned int slp_s0_seconds;
+
+static DEFINE_SPINLOCK(slp_s0_check_lock);
+static unsigned int slp_s0_num_cpus;
+static bool slp_s0_check_inprogress;
+
 static const struct idle_cpu *icpu;
 static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;
 static int intel_idle(struct cpuidle_device *dev,
 			struct cpuidle_driver *drv, int index);
 static int intel_idle_freeze(struct cpuidle_device *dev,
 			     struct cpuidle_driver *drv, int index);
+static int intel_idle_freeze_and_check(struct cpuidle_device *dev,
+				       struct cpuidle_driver *drv, int index);
 static struct cpuidle_state *cpuidle_state_table;
 
 /*
@@ -597,7 +616,7 @@  static struct cpuidle_state skl_cstates[] = {
 		.exit_latency = 2,
 		.target_residency = 2,
 		.enter = &intel_idle,
-		.enter_freeze = intel_idle_freeze, },
+		.enter_freeze = intel_idle_freeze_and_check, },
 	{
 		.name = "C1E",
 		.desc = "MWAIT 0x01",
@@ -605,7 +624,7 @@  static struct cpuidle_state skl_cstates[] = {
 		.exit_latency = 10,
 		.target_residency = 20,
 		.enter = &intel_idle,
-		.enter_freeze = intel_idle_freeze, },
+		.enter_freeze = intel_idle_freeze_and_check, },
 	{
 		.name = "C3",
 		.desc = "MWAIT 0x10",
@@ -613,7 +632,7 @@  static struct cpuidle_state skl_cstates[] = {
 		.exit_latency = 70,
 		.target_residency = 100,
 		.enter = &intel_idle,
-		.enter_freeze = intel_idle_freeze, },
+		.enter_freeze = intel_idle_freeze_and_check, },
 	{
 		.name = "C6",
 		.desc = "MWAIT 0x20",
@@ -621,7 +640,7 @@  static struct cpuidle_state skl_cstates[] = {
 		.exit_latency = 85,
 		.target_residency = 200,
 		.enter = &intel_idle,
-		.enter_freeze = intel_idle_freeze, },
+		.enter_freeze = intel_idle_freeze_and_check, },
 	{
 		.name = "C7s",
 		.desc = "MWAIT 0x33",
@@ -629,7 +648,7 @@  static struct cpuidle_state skl_cstates[] = {
 		.exit_latency = 124,
 		.target_residency = 800,
 		.enter = &intel_idle,
-		.enter_freeze = intel_idle_freeze, },
+		.enter_freeze = intel_idle_freeze_and_check, },
 	{
 		.name = "C8",
 		.desc = "MWAIT 0x40",
@@ -637,7 +656,7 @@  static struct cpuidle_state skl_cstates[] = {
 		.exit_latency = 200,
 		.target_residency = 800,
 		.enter = &intel_idle,
-		.enter_freeze = intel_idle_freeze, },
+		.enter_freeze = intel_idle_freeze_and_check, },
 	{
 		.name = "C9",
 		.desc = "MWAIT 0x50",
@@ -645,7 +664,7 @@  static struct cpuidle_state skl_cstates[] = {
 		.exit_latency = 480,
 		.target_residency = 5000,
 		.enter = &intel_idle,
-		.enter_freeze = intel_idle_freeze, },
+		.enter_freeze = intel_idle_freeze_and_check, },
 	{
 		.name = "C10",
 		.desc = "MWAIT 0x60",
@@ -653,7 +672,7 @@  static struct cpuidle_state skl_cstates[] = {
 		.exit_latency = 890,
 		.target_residency = 5000,
 		.enter = &intel_idle,
-		.enter_freeze = intel_idle_freeze, },
+		.enter_freeze = intel_idle_freeze_and_check, },
 	{
 		.enter = NULL }
 };
@@ -940,6 +959,8 @@  static __cpuidle int intel_idle(struct cpuidle_device *dev,
  * @dev: cpuidle_device
  * @drv: cpuidle driver
  * @index: state index
+ *
+ * @return 0 for success; this function cannot fail
  */
 static int intel_idle_freeze(struct cpuidle_device *dev,
 			     struct cpuidle_driver *drv, int index)
@@ -952,6 +973,101 @@  static int intel_idle_freeze(struct cpuidle_device *dev,
 	return 0;
 }
 
+static int check_slp_s0(u32 slp_s0_saved_count)
+{
+	u32 slp_s0_new_count;
+
+	if (intel_pmc_slp_s0_counter_read(&slp_s0_new_count)) {
+		pr_warn("Unable to read SLP S0 residency counter\n");
+		return -EIO;
+	}
+
+	if (slp_s0_saved_count == slp_s0_new_count) {
+		pr_warn("CPU did not enter SLP S0 for suspend-to-idle.\n");
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/**
+ * intel_idle_freeze_and_check - enters suspend-to-idle and validates the power
+ * state
+ *
+ * This function enters suspend-to-idle with intel_idle_freeze, but also sets up
+ * a timer to check that S0ix (low power state for suspend-to-idle on Intel
+ * CPUs) is properly entered.
+ *
+ * @dev: cpuidle_device
+ * @drv: cpuidle_driver
+ * @index: state index
+ * @return 0 for success, a negative error code if S0ix was not entered.
+ */
+static int intel_idle_freeze_and_check(struct cpuidle_device *dev,
+				       struct cpuidle_driver *drv, int index)
+{
+	bool check_on_this_cpu = false;
+	u32 slp_s0_saved_count;
+	unsigned long flags;
+	int cpu = smp_processor_id();
+	int ret;
+
+	/* The last CPU to freeze sets up checking SLP S0 assertion. */
+	spin_lock_irqsave(&slp_s0_check_lock, flags);
+	slp_s0_num_cpus++;
+	if (slp_s0_seconds &&
+	    slp_s0_num_cpus == num_online_cpus() &&
+	    !slp_s0_check_inprogress &&
+	    !intel_pmc_slp_s0_counter_read(&slp_s0_saved_count)) {
+		ret = tick_set_freeze_event(cpu, ktime_set(slp_s0_seconds, 0));
+		if (ret < 0) {
+			spin_unlock_irqrestore(&slp_s0_check_lock, flags);
+			goto out;
+		}
+
+		/*
+		 * Make sure check_slp_s0 isn't scheduled on another CPU if it
+		 * were to leave freeze and enter it again before this CPU
+		 * leaves freeze.
+		 */
+		slp_s0_check_inprogress = true;
+		check_on_this_cpu = true;
+	}
+	spin_unlock_irqrestore(&slp_s0_check_lock, flags);
+
+	ret = intel_idle_freeze(dev, drv, index);
+	if (ret < 0)
+		goto out;
+
+	if (check_on_this_cpu && tick_clear_freeze_event(cpu))
+		ret = check_slp_s0(slp_s0_saved_count);
+
+out:
+	spin_lock_irqsave(&slp_s0_check_lock, flags);
+	if (check_on_this_cpu) {
+		slp_s0_check_inprogress = false;
+		slp_s0_seconds = min_t(unsigned int,
+				       SLP_S0_EXP_BASE * slp_s0_seconds,
+				       MAX_SLP_S0_SECONDS);
+	}
+	slp_s0_num_cpus--;
+	spin_unlock_irqrestore(&slp_s0_check_lock, flags);
+	return ret;
+}
+
+static int slp_s0_check_prepare(struct notifier_block *nb, unsigned long action,
+				void *data)
+{
+	if (action == PM_SUSPEND_PREPARE)
+		slp_s0_seconds = slp_s0_check ? 1 : 0;
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block intel_slp_s0_check_nb = {
+	.notifier_call = slp_s0_check_prepare,
+};
+
 static void __setup_broadcast_timer(bool on)
 {
 	if (on)
@@ -1454,6 +1570,13 @@  static int __init intel_idle_init(void)
 		goto init_driver_fail;
 	}
 
+	retval = register_pm_notifier(&intel_slp_s0_check_nb);
+	if (retval) {
+		free_percpu(intel_idle_cpuidle_devices);
+		cpuidle_unregister_driver(&intel_idle_driver);
+		goto pm_nb_fail;
+	}
+
 	if (boot_cpu_has(X86_FEATURE_ARAT))	/* Always Reliable APIC Timer */
 		lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE;
 
@@ -1469,6 +1592,8 @@  static int __init intel_idle_init(void)
 
 hp_setup_fail:
 	intel_idle_cpuidle_devices_uninit();
+	unregister_pm_notifier(&intel_slp_s0_check_nb);
+pm_nb_fail:
 	cpuidle_unregister_driver(&intel_idle_driver);
 init_driver_fail:
 	free_percpu(intel_idle_cpuidle_devices);
@@ -1484,3 +1609,4 @@  device_initcall(intel_idle_init);
  * is the easiest way (currently) to continue doing that.
  */
 module_param(max_cstate, int, 0444);
+module_param(slp_s0_check, bool, 0644);
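
A note on the backoff above: slp_s0_seconds starts at 1 in
slp_s0_check_prepare() and is multiplied by SLP_S0_EXP_BASE after each check,
capped at MAX_SLP_S0_SECONDS, so successive checks are scheduled 1 s, 10 s,
100 s and then 1000 s after (re)entering freeze. A standalone snippet that
only prints this progression (illustration, not part of the patch):

/* Illustration of the slp_s0_seconds progression used by the patch. */
#include <stdio.h>

#define MAX_SLP_S0_SECONDS 1000
#define SLP_S0_EXP_BASE 10

int main(void)
{
	unsigned int seconds = 1;	/* slp_s0_check_prepare() starts at 1 */
	int i;

	for (i = 1; i <= 6; i++) {
		printf("check %d scheduled %u s after entering freeze\n",
		       i, seconds);
		/* Same cap as the min_t() in intel_idle_freeze_and_check(). */
		if (SLP_S0_EXP_BASE * seconds < MAX_SLP_S0_SECONDS)
			seconds = SLP_S0_EXP_BASE * seconds;
		else
			seconds = MAX_SLP_S0_SECONDS;
	}
	return 0;
}

With module_param(slp_s0_check, bool, 0644), the check would presumably be
enabled at boot with intel_idle.slp_s0_check=1 (intel_idle is normally built
in) or toggled at runtime through /sys/module/intel_idle/parameters/slp_s0_check.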