Message ID | 20170708000303.21863-5-dbasehore@chromium.org (mailing list archive) |
---|---|
State | RFC, archived |
Hi Derek,

[auto build test ERROR on pm/linux-next]
[also build test ERROR on next-20170707]
[cannot apply to v4.12]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Derek-Basehore/x86-stub-out-pmc-function/20170709-134714
base:   https://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git linux-next
config: x86_64-randconfig-x015-201728 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64

All errors (new ones prefixed by >>):

   drivers/idle/intel_idle.c: In function 'intel_idle_freeze_and_check':
>> drivers/idle/intel_idle.c:1022:9: error: implicit declaration of function 'tick_set_freeze_event' [-Werror=implicit-function-declaration]
      ret = tick_set_freeze_event(cpu, ktime_set(slp_s0_seconds, 0));
            ^~~~~~~~~~~~~~~~~~~~~
>> drivers/idle/intel_idle.c:1042:27: error: implicit declaration of function 'tick_clear_freeze_event' [-Werror=implicit-function-declaration]
      if (check_on_this_cpu && tick_clear_freeze_event(cpu))
                               ^~~~~~~~~~~~~~~~~~~~~~~
   Cyclomatic Complexity 2 arch/x86/include/asm/bitops.h:set_bit
   Cyclomatic Complexity 2 arch/x86/include/asm/bitops.h:clear_bit
   Cyclomatic Complexity 1 arch/x86/include/asm/bitops.h:constant_test_bit
   Cyclomatic Complexity 1 arch/x86/include/asm/bitops.h:variable_test_bit
   Cyclomatic Complexity 1 arch/x86/include/asm/current.h:get_current
   Cyclomatic Complexity 1 arch/x86/include/asm/paravirt.h:__cpuid
   Cyclomatic Complexity 1 arch/x86/include/asm/paravirt.h:paravirt_read_msr
   Cyclomatic Complexity 1 arch/x86/include/asm/paravirt.h:paravirt_write_msr
   Cyclomatic Complexity 1 arch/x86/include/asm/paravirt.h:wrmsrl
   Cyclomatic Complexity 1 arch/x86/include/asm/paravirt.h:arch_local_save_flags
   Cyclomatic Complexity 1 arch/x86/include/asm/paravirt.h:arch_local_irq_restore
   Cyclomatic Complexity 1 arch/x86/include/asm/paravirt.h:arch_local_irq_disable
   Cyclomatic Complexity 1 arch/x86/include/asm/paravirt.h:arch_local_irq_save
   Cyclomatic Complexity 1 arch/x86/include/asm/special_insns.h:clflush
   Cyclomatic Complexity 1 include/linux/math64.h:div64_u64
   Cyclomatic Complexity 1 arch/x86/include/asm/processor.h:cpuid
   Cyclomatic Complexity 3 arch/x86/include/asm/cpufeature.h:_static_cpu_has
   Cyclomatic Complexity 1 include/linux/thread_info.h:set_ti_thread_flag
   Cyclomatic Complexity 1 include/linux/thread_info.h:clear_ti_thread_flag
   Cyclomatic Complexity 2 include/linux/thread_info.h:test_ti_thread_flag
   Cyclomatic Complexity 1 arch/x86/include/asm/preempt.h:set_preempt_need_resched
   Cyclomatic Complexity 5 arch/x86/include/asm/preempt.h:__preempt_count_add
   Cyclomatic Complexity 5 arch/x86/include/asm/preempt.h:__preempt_count_sub
   Cyclomatic Complexity 1 include/linux/spinlock.h:spinlock_check
   Cyclomatic Complexity 1 include/linux/spinlock.h:spin_unlock_irqrestore
   Cyclomatic Complexity 2 include/linux/ktime.h:ktime_set
   Cyclomatic Complexity 1 include/linux/sched.h:need_resched
   Cyclomatic Complexity 1 include/linux/tick.h:tick_broadcast_enable
   Cyclomatic Complexity 1 include/linux/tick.h:tick_broadcast_disable
   Cyclomatic Complexity 1 include/linux/tick.h:tick_broadcast_enter
   Cyclomatic Complexity 1 include/linux/tick.h:tick_broadcast_exit
   Cyclomatic Complexity 1 arch/x86/include/asm/mmu.h:leave_mm
   Cyclomatic Complexity 1 include/linux/cpuhotplug.h:cpuhp_setup_state
   Cyclomatic Complexity 1 include/linux/suspend.h:register_pm_notifier
   Cyclomatic Complexity 1 include/linux/suspend.h:unregister_pm_notifier
   Cyclomatic Complexity 1 include/linux/sched/idle.h:__current_set_polling
   Cyclomatic Complexity 1 include/linux/sched/idle.h:current_set_polling_and_test
   Cyclomatic Complexity 1 include/linux/sched/idle.h:__current_clr_polling
   Cyclomatic Complexity 2 include/linux/sched/idle.h:current_clr_polling
   Cyclomatic Complexity 1 arch/x86/include/asm/mwait.h:__monitor
   Cyclomatic Complexity 1 arch/x86/include/asm/mwait.h:__mwait
   Cyclomatic Complexity 5 arch/x86/include/asm/mwait.h:mwait_idle_with_hints
   Cyclomatic Complexity 4 drivers/idle/intel_idle.c:intel_idle
   Cyclomatic Complexity 1 drivers/idle/intel_idle.c:intel_idle_freeze
   Cyclomatic Complexity 2 drivers/idle/intel_idle.c:slp_s0_check_prepare
   Cyclomatic Complexity 2 drivers/idle/intel_idle.c:__setup_broadcast_timer
   Cyclomatic Complexity 1 drivers/idle/intel_idle.c:auto_demotion_disable
   Cyclomatic Complexity 1 drivers/idle/intel_idle.c:c1e_promotion_disable
   Cyclomatic Complexity 5 drivers/idle/intel_idle.c:ivt_idle_state_table_update
   Cyclomatic Complexity 2 drivers/idle/intel_idle.c:irtl_2_usec
   Cyclomatic Complexity 6 drivers/idle/intel_idle.c:bxt_idle_state_table_update
   Cyclomatic Complexity 6 drivers/idle/intel_idle.c:sklh_idle_state_table_update
   Cyclomatic Complexity 4 drivers/idle/intel_idle.c:intel_idle_state_table_update
   Cyclomatic Complexity 4 drivers/idle/intel_idle.c:intel_idle_cpu_init
   Cyclomatic Complexity 3 drivers/idle/intel_idle.c:intel_idle_cpu_online
   Cyclomatic Complexity 7 drivers/idle/intel_idle.c:intel_idle_probe
   Cyclomatic Complexity 3 drivers/idle/intel_idle.c:check_slp_s0
   Cyclomatic Complexity 10 drivers/idle/intel_idle.c:intel_idle_freeze_and_check
   Cyclomatic Complexity 10 drivers/idle/intel_idle.c:intel_idle_cpuidle_driver_init
   Cyclomatic Complexity 2 drivers/idle/intel_idle.c:intel_idle_cpuidle_devices_uninit
   Cyclomatic Complexity 9 drivers/idle/intel_idle.c:intel_idle_init
   cc1: some warnings being treated as errors

vim +/tick_set_freeze_event +1022 drivers/idle/intel_idle.c

  1016		spin_lock_irqsave(&slp_s0_check_lock, flags);
  1017		slp_s0_num_cpus++;
  1018		if (slp_s0_seconds &&
  1019		    slp_s0_num_cpus == num_online_cpus() &&
  1020		    !slp_s0_check_inprogress &&
  1021		    !intel_pmc_slp_s0_counter_read(&slp_s0_saved_count)) {
> 1022			ret = tick_set_freeze_event(cpu, ktime_set(slp_s0_seconds, 0));
  1023			if (ret < 0) {
  1024				spin_unlock_irqrestore(&slp_s0_check_lock, flags);
  1025				goto out;
  1026			}
  1027	
  1028			/*
  1029			 * Make sure check_slp_s0 isn't scheduled on another CPU if it
  1030			 * were to leave freeze and enter it again before this CPU
  1031			 * leaves freeze.
  1032			 */
  1033			slp_s0_check_inprogress = true;
  1034			check_on_this_cpu = true;
  1035		}
  1036		spin_unlock_irqrestore(&slp_s0_check_lock, flags);
  1037	
  1038		ret = intel_idle_freeze(dev, drv, index);
  1039		if (ret < 0)
  1040			goto out;
  1041	
> 1042		if (check_on_this_cpu && tick_clear_freeze_event(cpu))
  1043			ret = check_slp_s0(slp_s0_saved_count);
  1044	
  1045	out:

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
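The two "implicit declaration" errors in this report come from calling
tick_set_freeze_event()/tick_clear_freeze_event() in a configuration where the
declarations added earlier in this series are not visible. One common way to
keep such randconfig builds green is to pair the real declarations with no-op
fallbacks in the header. The sketch below is only illustrative: the signatures
are inferred from the call sites in this patch, and the CONFIG_TICK_ONESHOT
guard is an assumption, not taken from the series.

/* Hypothetical fallback stubs, e.g. in include/linux/tick.h */
#ifdef CONFIG_TICK_ONESHOT
extern int tick_set_freeze_event(int cpu, ktime_t expires);
extern int tick_clear_freeze_event(int cpu);
#else
static inline int tick_set_freeze_event(int cpu, ktime_t expires)
{
	return -ENODEV;	/* not available in this configuration */
}
static inline int tick_clear_freeze_event(int cpu)
{
	return 0;	/* nothing was armed, so nothing to clear */
}
#endif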
On Friday, July 07, 2017 05:03:03 PM Derek Basehore wrote: > This adds validation of S0ix entry and enables it on Skylake. Using > the new tick_set_freeze_event function, we program the CPU to wake up > X seconds after entering freeze. After X seconds, it will wake the CPU > to check the S0ix residency counters and make sure we entered the > lowest power state for suspend-to-idle. > > It exits freeze and reports an error to userspace when the SoC does > not enter S0ix on suspend-to-idle. > Honestly, I'm totally unsure about this ATM, as it seems to assume that it doesn't make senes to stay suspended if SLP_S0 residency is not there, but that totally may not be the case. First of all, there are systems in which SLP_S0 is related to about 10%-20% of the total power draw reduction whereas the remaining 80%-90% comes from PC10 alone. So, if you can get to PC10, being unable to get SLP_S0 residency on top of that may not be a big deal after all. Of course, you may argue that 10%-20% of battery life while suspended is "a lot", but that really depends on the possible alternatives. Second, as far as the alternatives go, it may not be rosy, because there are systems that don't support S3 (or any other ACPI sleep states at all for that matter) and suspend-to-idle is the only suspend mechanism available there. On those systems it still may make sense to use it even though it may not reduce the power draw that much. And from some experiments, suspend-to-idle still extends battery life by 100% over runtime idle even if the system is not able to get to PC10 most of the time. While I understand the use case, I don't think it is a binary "yes"-"no" thing and the focus on just SLP_S0 may be misguided. > One example of a bug that can prevent a Skylake CPU from entering S0ix > (suspend-to-idle) is a leaked reference count to one of the i915 power Suspend-to-idle is not S0ix. Suspend-to-idle is the state the kernel puts the system into after echoing "freeze" to /sys/power/state and S0ix is a platform power state that may or may not be entered as a result of that. > wells. The CPU will not be able to enter Package C10 and will > therefore use about 4x as much power for the entire system. The issue > is not specific to the i915 power wells though. Well, fair enough, but what if the SoC can enter PC10, but without SLP_S0 residency on top of it? > Signed-off-by: Derek Basehore <dbasehore@chromium.org> > --- > drivers/idle/intel_idle.c | 142 +++++++++++++++++++++++++++++++++++++++++++--- > 1 file changed, 134 insertions(+), 8 deletions(-) > > diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c > index ebed3f804291..d38621da6e54 100644 > --- a/drivers/idle/intel_idle.c > +++ b/drivers/idle/intel_idle.c > @@ -61,10 +61,12 @@ > #include <linux/notifier.h> > #include <linux/cpu.h> > #include <linux/moduleparam.h> > +#include <linux/suspend.h> > #include <asm/cpu_device_id.h> > #include <asm/intel-family.h> > #include <asm/mwait.h> > #include <asm/msr.h> > +#include <asm/pmc_core.h> > > #define INTEL_IDLE_VERSION "0.4.1" > > @@ -93,12 +95,29 @@ struct idle_cpu { > bool disable_promotion_to_c1e; > }; > > +/* > + * The limit for the exponential backoff for the freeze duration. At this point, > + * power impact is is far from measurable. It's about 3uW based on scaling from > + * waking up 10 times a second. 
> + */ > +#define MAX_SLP_S0_SECONDS 1000 > +#define SLP_S0_EXP_BASE 10 > + > +static bool slp_s0_check; > +static unsigned int slp_s0_seconds; > + > +static DEFINE_SPINLOCK(slp_s0_check_lock); > +static unsigned int slp_s0_num_cpus; > +static bool slp_s0_check_inprogress; > + > static const struct idle_cpu *icpu; > static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; > static int intel_idle(struct cpuidle_device *dev, > struct cpuidle_driver *drv, int index); > static int intel_idle_freeze(struct cpuidle_device *dev, > struct cpuidle_driver *drv, int index); > +static int intel_idle_freeze_and_check(struct cpuidle_device *dev, > + struct cpuidle_driver *drv, int index); > static struct cpuidle_state *cpuidle_state_table; > > /* > @@ -597,7 +616,7 @@ static struct cpuidle_state skl_cstates[] = { > .exit_latency = 2, > .target_residency = 2, > .enter = &intel_idle, > - .enter_freeze = intel_idle_freeze, }, > + .enter_freeze = intel_idle_freeze_and_check, }, Why do you do this for anything lower than C6? > { > .name = "C1E", > .desc = "MWAIT 0x01", > @@ -605,7 +624,7 @@ static struct cpuidle_state skl_cstates[] = { > .exit_latency = 10, > .target_residency = 20, > .enter = &intel_idle, > - .enter_freeze = intel_idle_freeze, }, > + .enter_freeze = intel_idle_freeze_and_check, }, > { > .name = "C3", > .desc = "MWAIT 0x10", > @@ -613,7 +632,7 @@ static struct cpuidle_state skl_cstates[] = { > .exit_latency = 70, > .target_residency = 100, > .enter = &intel_idle, > - .enter_freeze = intel_idle_freeze, }, > + .enter_freeze = intel_idle_freeze_and_check, }, > { > .name = "C6", > .desc = "MWAIT 0x20", > @@ -621,7 +640,7 @@ static struct cpuidle_state skl_cstates[] = { > .exit_latency = 85, > .target_residency = 200, > .enter = &intel_idle, > - .enter_freeze = intel_idle_freeze, }, > + .enter_freeze = intel_idle_freeze_and_check, }, > { > .name = "C7s", > .desc = "MWAIT 0x33", > @@ -629,7 +648,7 @@ static struct cpuidle_state skl_cstates[] = { > .exit_latency = 124, > .target_residency = 800, > .enter = &intel_idle, > - .enter_freeze = intel_idle_freeze, }, > + .enter_freeze = intel_idle_freeze_and_check, }, > { > .name = "C8", > .desc = "MWAIT 0x40", > @@ -637,7 +656,7 @@ static struct cpuidle_state skl_cstates[] = { > .exit_latency = 200, > .target_residency = 800, > .enter = &intel_idle, > - .enter_freeze = intel_idle_freeze, }, > + .enter_freeze = intel_idle_freeze_and_check, }, > { > .name = "C9", > .desc = "MWAIT 0x50", > @@ -645,7 +664,7 @@ static struct cpuidle_state skl_cstates[] = { > .exit_latency = 480, > .target_residency = 5000, > .enter = &intel_idle, > - .enter_freeze = intel_idle_freeze, }, > + .enter_freeze = intel_idle_freeze_and_check, }, > { > .name = "C10", > .desc = "MWAIT 0x60", > @@ -653,7 +672,7 @@ static struct cpuidle_state skl_cstates[] = { > .exit_latency = 890, > .target_residency = 5000, > .enter = &intel_idle, > - .enter_freeze = intel_idle_freeze, }, > + .enter_freeze = intel_idle_freeze_and_check, }, > { > .enter = NULL } > }; > @@ -940,6 +959,8 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev, > * @dev: cpuidle_device > * @drv: cpuidle driver > * @index: state index > + * > + * @return 0 for success, no failure state > */ > static int intel_idle_freeze(struct cpuidle_device *dev, > struct cpuidle_driver *drv, int index) > @@ -952,6 +973,101 @@ static int intel_idle_freeze(struct cpuidle_device *dev, > return 0; > } > > +static int check_slp_s0(u32 slp_s0_saved_count) > +{ > + u32 slp_s0_new_count; > + > + if 
(intel_pmc_slp_s0_counter_read(&slp_s0_new_count)) { > + pr_warn("Unable to read SLP S0 residency counter\n"); > + return -EIO; > + } > + > + if (slp_s0_saved_count == slp_s0_new_count) { > + pr_warn("CPU did not enter SLP S0 for suspend-to-idle.\n"); > + return -EIO; > + } > + > + return 0; > +} > + > +/** > + * intel_idle_freeze_and_check - enters suspend-to-idle and validates the power > + * state > + * > + * This function enters suspend-to-idle with intel_idle_freeze, but also sets up > + * a timer to check that S0ix (low power state for suspend-to-idle on Intel > + * CPUs) is properly entered. > + * > + * @dev: cpuidle_device > + * @drv: cpuidle_driver > + * @index: state index > + * @return 0 for success, -EERROR if S0ix was not entered. > + */ > +static int intel_idle_freeze_and_check(struct cpuidle_device *dev, > + struct cpuidle_driver *drv, int index) > +{ > + bool check_on_this_cpu = false; > + u32 slp_s0_saved_count; > + unsigned long flags; > + int cpu = smp_processor_id(); > + int ret; > + > + /* The last CPU to freeze sets up checking SLP S0 assertion. */ > + spin_lock_irqsave(&slp_s0_check_lock, flags); > + slp_s0_num_cpus++; > + if (slp_s0_seconds && > + slp_s0_num_cpus == num_online_cpus() && > + !slp_s0_check_inprogress && > + !intel_pmc_slp_s0_counter_read(&slp_s0_saved_count)) { > + ret = tick_set_freeze_event(cpu, ktime_set(slp_s0_seconds, 0)); > + if (ret < 0) { > + spin_unlock_irqrestore(&slp_s0_check_lock, flags); > + goto out; > + } > + > + /* > + * Make sure check_slp_s0 isn't scheduled on another CPU if it > + * were to leave freeze and enter it again before this CPU > + * leaves freeze. > + */ > + slp_s0_check_inprogress = true; > + check_on_this_cpu = true; > + } > + spin_unlock_irqrestore(&slp_s0_check_lock, flags); > + > + ret = intel_idle_freeze(dev, drv, index); > + if (ret < 0) > + goto out; > + > + if (check_on_this_cpu && tick_clear_freeze_event(cpu)) > + ret = check_slp_s0(slp_s0_saved_count); > + > +out: > + spin_lock_irqsave(&slp_s0_check_lock, flags); > + if (check_on_this_cpu) { > + slp_s0_check_inprogress = false; > + slp_s0_seconds = min_t(unsigned int, > + SLP_S0_EXP_BASE * slp_s0_seconds, > + MAX_SLP_S0_SECONDS); > + } > + slp_s0_num_cpus--; > + spin_unlock_irqrestore(&slp_s0_check_lock, flags); > + return ret; > +} > + > +static int slp_s0_check_prepare(struct notifier_block *nb, unsigned long action, > + void *data) > +{ > + if (action == PM_SUSPEND_PREPARE) > + slp_s0_seconds = slp_s0_check ? 1 : 0; > + > + return NOTIFY_DONE; > +} > + > +static struct notifier_block intel_slp_s0_check_nb = { > + .notifier_call = slp_s0_check_prepare, > +}; > + > static void __setup_broadcast_timer(bool on) > { > if (on) > @@ -1454,6 +1570,13 @@ static int __init intel_idle_init(void) > goto init_driver_fail; > } > > + retval = register_pm_notifier(&intel_slp_s0_check_nb); > + if (retval) { > + free_percpu(intel_idle_cpuidle_devices); > + cpuidle_unregister_driver(&intel_idle_driver); > + goto pm_nb_fail; > + } > + > if (boot_cpu_has(X86_FEATURE_ARAT)) /* Always Reliable APIC Timer */ > lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE; > > @@ -1469,6 +1592,8 @@ static int __init intel_idle_init(void) > > hp_setup_fail: > intel_idle_cpuidle_devices_uninit(); > + unregister_pm_notifier(&intel_slp_s0_check_nb); > +pm_nb_fail: > cpuidle_unregister_driver(&intel_idle_driver); > init_driver_fail: > free_percpu(intel_idle_cpuidle_devices); > @@ -1484,3 +1609,4 @@ device_initcall(intel_idle_init); > * is the easiest way (currently) to continue doing that. 
> */
> module_param(max_cstate, int, 0444);
> +module_param(slp_s0_check, bool, 0644);

This has to be documented somehow.

Also, if it is not set, there is a useless overhead every time
intel_idle_freeze_and_check() is called.  It looks like you could use
a static key or similar to avoid that.

Moreover, the notifier is not necessary then as well.

Thanks,
Rafael
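For reference, the static-key idea mentioned above could look roughly like
this; the parameter-ops plumbing and the key name are illustrative and not
part of the posted patch, while intel_idle_freeze() is the driver's existing
callback.

#include <linux/jump_label.h>

static DEFINE_STATIC_KEY_FALSE(slp_s0_check_enabled);

static int intel_idle_freeze_and_check(struct cpuidle_device *dev,
				       struct cpuidle_driver *drv, int index)
{
	/* Patched out at runtime while the check is disabled: near-zero cost. */
	if (!static_branch_unlikely(&slp_s0_check_enabled))
		return intel_idle_freeze(dev, drv, index);

	/* ... SLP_S0 bookkeeping from the patch would go here ... */
	return intel_idle_freeze(dev, drv, index);
}

/* Flip the key when the module parameter changes (illustrative). */
static int slp_s0_check_set(const char *val, const struct kernel_param *kp)
{
	bool enable;
	int ret = kstrtobool(val, &enable);

	if (ret)
		return ret;
	if (enable)
		static_branch_enable(&slp_s0_check_enabled);
	else
		static_branch_disable(&slp_s0_check_enabled);
	return 0;
}

With something like this in place, the PM notifier can also stay unused while
the check is disabled, as noted above.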
On Mon, Jul 10, 2017 at 6:33 AM, Rafael J. Wysocki <rjw@rjwysocki.net> wrote: > On Friday, July 07, 2017 05:03:03 PM Derek Basehore wrote: >> This adds validation of S0ix entry and enables it on Skylake. Using >> the new tick_set_freeze_event function, we program the CPU to wake up >> X seconds after entering freeze. After X seconds, it will wake the CPU >> to check the S0ix residency counters and make sure we entered the >> lowest power state for suspend-to-idle. >> >> It exits freeze and reports an error to userspace when the SoC does >> not enter S0ix on suspend-to-idle. >> > > Honestly, I'm totally unsure about this ATM, as it seems to assume that it > doesn't make senes to stay suspended if SLP_S0 residency is not there, but > that totally may not be the case. > > First of all, there are systems in which SLP_S0 is related to about 10%-20% of > the total power draw reduction whereas the remaining 80%-90% comes from PC10 > alone. So, if you can get to PC10, being unable to get SLP_S0 residency on top > of that may not be a big deal after all. Of course, you may argue that 10%-20% > of battery life while suspended is "a lot", but that really depends on the > possible alternatives. > We'd have to track actual PC10 residency instead of checking if it's the requested state since the SoC can enter a higher power package cstate even if PC10 is requested. I think this can be done by reading an msr register, though. Is there an example of how PC10 can be entered without SLP_S0 getting asserted by the way? Also, this feature is disabled by default, so it doesn't prevent these use cases. > Second, as far as the alternatives go, it may not be rosy, because there are > systems that don't support S3 (or any other ACPI sleep states at all for that > matter) and suspend-to-idle is the only suspend mechanism available there. > On those systems it still may make sense to use it even though it may not > reduce the power draw that much. And from some experiments, suspend-to-idle > still extends battery life by 100% over runtime idle even if the system is not > able to get to PC10 most of the time. This is off by default. > > While I understand the use case, I don't think it is a binary "yes"-"no" thing > and the focus on just SLP_S0 may be misguided. Do you have a preference such as being able to set the level that you want to validate to? For instance, there could be an option to check that SLP_So is asserted, but there could also be an option to check for PC9 or PC10 residency. For instance, there could be a module parameters for setting the validated state: available_suspend_to_idle_states: "none pc6 pc9 pc10 slp_s0" max_suspend_to_idle_state: "none" Where the default validated state is none, but it can be set to any of the states in available_suspend_to_idle_states > >> One example of a bug that can prevent a Skylake CPU from entering S0ix >> (suspend-to-idle) is a leaked reference count to one of the i915 power > > Suspend-to-idle is not S0ix. Suspend-to-idle is the state the kernel puts the > system into after echoing "freeze" to /sys/power/state and S0ix is a platform > power state that may or may not be entered as a result of that. > >> wells. The CPU will not be able to enter Package C10 and will >> therefore use about 4x as much power for the entire system. The issue >> is not specific to the i915 power wells though. > > Well, fair enough, but what if the SoC can enter PC10, but without SLP_S0 > residency on top of it? 
> >> Signed-off-by: Derek Basehore <dbasehore@chromium.org> >> --- >> drivers/idle/intel_idle.c | 142 +++++++++++++++++++++++++++++++++++++++++++--- >> 1 file changed, 134 insertions(+), 8 deletions(-) >> >> diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c >> index ebed3f804291..d38621da6e54 100644 >> --- a/drivers/idle/intel_idle.c >> +++ b/drivers/idle/intel_idle.c >> @@ -61,10 +61,12 @@ >> #include <linux/notifier.h> >> #include <linux/cpu.h> >> #include <linux/moduleparam.h> >> +#include <linux/suspend.h> >> #include <asm/cpu_device_id.h> >> #include <asm/intel-family.h> >> #include <asm/mwait.h> >> #include <asm/msr.h> >> +#include <asm/pmc_core.h> >> >> #define INTEL_IDLE_VERSION "0.4.1" >> >> @@ -93,12 +95,29 @@ struct idle_cpu { >> bool disable_promotion_to_c1e; >> }; >> >> +/* >> + * The limit for the exponential backoff for the freeze duration. At this point, >> + * power impact is is far from measurable. It's about 3uW based on scaling from >> + * waking up 10 times a second. >> + */ >> +#define MAX_SLP_S0_SECONDS 1000 >> +#define SLP_S0_EXP_BASE 10 >> + >> +static bool slp_s0_check; >> +static unsigned int slp_s0_seconds; >> + >> +static DEFINE_SPINLOCK(slp_s0_check_lock); >> +static unsigned int slp_s0_num_cpus; >> +static bool slp_s0_check_inprogress; >> + >> static const struct idle_cpu *icpu; >> static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; >> static int intel_idle(struct cpuidle_device *dev, >> struct cpuidle_driver *drv, int index); >> static int intel_idle_freeze(struct cpuidle_device *dev, >> struct cpuidle_driver *drv, int index); >> +static int intel_idle_freeze_and_check(struct cpuidle_device *dev, >> + struct cpuidle_driver *drv, int index); >> static struct cpuidle_state *cpuidle_state_table; >> >> /* >> @@ -597,7 +616,7 @@ static struct cpuidle_state skl_cstates[] = { >> .exit_latency = 2, >> .target_residency = 2, >> .enter = &intel_idle, >> - .enter_freeze = intel_idle_freeze, }, >> + .enter_freeze = intel_idle_freeze_and_check, }, > > Why do you do this for anything lower than C6? In that case, it's probably best the fail in these cases if the check is enabled. The CPU can't enter a lower cstate than the hinted one, correct? 
> >> { >> .name = "C1E", >> .desc = "MWAIT 0x01", >> @@ -605,7 +624,7 @@ static struct cpuidle_state skl_cstates[] = { >> .exit_latency = 10, >> .target_residency = 20, >> .enter = &intel_idle, >> - .enter_freeze = intel_idle_freeze, }, >> + .enter_freeze = intel_idle_freeze_and_check, }, >> { >> .name = "C3", >> .desc = "MWAIT 0x10", >> @@ -613,7 +632,7 @@ static struct cpuidle_state skl_cstates[] = { >> .exit_latency = 70, >> .target_residency = 100, >> .enter = &intel_idle, >> - .enter_freeze = intel_idle_freeze, }, >> + .enter_freeze = intel_idle_freeze_and_check, }, >> { >> .name = "C6", >> .desc = "MWAIT 0x20", >> @@ -621,7 +640,7 @@ static struct cpuidle_state skl_cstates[] = { >> .exit_latency = 85, >> .target_residency = 200, >> .enter = &intel_idle, >> - .enter_freeze = intel_idle_freeze, }, >> + .enter_freeze = intel_idle_freeze_and_check, }, >> { >> .name = "C7s", >> .desc = "MWAIT 0x33", >> @@ -629,7 +648,7 @@ static struct cpuidle_state skl_cstates[] = { >> .exit_latency = 124, >> .target_residency = 800, >> .enter = &intel_idle, >> - .enter_freeze = intel_idle_freeze, }, >> + .enter_freeze = intel_idle_freeze_and_check, }, >> { >> .name = "C8", >> .desc = "MWAIT 0x40", >> @@ -637,7 +656,7 @@ static struct cpuidle_state skl_cstates[] = { >> .exit_latency = 200, >> .target_residency = 800, >> .enter = &intel_idle, >> - .enter_freeze = intel_idle_freeze, }, >> + .enter_freeze = intel_idle_freeze_and_check, }, >> { >> .name = "C9", >> .desc = "MWAIT 0x50", >> @@ -645,7 +664,7 @@ static struct cpuidle_state skl_cstates[] = { >> .exit_latency = 480, >> .target_residency = 5000, >> .enter = &intel_idle, >> - .enter_freeze = intel_idle_freeze, }, >> + .enter_freeze = intel_idle_freeze_and_check, }, >> { >> .name = "C10", >> .desc = "MWAIT 0x60", >> @@ -653,7 +672,7 @@ static struct cpuidle_state skl_cstates[] = { >> .exit_latency = 890, >> .target_residency = 5000, >> .enter = &intel_idle, >> - .enter_freeze = intel_idle_freeze, }, >> + .enter_freeze = intel_idle_freeze_and_check, }, >> { >> .enter = NULL } >> }; >> @@ -940,6 +959,8 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev, >> * @dev: cpuidle_device >> * @drv: cpuidle driver >> * @index: state index >> + * >> + * @return 0 for success, no failure state >> */ >> static int intel_idle_freeze(struct cpuidle_device *dev, >> struct cpuidle_driver *drv, int index) >> @@ -952,6 +973,101 @@ static int intel_idle_freeze(struct cpuidle_device *dev, >> return 0; >> } >> >> +static int check_slp_s0(u32 slp_s0_saved_count) >> +{ >> + u32 slp_s0_new_count; >> + >> + if (intel_pmc_slp_s0_counter_read(&slp_s0_new_count)) { >> + pr_warn("Unable to read SLP S0 residency counter\n"); >> + return -EIO; >> + } >> + >> + if (slp_s0_saved_count == slp_s0_new_count) { >> + pr_warn("CPU did not enter SLP S0 for suspend-to-idle.\n"); >> + return -EIO; >> + } >> + >> + return 0; >> +} >> + >> +/** >> + * intel_idle_freeze_and_check - enters suspend-to-idle and validates the power >> + * state >> + * >> + * This function enters suspend-to-idle with intel_idle_freeze, but also sets up >> + * a timer to check that S0ix (low power state for suspend-to-idle on Intel >> + * CPUs) is properly entered. >> + * >> + * @dev: cpuidle_device >> + * @drv: cpuidle_driver >> + * @index: state index >> + * @return 0 for success, -EERROR if S0ix was not entered. 
>> + */ >> +static int intel_idle_freeze_and_check(struct cpuidle_device *dev, >> + struct cpuidle_driver *drv, int index) >> +{ >> + bool check_on_this_cpu = false; >> + u32 slp_s0_saved_count; >> + unsigned long flags; >> + int cpu = smp_processor_id(); >> + int ret; >> + >> + /* The last CPU to freeze sets up checking SLP S0 assertion. */ >> + spin_lock_irqsave(&slp_s0_check_lock, flags); >> + slp_s0_num_cpus++; >> + if (slp_s0_seconds && >> + slp_s0_num_cpus == num_online_cpus() && >> + !slp_s0_check_inprogress && >> + !intel_pmc_slp_s0_counter_read(&slp_s0_saved_count)) { >> + ret = tick_set_freeze_event(cpu, ktime_set(slp_s0_seconds, 0)); >> + if (ret < 0) { >> + spin_unlock_irqrestore(&slp_s0_check_lock, flags); >> + goto out; >> + } >> + >> + /* >> + * Make sure check_slp_s0 isn't scheduled on another CPU if it >> + * were to leave freeze and enter it again before this CPU >> + * leaves freeze. >> + */ >> + slp_s0_check_inprogress = true; >> + check_on_this_cpu = true; >> + } >> + spin_unlock_irqrestore(&slp_s0_check_lock, flags); >> + >> + ret = intel_idle_freeze(dev, drv, index); >> + if (ret < 0) >> + goto out; >> + >> + if (check_on_this_cpu && tick_clear_freeze_event(cpu)) >> + ret = check_slp_s0(slp_s0_saved_count); >> + >> +out: >> + spin_lock_irqsave(&slp_s0_check_lock, flags); >> + if (check_on_this_cpu) { >> + slp_s0_check_inprogress = false; >> + slp_s0_seconds = min_t(unsigned int, >> + SLP_S0_EXP_BASE * slp_s0_seconds, >> + MAX_SLP_S0_SECONDS); >> + } >> + slp_s0_num_cpus--; >> + spin_unlock_irqrestore(&slp_s0_check_lock, flags); >> + return ret; >> +} >> + >> +static int slp_s0_check_prepare(struct notifier_block *nb, unsigned long action, >> + void *data) >> +{ >> + if (action == PM_SUSPEND_PREPARE) >> + slp_s0_seconds = slp_s0_check ? 1 : 0; >> + >> + return NOTIFY_DONE; >> +} >> + >> +static struct notifier_block intel_slp_s0_check_nb = { >> + .notifier_call = slp_s0_check_prepare, >> +}; >> + >> static void __setup_broadcast_timer(bool on) >> { >> if (on) >> @@ -1454,6 +1570,13 @@ static int __init intel_idle_init(void) >> goto init_driver_fail; >> } >> >> + retval = register_pm_notifier(&intel_slp_s0_check_nb); >> + if (retval) { >> + free_percpu(intel_idle_cpuidle_devices); >> + cpuidle_unregister_driver(&intel_idle_driver); >> + goto pm_nb_fail; >> + } >> + >> if (boot_cpu_has(X86_FEATURE_ARAT)) /* Always Reliable APIC Timer */ >> lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE; >> >> @@ -1469,6 +1592,8 @@ static int __init intel_idle_init(void) >> >> hp_setup_fail: >> intel_idle_cpuidle_devices_uninit(); >> + unregister_pm_notifier(&intel_slp_s0_check_nb); >> +pm_nb_fail: >> cpuidle_unregister_driver(&intel_idle_driver); >> init_driver_fail: >> free_percpu(intel_idle_cpuidle_devices); >> @@ -1484,3 +1609,4 @@ device_initcall(intel_idle_init); >> * is the easiest way (currently) to continue doing that. >> */ >> module_param(max_cstate, int, 0444); >> +module_param(slp_s0_check, bool, 0644); > > This has to be documented somehow. > > Also, if it is not set, there is a useless overhead every time > intel_idle_freeze_and_check() is called. It looks like you could use > a static key or similar to avoid that. > > Moreover, the notifier is not necessary then as well. > > Thanks, > Rafael >
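A rough sketch of how the state-selection parameter proposed earlier in this
reply might look; the parameter name, the enum and the accepted strings mirror
the proposal above and are purely illustrative, not an existing interface.

enum s2idle_check_level {
	S2IDLE_CHECK_NONE,
	S2IDLE_CHECK_PC10,
	S2IDLE_CHECK_SLP_S0,
};

static const char * const s2idle_check_names[] = {
	[S2IDLE_CHECK_NONE]   = "none",
	[S2IDLE_CHECK_PC10]   = "pc10",
	[S2IDLE_CHECK_SLP_S0] = "slp_s0",
};

static enum s2idle_check_level s2idle_check_level;	/* defaults to "none" */

static int s2idle_check_set(const char *val, const struct kernel_param *kp)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(s2idle_check_names); i++) {
		if (sysfs_streq(val, s2idle_check_names[i])) {
			s2idle_check_level = i;
			return 0;
		}
	}
	return -EINVAL;
}

static int s2idle_check_get(char *buffer, const struct kernel_param *kp)
{
	return sprintf(buffer, "%s\n", s2idle_check_names[s2idle_check_level]);
}

static const struct kernel_param_ops s2idle_check_ops = {
	.set = s2idle_check_set,
	.get = s2idle_check_get,
};
module_param_cb(max_suspend_to_idle_state, &s2idle_check_ops, NULL, 0644);

A read-only available_suspend_to_idle_states parameter listing the same
strings could accompany it, as in the proposal.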
On Monday, July 10, 2017 02:57:48 PM dbasehore . wrote: > On Mon, Jul 10, 2017 at 6:33 AM, Rafael J. Wysocki <rjw@rjwysocki.net> wrote: > > On Friday, July 07, 2017 05:03:03 PM Derek Basehore wrote: > >> This adds validation of S0ix entry and enables it on Skylake. Using > >> the new tick_set_freeze_event function, we program the CPU to wake up > >> X seconds after entering freeze. After X seconds, it will wake the CPU > >> to check the S0ix residency counters and make sure we entered the > >> lowest power state for suspend-to-idle. > >> > >> It exits freeze and reports an error to userspace when the SoC does > >> not enter S0ix on suspend-to-idle. > >> > > > > Honestly, I'm totally unsure about this ATM, as it seems to assume that it > > doesn't make senes to stay suspended if SLP_S0 residency is not there, but > > that totally may not be the case. > > > > First of all, there are systems in which SLP_S0 is related to about 10%-20% of > > the total power draw reduction whereas the remaining 80%-90% comes from PC10 > > alone. So, if you can get to PC10, being unable to get SLP_S0 residency on top > > of that may not be a big deal after all. Of course, you may argue that 10%-20% > > of battery life while suspended is "a lot", but that really depends on the > > possible alternatives. > > > > We'd have to track actual PC10 residency instead of checking if it's > the requested state since the SoC can enter a higher power package > cstate even if PC10 is requested. That's correct, but it should be sufficient to check the PC10 residency (there's some code to do that in turbostat, for example). > I think this can be done by reading > an msr register, though. Is there an example of how PC10 can be > entered without SLP_S0 getting asserted by the way? Yes, there is. PC10 is a power state of the north complex and it can be entered regardless of the SLP_S0 status which is related to the south complex. > Also, this feature is disabled by default, so it doesn't prevent these > use cases. > > > Second, as far as the alternatives go, it may not be rosy, because there are > > systems that don't support S3 (or any other ACPI sleep states at all for that > > matter) and suspend-to-idle is the only suspend mechanism available there. > > On those systems it still may make sense to use it even though it may not > > reduce the power draw that much. And from some experiments, suspend-to-idle > > still extends battery life by 100% over runtime idle even if the system is not > > able to get to PC10 most of the time. > > This is off by default. Fair enough, but even so it may not be very useful in general as is. > > > > While I understand the use case, I don't think it is a binary "yes"-"no" thing > > and the focus on just SLP_S0 may be misguided. > > Do you have a preference such as being able to set the level that you > want to validate to? For instance, there could be an option to check > that SLP_So is asserted, but there could also be an option to check > for PC9 or PC10 residency. For instance, there could be a module > parameters for setting the validated state: > > available_suspend_to_idle_states: > "none pc6 pc9 pc10 slp_s0" > > max_suspend_to_idle_state: > "none" > > Where the default validated state is none, but it can be set to any of > the states in available_suspend_to_idle_states In the suspend-to-idle path the driver will always request the deepest state available (C10 for Skylake) and I would validate the associated package state by default plus optionally SLP_S0. 
> > > >> One example of a bug that can prevent a Skylake CPU from entering S0ix > >> (suspend-to-idle) is a leaked reference count to one of the i915 power > > > > Suspend-to-idle is not S0ix. Suspend-to-idle is the state the kernel puts the > > system into after echoing "freeze" to /sys/power/state and S0ix is a platform > > power state that may or may not be entered as a result of that. > > > >> wells. The CPU will not be able to enter Package C10 and will > >> therefore use about 4x as much power for the entire system. The issue > >> is not specific to the i915 power wells though. > > > > Well, fair enough, but what if the SoC can enter PC10, but without SLP_S0 > > residency on top of it? > > > >> Signed-off-by: Derek Basehore <dbasehore@chromium.org> > >> --- > >> drivers/idle/intel_idle.c | 142 +++++++++++++++++++++++++++++++++++++++++++--- > >> 1 file changed, 134 insertions(+), 8 deletions(-) > >> > >> diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c > >> index ebed3f804291..d38621da6e54 100644 > >> --- a/drivers/idle/intel_idle.c > >> +++ b/drivers/idle/intel_idle.c > >> @@ -61,10 +61,12 @@ > >> #include <linux/notifier.h> > >> #include <linux/cpu.h> > >> #include <linux/moduleparam.h> > >> +#include <linux/suspend.h> > >> #include <asm/cpu_device_id.h> > >> #include <asm/intel-family.h> > >> #include <asm/mwait.h> > >> #include <asm/msr.h> > >> +#include <asm/pmc_core.h> > >> > >> #define INTEL_IDLE_VERSION "0.4.1" > >> > >> @@ -93,12 +95,29 @@ struct idle_cpu { > >> bool disable_promotion_to_c1e; > >> }; > >> > >> +/* > >> + * The limit for the exponential backoff for the freeze duration. At this point, > >> + * power impact is is far from measurable. It's about 3uW based on scaling from > >> + * waking up 10 times a second. > >> + */ > >> +#define MAX_SLP_S0_SECONDS 1000 > >> +#define SLP_S0_EXP_BASE 10 > >> + > >> +static bool slp_s0_check; > >> +static unsigned int slp_s0_seconds; > >> + > >> +static DEFINE_SPINLOCK(slp_s0_check_lock); > >> +static unsigned int slp_s0_num_cpus; > >> +static bool slp_s0_check_inprogress; > >> + > >> static const struct idle_cpu *icpu; > >> static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; > >> static int intel_idle(struct cpuidle_device *dev, > >> struct cpuidle_driver *drv, int index); > >> static int intel_idle_freeze(struct cpuidle_device *dev, > >> struct cpuidle_driver *drv, int index); > >> +static int intel_idle_freeze_and_check(struct cpuidle_device *dev, > >> + struct cpuidle_driver *drv, int index); > >> static struct cpuidle_state *cpuidle_state_table; > >> > >> /* > >> @@ -597,7 +616,7 @@ static struct cpuidle_state skl_cstates[] = { > >> .exit_latency = 2, > >> .target_residency = 2, > >> .enter = &intel_idle, > >> - .enter_freeze = intel_idle_freeze, }, > >> + .enter_freeze = intel_idle_freeze_and_check, }, > > > > Why do you do this for anything lower than C6? > > In that case, it's probably best the fail in these cases if the check > is enabled. The CPU can't enter a lower cstate than the hinted one, > correct? Yes, it can, but this is based on the hint and not on the entered state. :-) There is some gray area related to what if the user disabled the deepest state via sysfs, but other than that the check only needs to be made in the deepest state's callback (because that's what will be requested in the suspend-to-idle path). Thanks, Rafael
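For completeness, the package C10 residency check discussed above can be
based on the same counter turbostat reads; a minimal sketch, assuming
MSR_PKG_C10_RESIDENCY from <asm/msr-index.h> is implemented on the platform
(it is model-specific) and that the caller snapshots the counter before
entering freeze.

#include <asm/msr.h>
#include <asm/msr-index.h>

/* Returns 0 if the package C10 counter advanced across the freeze period. */
static int check_pc10_residency(u64 before)
{
	u64 after;

	if (rdmsrl_safe(MSR_PKG_C10_RESIDENCY, &after))
		return -EIO;		/* MSR not implemented on this CPU */

	if (after == before) {
		pr_warn("intel_idle: package C10 not reached during suspend-to-idle\n");
		return -EIO;
	}

	return 0;
}

turbostat derives its pkg%pc10 numbers from the same counter, which makes it
easy to cross-check the result from userspace.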
On Mon, Jul 10, 2017 at 3:09 PM, Rafael J. Wysocki <rjw@rjwysocki.net> wrote: > On Monday, July 10, 2017 02:57:48 PM dbasehore . wrote: >> On Mon, Jul 10, 2017 at 6:33 AM, Rafael J. Wysocki <rjw@rjwysocki.net> wrote: >> > On Friday, July 07, 2017 05:03:03 PM Derek Basehore wrote: >> >> This adds validation of S0ix entry and enables it on Skylake. Using >> >> the new tick_set_freeze_event function, we program the CPU to wake up >> >> X seconds after entering freeze. After X seconds, it will wake the CPU >> >> to check the S0ix residency counters and make sure we entered the >> >> lowest power state for suspend-to-idle. >> >> >> >> It exits freeze and reports an error to userspace when the SoC does >> >> not enter S0ix on suspend-to-idle. >> >> >> > >> > Honestly, I'm totally unsure about this ATM, as it seems to assume that it >> > doesn't make senes to stay suspended if SLP_S0 residency is not there, but >> > that totally may not be the case. >> > >> > First of all, there are systems in which SLP_S0 is related to about 10%-20% of >> > the total power draw reduction whereas the remaining 80%-90% comes from PC10 >> > alone. So, if you can get to PC10, being unable to get SLP_S0 residency on top >> > of that may not be a big deal after all. Of course, you may argue that 10%-20% >> > of battery life while suspended is "a lot", but that really depends on the >> > possible alternatives. >> > >> >> We'd have to track actual PC10 residency instead of checking if it's >> the requested state since the SoC can enter a higher power package >> cstate even if PC10 is requested. > > That's correct, but it should be sufficient to check the PC10 residency > (there's some code to do that in turbostat, for example). > >> I think this can be done by reading >> an msr register, though. Is there an example of how PC10 can be >> entered without SLP_S0 getting asserted by the way? > > Yes, there is. > > PC10 is a power state of the north complex and it can be entered regardless > of the SLP_S0 status which is related to the south complex. > >> Also, this feature is disabled by default, so it doesn't prevent these >> use cases. >> >> > Second, as far as the alternatives go, it may not be rosy, because there are >> > systems that don't support S3 (or any other ACPI sleep states at all for that >> > matter) and suspend-to-idle is the only suspend mechanism available there. >> > On those systems it still may make sense to use it even though it may not >> > reduce the power draw that much. And from some experiments, suspend-to-idle >> > still extends battery life by 100% over runtime idle even if the system is not >> > able to get to PC10 most of the time. >> >> This is off by default. > > Fair enough, but even so it may not be very useful in general as is. > >> > >> > While I understand the use case, I don't think it is a binary "yes"-"no" thing >> > and the focus on just SLP_S0 may be misguided. >> >> Do you have a preference such as being able to set the level that you >> want to validate to? For instance, there could be an option to check >> that SLP_So is asserted, but there could also be an option to check >> for PC9 or PC10 residency. 
For instance, there could be a module >> parameters for setting the validated state: >> >> available_suspend_to_idle_states: >> "none pc6 pc9 pc10 slp_s0" >> >> max_suspend_to_idle_state: >> "none" >> >> Where the default validated state is none, but it can be set to any of >> the states in available_suspend_to_idle_states > > In the suspend-to-idle path the driver will always request the deepest state > available (C10 for Skylake) and I would validate the associated package state > by default plus optionally SLP_S0. Should package state validation be enabled by default and should the user be able to disable it? > >> > >> >> One example of a bug that can prevent a Skylake CPU from entering S0ix >> >> (suspend-to-idle) is a leaked reference count to one of the i915 power >> > >> > Suspend-to-idle is not S0ix. Suspend-to-idle is the state the kernel puts the >> > system into after echoing "freeze" to /sys/power/state and S0ix is a platform >> > power state that may or may not be entered as a result of that. >> > >> >> wells. The CPU will not be able to enter Package C10 and will >> >> therefore use about 4x as much power for the entire system. The issue >> >> is not specific to the i915 power wells though. >> > >> > Well, fair enough, but what if the SoC can enter PC10, but without SLP_S0 >> > residency on top of it? >> > >> >> Signed-off-by: Derek Basehore <dbasehore@chromium.org> >> >> --- >> >> drivers/idle/intel_idle.c | 142 +++++++++++++++++++++++++++++++++++++++++++--- >> >> 1 file changed, 134 insertions(+), 8 deletions(-) >> >> >> >> diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c >> >> index ebed3f804291..d38621da6e54 100644 >> >> --- a/drivers/idle/intel_idle.c >> >> +++ b/drivers/idle/intel_idle.c >> >> @@ -61,10 +61,12 @@ >> >> #include <linux/notifier.h> >> >> #include <linux/cpu.h> >> >> #include <linux/moduleparam.h> >> >> +#include <linux/suspend.h> >> >> #include <asm/cpu_device_id.h> >> >> #include <asm/intel-family.h> >> >> #include <asm/mwait.h> >> >> #include <asm/msr.h> >> >> +#include <asm/pmc_core.h> >> >> >> >> #define INTEL_IDLE_VERSION "0.4.1" >> >> >> >> @@ -93,12 +95,29 @@ struct idle_cpu { >> >> bool disable_promotion_to_c1e; >> >> }; >> >> >> >> +/* >> >> + * The limit for the exponential backoff for the freeze duration. At this point, >> >> + * power impact is is far from measurable. It's about 3uW based on scaling from >> >> + * waking up 10 times a second. 
>> >> + */ >> >> +#define MAX_SLP_S0_SECONDS 1000 >> >> +#define SLP_S0_EXP_BASE 10 >> >> + >> >> +static bool slp_s0_check; >> >> +static unsigned int slp_s0_seconds; >> >> + >> >> +static DEFINE_SPINLOCK(slp_s0_check_lock); >> >> +static unsigned int slp_s0_num_cpus; >> >> +static bool slp_s0_check_inprogress; >> >> + >> >> static const struct idle_cpu *icpu; >> >> static struct cpuidle_device __percpu *intel_idle_cpuidle_devices; >> >> static int intel_idle(struct cpuidle_device *dev, >> >> struct cpuidle_driver *drv, int index); >> >> static int intel_idle_freeze(struct cpuidle_device *dev, >> >> struct cpuidle_driver *drv, int index); >> >> +static int intel_idle_freeze_and_check(struct cpuidle_device *dev, >> >> + struct cpuidle_driver *drv, int index); >> >> static struct cpuidle_state *cpuidle_state_table; >> >> >> >> /* >> >> @@ -597,7 +616,7 @@ static struct cpuidle_state skl_cstates[] = { >> >> .exit_latency = 2, >> >> .target_residency = 2, >> >> .enter = &intel_idle, >> >> - .enter_freeze = intel_idle_freeze, }, >> >> + .enter_freeze = intel_idle_freeze_and_check, }, >> > >> > Why do you do this for anything lower than C6? >> >> In that case, it's probably best the fail in these cases if the check >> is enabled. The CPU can't enter a lower cstate than the hinted one, >> correct? > > Yes, it can, but this is based on the hint and not on the entered state. :-) > > There is some gray area related to what if the user disabled the deepest state > via sysfs, but other than that the check only needs to be made in the deepest > state's callback (because that's what will be requested in the suspend-to-idle > path). > > Thanks, > Rafael >
On Monday, July 10, 2017 03:24:14 PM dbasehore . wrote: > On Mon, Jul 10, 2017 at 3:09 PM, Rafael J. Wysocki <rjw@rjwysocki.net> wrote: > > On Monday, July 10, 2017 02:57:48 PM dbasehore . wrote: > >> On Mon, Jul 10, 2017 at 6:33 AM, Rafael J. Wysocki <rjw@rjwysocki.net> wrote: > >> > On Friday, July 07, 2017 05:03:03 PM Derek Basehore wrote: > >> >> This adds validation of S0ix entry and enables it on Skylake. Using > >> >> the new tick_set_freeze_event function, we program the CPU to wake up > >> >> X seconds after entering freeze. After X seconds, it will wake the CPU > >> >> to check the S0ix residency counters and make sure we entered the > >> >> lowest power state for suspend-to-idle. > >> >> > >> >> It exits freeze and reports an error to userspace when the SoC does > >> >> not enter S0ix on suspend-to-idle. > >> >> > >> > > >> > Honestly, I'm totally unsure about this ATM, as it seems to assume that it > >> > doesn't make senes to stay suspended if SLP_S0 residency is not there, but > >> > that totally may not be the case. > >> > > >> > First of all, there are systems in which SLP_S0 is related to about 10%-20% of > >> > the total power draw reduction whereas the remaining 80%-90% comes from PC10 > >> > alone. So, if you can get to PC10, being unable to get SLP_S0 residency on top > >> > of that may not be a big deal after all. Of course, you may argue that 10%-20% > >> > of battery life while suspended is "a lot", but that really depends on the > >> > possible alternatives. > >> > > >> > >> We'd have to track actual PC10 residency instead of checking if it's > >> the requested state since the SoC can enter a higher power package > >> cstate even if PC10 is requested. > > > > That's correct, but it should be sufficient to check the PC10 residency > > (there's some code to do that in turbostat, for example). > > > >> I think this can be done by reading > >> an msr register, though. Is there an example of how PC10 can be > >> entered without SLP_S0 getting asserted by the way? > > > > Yes, there is. > > > > PC10 is a power state of the north complex and it can be entered regardless > > of the SLP_S0 status which is related to the south complex. > > > >> Also, this feature is disabled by default, so it doesn't prevent these > >> use cases. > >> > >> > Second, as far as the alternatives go, it may not be rosy, because there are > >> > systems that don't support S3 (or any other ACPI sleep states at all for that > >> > matter) and suspend-to-idle is the only suspend mechanism available there. > >> > On those systems it still may make sense to use it even though it may not > >> > reduce the power draw that much. And from some experiments, suspend-to-idle > >> > still extends battery life by 100% over runtime idle even if the system is not > >> > able to get to PC10 most of the time. > >> > >> This is off by default. > > > > Fair enough, but even so it may not be very useful in general as is. > > > >> > > >> > While I understand the use case, I don't think it is a binary "yes"-"no" thing > >> > and the focus on just SLP_S0 may be misguided. > >> > >> Do you have a preference such as being able to set the level that you > >> want to validate to? For instance, there could be an option to check > >> that SLP_So is asserted, but there could also be an option to check > >> for PC9 or PC10 residency. 
For instance, there could be a module
> >> parameters for setting the validated state:
> >>
> >> available_suspend_to_idle_states:
> >> "none pc6 pc9 pc10 slp_s0"
> >>
> >> max_suspend_to_idle_state:
> >> "none"
> >>
> >> Where the default validated state is none, but it can be set to any of
> >> the states in available_suspend_to_idle_states
> >
> > In the suspend-to-idle path the driver will always request the deepest state
> > available (C10 for Skylake) and I would validate the associated package state
> > by default plus optionally SLP_S0.
>
> Should package state validation be enabled by default and should the
> user be able to disable it?

IMO the whole validation should depend on a command line option (in case this
is a system without S3 and the user has no choice really), but it should check
the package state residency in the first place.

I guess this means I would make it a "bail out if you can't go as deep as X"
with X possibly equal to "the deepest package state" or "SLP_S0".

Thanks,
Rafael
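The "bail out if you can't go as deep as X" policy described above could then
reduce to a small decision helper on wakeup; again purely illustrative,
reusing the hypothetical check level from the earlier sketch, with the
*_advanced flags assumed to come from counter snapshots taken around the
freeze period.

static int s2idle_depth_ok(enum s2idle_check_level level,
			   bool pc10_advanced, bool slp_s0_advanced)
{
	switch (level) {
	case S2IDLE_CHECK_SLP_S0:
		if (!slp_s0_advanced)
			return -EIO;
		/* fall through - SLP_S0 only makes sense on top of PC10 */
	case S2IDLE_CHECK_PC10:
		if (!pc10_advanced)
			return -EIO;
		break;
	case S2IDLE_CHECK_NONE:
	default:
		break;
	}
	return 0;	/* reached the requested depth, stay suspended */
}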
I acknowledge the specific need for this check to assure a great user experience on specific hardware. I also concur with the motivation to make mechanisms general and generic so they can be re-used. However, it isn't clear to me that this check would be used outside of some very specific scenarios, so we may be trying too hard to make it general; the code would be simpler if we focus on those scenarios. We can always make it more general when we have more use cases...
On Fri, 7 Jul 2017, Derek Basehore wrote:
> This adds validation of S0ix entry and enables it on Skylake. Using
> the new tick_set_freeze_event function, we program the CPU to wake up
> X seconds after entering freeze. After X seconds, it will wake the CPU
> to check the S0ix residency counters and make sure we entered the
> lowest power state for suspend-to-idle.
>
> It exits freeze and reports an error to userspace when the SoC does
> not enter S0ix on suspend-to-idle.
>
> One example of a bug that can prevent a Skylake CPU from entering S0ix
> (suspend-to-idle) is a leaked reference count to one of the i915 power
> wells. The CPU will not be able to enter Package C10 and will
> therefore use about 4x as much power for the entire system. The issue
> is not specific to the i915 power wells though.

I really have a hard time understanding the usefulness of this.

All I can see so far is detecting that something went wrong. So if this
happens once per day, all the user gets is a completely useless line in
dmesg. Even if that message comes more often, it still tells only that
something went wrong, without the slightest information about the potential
root cause.

There are more issues with this: If there is a hrtimer scheduled on that
last CPU which enters the idle freeze state and that timer is 10 minutes
away, then the check timer can't be programmed and the system will happily
stay for 10 minutes in some shallow C state without notice. Not really
useful.

You know upfront whether the i915 power wells (or whatever other machinery)
are not powered off to allow the system to enter a specific power state. If
you think hard enough about creating infrastructure which allows you to
register power-related facilities and then check them in that idle freeze
enter state, then you get immediate information WHY this happens and not
just the by-chance notification about the fact that it happened.

I might be missing something, but aside from the issues I have with the
tick/clockevents abuse, this looks like some half-baked ad hoc debugging
aid of dubious value.

Thanks,

	tglx
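The registration infrastructure suggested above might look roughly like the
following; this is an entirely hypothetical API, sketched only to illustrate
the "report WHY, not just that it happened" point, not code from any existing
subsystem.

struct s0ix_constraint {
	const char *name;
	bool (*blocked)(void);		/* true if this facility blocks S0ix */
	struct list_head node;
};

static LIST_HEAD(s0ix_constraints);
static DEFINE_MUTEX(s0ix_constraints_lock);

void s0ix_constraint_register(struct s0ix_constraint *c)
{
	mutex_lock(&s0ix_constraints_lock);
	list_add_tail(&c->node, &s0ix_constraints);
	mutex_unlock(&s0ix_constraints_lock);
}

/* Walked before entering suspend-to-idle: names the offender up front. */
static bool s0ix_constraints_ok(void)
{
	struct s0ix_constraint *c;
	bool ok = true;

	list_for_each_entry(c, &s0ix_constraints, node) {
		if (c->blocked && c->blocked()) {
			pr_warn("S0ix blocked by %s\n", c->name);
			ok = false;
		}
	}
	return ok;
}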
On Wed, Jul 12, 2017 at 3:16 PM, Thomas Gleixner <tglx@linutronix.de> wrote: > On Fri, 7 Jul 2017, Derek Basehore wrote: >> This adds validation of S0ix entry and enables it on Skylake. Using >> the new tick_set_freeze_event function, we program the CPU to wake up >> X seconds after entering freeze. After X seconds, it will wake the CPU >> to check the S0ix residency counters and make sure we entered the >> lowest power state for suspend-to-idle. >> >> It exits freeze and reports an error to userspace when the SoC does >> not enter S0ix on suspend-to-idle. >> >> One example of a bug that can prevent a Skylake CPU from entering S0ix >> (suspend-to-idle) is a leaked reference count to one of the i915 power >> wells. The CPU will not be able to enter Package C10 and will >> therefore use about 4x as much power for the entire system. The issue >> is not specific to the i915 power wells though. > > I really have a hard time to understand the usefulness of this. > > All I can see so far is detecting that something went wrong. So if this > happens once per day all the user gets is a completely useless line in > dmesg. Even if that message comes more often, it still tells only that > something went wrong without the slightest information about the potential > root cause. > > There are more issues with this: If there is a hrtimer scheduled on that > last CPU which enters the idle freeze state and that timer is 10 minutes > away, then the check timer can't be programmed and the system will happily > stay for 10 minutes in some shallow C state without notice. Not really > useful. Are hrtimers not suspended after timekeeping_suspend is called? > > You know upfront whether the i915 power wells (or whatever other machinery) > is not powered off to allow the system to enter a specific power state. If > you think hard enough about creating infrastructure which allows you to > register power related facilities and then check them in that idle freeze > enter state, then you get immediate information WHY this happens and not > just the by chance notification about the fact that it happened. It's not always something that can be checked by software. There was one case where an ordering for powering down audio hardware prevented proper PC10 entry, but there didn't seem to be any way to check that. Hardware watchdogs also have the same lack of clarity, but most if not all desktop and mobile processors ship with one. Overall, this seems to be the best that can be done at this point in freeze, and we can't really rely on every part of the system properly validating it's state in its suspend operation. > > I might be missing something, but aside of the issues I have with the > tick/clockevents abuse, this looks like some half baken ad hoc debugging > aid of dubious value. > > Thanks, > > tglx > > > >
On Wed, Jul 12, 2017 at 3:16 PM, Thomas Gleixner <tglx@linutronix.de> wrote: > On Fri, 7 Jul 2017, Derek Basehore wrote: >> This adds validation of S0ix entry and enables it on Skylake. Using >> the new tick_set_freeze_event function, we program the CPU to wake up >> X seconds after entering freeze. After X seconds, it will wake the CPU >> to check the S0ix residency counters and make sure we entered the >> lowest power state for suspend-to-idle. >> >> It exits freeze and reports an error to userspace when the SoC does >> not enter S0ix on suspend-to-idle. >> >> One example of a bug that can prevent a Skylake CPU from entering S0ix >> (suspend-to-idle) is a leaked reference count to one of the i915 power >> wells. The CPU will not be able to enter Package C10 and will >> therefore use about 4x as much power for the entire system. The issue >> is not specific to the i915 power wells though. > > I really have a hard time to understand the usefulness of this. > > All I can see so far is detecting that something went wrong. So if this > happens once per day all the user gets is a completely useless line in > dmesg. Even if that message comes more often, it still tells only that > something went wrong without the slightest information about the potential > root cause. > > There are more issues with this: If there is a hrtimer scheduled on that > last CPU which enters the idle freeze state and that timer is 10 minutes > away, then the check timer can't be programmed and the system will happily > stay for 10 minutes in some shallow C state without notice. Not really > useful. > > You know upfront whether the i915 power wells (or whatever other machinery) > is not powered off to allow the system to enter a specific power state. If > you think hard enough about creating infrastructure which allows you to > register power related facilities and then check them in that idle freeze > enter state, then you get immediate information WHY this happens and not > just the by chance notification about the fact that it happened. > > I might be missing something, but aside of the issues I have with the > tick/clockevents abuse, this looks like some half baken ad hoc debugging > aid of dubious value. This isn't just for debugging. If we just wanted it for debugging, we could read the slp s0 counter on resume. This is also to fail suspend to idle fast so it can be tried again (or just shutdown the machine) when a user closes the machine for the weekend. > > Thanks, > > tglx > > > >
On Wed, 12 Jul 2017, dbasehore . wrote: > On Wed, Jul 12, 2017 at 3:16 PM, Thomas Gleixner <tglx@linutronix.de> wrote: > > There are more issues with this: If there is a hrtimer scheduled on that > > last CPU which enters the idle freeze state and that timer is 10 minutes > > away, then the check timer can't be programmed and the system will happily > > stay for 10 minutes in some shallow C state without notice. Not really > > useful. > > Are hrtimers not suspended after timekeeping_suspend is called? They are. As I said I forgot about the inner workings and that check for state != shutdown confused me even more, as it just looked like this might be a valid state. > > You know upfront whether the i915 power wells (or whatever other machinery) > > is not powered off to allow the system to enter a specific power state. If > > you think hard enough about creating infrastructure which allows you to > > register power related facilities and then check them in that idle freeze > > enter state, then you get immediate information WHY this happens and not > > just the by chance notification about the fact that it happened. > > It's not always something that can be checked by software. There was > one case where an ordering for powering down audio hardware prevented > proper PC10 entry, but there didn't seem to be any way to check that. > Hardware watchdogs also have the same lack of clarity, but most if not > all desktop and mobile processors ship with one. Overall, this seems > to be the best that can be done at this point in freeze, and we can't > really rely on every part of the system properly validating it's state > in its suspend operation. So if I understand correctly, this is the last resort of catching problems which can't be detected upfront or are caused by a software bug. I'm fine with that, but please explain and document it proper. The current explanation is confusing at best. Thanks, tglx
On Wed, Jul 12, 2017 at 10:11 PM, Thomas Gleixner <tglx@linutronix.de> wrote:
> On Wed, 12 Jul 2017, dbasehore . wrote:
>> On Wed, Jul 12, 2017 at 3:16 PM, Thomas Gleixner <tglx@linutronix.de> wrote:
>> > There are more issues with this: If there is a hrtimer scheduled on that
>> > last CPU which enters the idle freeze state and that timer is 10 minutes
>> > away, then the check timer can't be programmed and the system will happily
>> > stay for 10 minutes in some shallow C state without notice. Not really
>> > useful.
>>
>> Are hrtimers not suspended after timekeeping_suspend is called?
>
> They are. As I said, I forgot about the inner workings, and that check
> for state != shutdown confused me even more, as it just looked like this
> might be a valid state.

Okay. I'll add a comment to clarify this part.

>> > You know upfront whether the i915 power wells (or whatever other machinery)
>> > is not powered off to allow the system to enter a specific power state. If
>> > you think hard enough about creating infrastructure which allows you to
>> > register power related facilities and then check them in that idle freeze
>> > enter state, then you get immediate information WHY this happens and not
>> > just the by chance notification about the fact that it happened.
>>
>> It's not always something that can be checked by software. There was
>> one case where the ordering used to power down audio hardware prevented
>> proper PC10 entry, but there didn't seem to be any way to check for
>> that. Hardware watchdogs also have the same lack of clarity, but most
>> if not all desktop and mobile processors ship with one. Overall, this
>> seems to be the best that can be done at this point in freeze, and we
>> can't really rely on every part of the system properly validating its
>> state in its suspend operation.
>
> So if I understand correctly, this is the last resort for catching
> problems which can't be detected upfront or are caused by a software bug.
>
> I'm fine with that, but please explain and document it properly. The
> current explanation is confusing at best.

Will do.

> Thanks,
>
>         tglx
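[Editor's note] The "last resort" check the thread converges on boils down to comparing the PMC's SLP_S0 residency counter before and after the suspend-to-idle period. The snippet below is a minimal sketch of that comparison, relying only on the intel_pmc_slp_s0_counter_read() helper exported by the intel_pmc_core driver (the same helper the patch uses); the function name slp_s0_residency_advanced() and the warning text are illustrative, not the exact code from the patch.

    /*
     * Illustrative only: return 0 if the SLP_S0 residency counter advanced
     * across the suspend-to-idle period, -EIO otherwise.
     */
    #include <linux/errno.h>
    #include <linux/printk.h>
    #include <asm/pmc_core.h>	/* intel_pmc_slp_s0_counter_read() */

    static int slp_s0_residency_advanced(u32 saved_count)
    {
    	u32 new_count;

    	if (intel_pmc_slp_s0_counter_read(&new_count))
    		return -EIO;	/* counter not available */

    	if (new_count == saved_count) {
    		pr_warn("SLP_S0 residency unchanged; S0ix was not reached\n");
    		return -EIO;
    	}

    	return 0;
    }

The saved_count would be captured just before the last CPU enters freeze, which is what the patch does when it reads the counter into slp_s0_saved_count.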
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index ebed3f804291..d38621da6e54 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -61,10 +61,12 @@
 #include <linux/notifier.h>
 #include <linux/cpu.h>
 #include <linux/moduleparam.h>
+#include <linux/suspend.h>
 #include <asm/cpu_device_id.h>
 #include <asm/intel-family.h>
 #include <asm/mwait.h>
 #include <asm/msr.h>
+#include <asm/pmc_core.h>
 
 #define INTEL_IDLE_VERSION "0.4.1"
 
@@ -93,12 +95,29 @@ struct idle_cpu {
 	bool disable_promotion_to_c1e;
 };
 
+/*
+ * The limit for the exponential backoff of the freeze check interval. At this
+ * point, the power impact is far from measurable. It's about 3uW based on
+ * scaling from waking up 10 times a second.
+ */
+#define MAX_SLP_S0_SECONDS 1000
+#define SLP_S0_EXP_BASE 10
+
+static bool slp_s0_check;
+static unsigned int slp_s0_seconds;
+
+static DEFINE_SPINLOCK(slp_s0_check_lock);
+static unsigned int slp_s0_num_cpus;
+static bool slp_s0_check_inprogress;
+
 static const struct idle_cpu *icpu;
 static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;
 static int intel_idle(struct cpuidle_device *dev,
 			struct cpuidle_driver *drv, int index);
 static int intel_idle_freeze(struct cpuidle_device *dev,
 			     struct cpuidle_driver *drv, int index);
+static int intel_idle_freeze_and_check(struct cpuidle_device *dev,
+				       struct cpuidle_driver *drv, int index);
 static struct cpuidle_state *cpuidle_state_table;
 
 /*
@@ -597,7 +616,7 @@ static struct cpuidle_state skl_cstates[] = {
 		.exit_latency = 2,
 		.target_residency = 2,
 		.enter = &intel_idle,
-		.enter_freeze = intel_idle_freeze, },
+		.enter_freeze = intel_idle_freeze_and_check, },
 	{
 		.name = "C1E",
 		.desc = "MWAIT 0x01",
@@ -605,7 +624,7 @@ static struct cpuidle_state skl_cstates[] = {
 		.exit_latency = 10,
 		.target_residency = 20,
 		.enter = &intel_idle,
-		.enter_freeze = intel_idle_freeze, },
+		.enter_freeze = intel_idle_freeze_and_check, },
 	{
 		.name = "C3",
 		.desc = "MWAIT 0x10",
@@ -613,7 +632,7 @@ static struct cpuidle_state skl_cstates[] = {
 		.exit_latency = 70,
 		.target_residency = 100,
 		.enter = &intel_idle,
-		.enter_freeze = intel_idle_freeze, },
+		.enter_freeze = intel_idle_freeze_and_check, },
 	{
 		.name = "C6",
 		.desc = "MWAIT 0x20",
@@ -621,7 +640,7 @@ static struct cpuidle_state skl_cstates[] = {
 		.exit_latency = 85,
 		.target_residency = 200,
 		.enter = &intel_idle,
-		.enter_freeze = intel_idle_freeze, },
+		.enter_freeze = intel_idle_freeze_and_check, },
 	{
 		.name = "C7s",
 		.desc = "MWAIT 0x33",
@@ -629,7 +648,7 @@ static struct cpuidle_state skl_cstates[] = {
 		.exit_latency = 124,
 		.target_residency = 800,
 		.enter = &intel_idle,
-		.enter_freeze = intel_idle_freeze, },
+		.enter_freeze = intel_idle_freeze_and_check, },
 	{
 		.name = "C8",
 		.desc = "MWAIT 0x40",
@@ -637,7 +656,7 @@ static struct cpuidle_state skl_cstates[] = {
 		.exit_latency = 200,
 		.target_residency = 800,
 		.enter = &intel_idle,
-		.enter_freeze = intel_idle_freeze, },
+		.enter_freeze = intel_idle_freeze_and_check, },
 	{
 		.name = "C9",
 		.desc = "MWAIT 0x50",
@@ -645,7 +664,7 @@ static struct cpuidle_state skl_cstates[] = {
 		.exit_latency = 480,
 		.target_residency = 5000,
 		.enter = &intel_idle,
-		.enter_freeze = intel_idle_freeze, },
+		.enter_freeze = intel_idle_freeze_and_check, },
 	{
 		.name = "C10",
 		.desc = "MWAIT 0x60",
@@ -653,7 +672,7 @@ static struct cpuidle_state skl_cstates[] = {
 		.exit_latency = 890,
 		.target_residency = 5000,
 		.enter = &intel_idle,
-		.enter_freeze = intel_idle_freeze, },
+		.enter_freeze = intel_idle_freeze_and_check, },
 	{
 		.enter = NULL }
 };
@@ -940,6 +959,8 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev,
  * @dev: cpuidle_device
  * @drv: cpuidle driver
  * @index: state index
+ *
+ * @return 0 for success, no failure state
  */
 static int intel_idle_freeze(struct cpuidle_device *dev,
 			     struct cpuidle_driver *drv, int index)
@@ -952,6 +973,101 @@ static int intel_idle_freeze(struct cpuidle_device *dev,
 	return 0;
 }
 
+static int check_slp_s0(u32 slp_s0_saved_count)
+{
+	u32 slp_s0_new_count;
+
+	if (intel_pmc_slp_s0_counter_read(&slp_s0_new_count)) {
+		pr_warn("Unable to read SLP S0 residency counter\n");
+		return -EIO;
+	}
+
+	if (slp_s0_saved_count == slp_s0_new_count) {
+		pr_warn("CPU did not enter SLP S0 for suspend-to-idle.\n");
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/**
+ * intel_idle_freeze_and_check - enters suspend-to-idle and validates the power
+ * state
+ *
+ * This function enters suspend-to-idle with intel_idle_freeze, but also sets up
+ * a timer to check that S0ix (the low power state for suspend-to-idle on Intel
+ * CPUs) is properly entered.
+ *
+ * @dev: cpuidle_device
+ * @drv: cpuidle_driver
+ * @index: state index
+ * @return 0 for success, a negative error code if S0ix was not entered.
+ */
+static int intel_idle_freeze_and_check(struct cpuidle_device *dev,
+				       struct cpuidle_driver *drv, int index)
+{
+	bool check_on_this_cpu = false;
+	u32 slp_s0_saved_count;
+	unsigned long flags;
+	int cpu = smp_processor_id();
+	int ret;
+
+	/* The last CPU to freeze sets up checking SLP S0 assertion. */
+	spin_lock_irqsave(&slp_s0_check_lock, flags);
+	slp_s0_num_cpus++;
+	if (slp_s0_seconds &&
+	    slp_s0_num_cpus == num_online_cpus() &&
+	    !slp_s0_check_inprogress &&
+	    !intel_pmc_slp_s0_counter_read(&slp_s0_saved_count)) {
+		ret = tick_set_freeze_event(cpu, ktime_set(slp_s0_seconds, 0));
+		if (ret < 0) {
+			spin_unlock_irqrestore(&slp_s0_check_lock, flags);
+			goto out;
+		}
+
+		/*
+		 * Make sure check_slp_s0 isn't scheduled on another CPU if it
+		 * were to leave freeze and enter it again before this CPU
+		 * leaves freeze.
+		 */
+		slp_s0_check_inprogress = true;
+		check_on_this_cpu = true;
+	}
+	spin_unlock_irqrestore(&slp_s0_check_lock, flags);
+
+	ret = intel_idle_freeze(dev, drv, index);
+	if (ret < 0)
+		goto out;
+
+	if (check_on_this_cpu && tick_clear_freeze_event(cpu))
+		ret = check_slp_s0(slp_s0_saved_count);
+
+out:
+	spin_lock_irqsave(&slp_s0_check_lock, flags);
+	if (check_on_this_cpu) {
+		slp_s0_check_inprogress = false;
+		slp_s0_seconds = min_t(unsigned int,
+				       SLP_S0_EXP_BASE * slp_s0_seconds,
+				       MAX_SLP_S0_SECONDS);
+	}
+	slp_s0_num_cpus--;
+	spin_unlock_irqrestore(&slp_s0_check_lock, flags);
+	return ret;
+}
+
+static int slp_s0_check_prepare(struct notifier_block *nb, unsigned long action,
+				void *data)
+{
+	if (action == PM_SUSPEND_PREPARE)
+		slp_s0_seconds = slp_s0_check ? 1 : 0;
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block intel_slp_s0_check_nb = {
+	.notifier_call = slp_s0_check_prepare,
+};
+
 static void __setup_broadcast_timer(bool on)
 {
 	if (on)
@@ -1454,6 +1570,13 @@ static int __init intel_idle_init(void)
 		goto init_driver_fail;
 	}
 
+	retval = register_pm_notifier(&intel_slp_s0_check_nb);
+	if (retval) {
+		free_percpu(intel_idle_cpuidle_devices);
+		cpuidle_unregister_driver(&intel_idle_driver);
+		goto pm_nb_fail;
+	}
+
 	if (boot_cpu_has(X86_FEATURE_ARAT))	/* Always Reliable APIC Timer */
 		lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE;
 
@@ -1469,6 +1592,8 @@ static int __init intel_idle_init(void)
 
 hp_setup_fail:
 	intel_idle_cpuidle_devices_uninit();
+	unregister_pm_notifier(&intel_slp_s0_check_nb);
+pm_nb_fail:
 	cpuidle_unregister_driver(&intel_idle_driver);
 init_driver_fail:
 	free_percpu(intel_idle_cpuidle_devices);
@@ -1484,3 +1609,4 @@ device_initcall(intel_idle_init);
  * is the easiest way (currently) to continue doing that.
  */
 module_param(max_cstate, int, 0444);
+module_param(slp_s0_check, bool, 0644);
This adds validation of S0ix entry and enables it on Skylake. Using
the new tick_set_freeze_event function, we program the CPU to wake up
X seconds after entering freeze. After X seconds, it will wake the CPU
to check the S0ix residency counters and make sure we entered the
lowest power state for suspend-to-idle.

It exits freeze and reports an error to userspace when the SoC does
not enter S0ix on suspend-to-idle.

One example of a bug that can prevent a Skylake CPU from entering S0ix
(suspend-to-idle) is a leaked reference count to one of the i915 power
wells. The CPU will not be able to enter Package C10 and will
therefore use about 4x as much power for the entire system. The issue
is not specific to the i915 power wells though.

Signed-off-by: Derek Basehore <dbasehore@chromium.org>
---
 drivers/idle/intel_idle.c | 142 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 134 insertions(+), 8 deletions(-)
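[Editor's note] On the "X seconds" in the changelog: in the patch, slp_s0_seconds starts at 1 second when suspend is prepared and is multiplied by SLP_S0_EXP_BASE (10) after each completed check, capped at MAX_SLP_S0_SECONDS (1000). So the first check fires 1 s after freeze is entered, the next 10 s after the following re-entry, then 100 s, and then every 1000 s. The small user-space sketch below only illustrates that progression; it is not part of the patch.

    /* Illustration of the exponential backoff of the check interval. */
    #include <stdio.h>

    int main(void)
    {
    	unsigned int seconds = 1;	/* reset to 1 on PM_SUSPEND_PREPARE */
    	int i;

    	for (i = 0; i < 6; i++) {
    		printf("check %d fires %u s after (re-)entering freeze\n",
    		       i + 1, seconds);
    		/* mirrors min_t(unsigned int, 10 * seconds, 1000) */
    		seconds = (seconds * 10 < 1000) ? seconds * 10 : 1000;
    	}
    	return 0;
    }

Since slp_s0_check is declared with module_param(..., 0644), the check can presumably be enabled at boot with intel_idle.slp_s0_check=1 or toggled later through /sys/module/intel_idle/parameters/slp_s0_check.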