diff mbox series

[v3,1/3] x86/smp: Allow calling mwait_play_dead with arbitrary hint

Message ID 20241108122909.763663-2-patryk.wlazlyn@linux.intel.com (mailing list archive)
State Changes Requested, archived
Headers show
Series SRF: Fix offline CPU preventing pc6 entry | expand

Commit Message

Patryk Wlazlyn Nov. 8, 2024, 12:29 p.m. UTC
The current implementation for looking up the mwait hint for the deepest
cstate depends on them to be continuous in range [0, NUM_SUBSTATES-1].
While that is correct on most Intel x86 platforms, it is not documented
and may not result in reaching the most optimized idle state on some of
them.

For example Intel's Sierra Forest report two C6 substates in cpuid leaf 5:
    C6S  (hint 0x22)
    C6SP (hint 0x23)

Hints 0x20 and 0x21 are skipped entirely, causing the current
implementation to compute the wrong hint, when looking for the deepest
cstate for offlined CPU to enter. As a result, package with an offlined
CPU can never reach PC6.

Allow the idle driver to call mwait_play_dead() code with the forced
mwait hint, skipping the cpuid based computation.

Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
---
 arch/x86/include/asm/smp.h |  6 ++++++
 arch/x86/kernel/smpboot.c  | 25 ++++++++++++++++++-------
 2 files changed, 24 insertions(+), 7 deletions(-)

Comments

Dave Hansen Nov. 8, 2024, 4:03 p.m. UTC | #1
On 11/8/24 04:29, Patryk Wlazlyn wrote:
> The current implementation for looking up the mwait hint for the deepest
> cstate depends on them to be continuous in range [0, NUM_SUBSTATES-1].
> While that is correct on most Intel x86 platforms, it is not documented
> and may not result in reaching the most optimized idle state on some of
> them.

First, it is not clear what code this refers to.  It needs to be more clear.

Second, this is not clear about the bug.  Start with being crystal clear
about the problem and the impact:

	MWAIT needs different hints on different CPUs to reach the most
	efficient idle states. The hint calculation* works in practice
	on current hardware, but it fails on newer ones. Those newer
	CPUs' battery life really suffers when the system is suspended
	like when a laptop lid is closed.

	 * The current calculation is the for loop inspecting edx in
	   mwait_play_dead()

Then you can go into detail about what the bad implementation does.

> For example Intel's Sierra Forest report two C6 substates in cpuid leaf 5:
>     C6S  (hint 0x22)
>     C6SP (hint 0x23)
> 
> Hints 0x20 and 0x21 are skipped entirely,

Why does it start at 0x20?  Giving the example on new, broken hardware
is also a bit weak without an example from old, fully functional hardware.

> causing the current
> implementation to compute the wrong hint, when looking for the deepest
> cstate for offlined CPU to enter. As a result, package with an offlined
> CPU can never reach PC6.

I don't think the PC6 detail matters enough to mention here.

> Allow the idle driver to call mwait_play_dead() code with the forced
> mwait hint, skipping the cpuid based computation.
> 
> Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
> ---
>  arch/x86/include/asm/smp.h |  6 ++++++
>  arch/x86/kernel/smpboot.c  | 25 ++++++++++++++++++-------
>  2 files changed, 24 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
> index ca073f40698f..fbd275d6661a 100644
> --- a/arch/x86/include/asm/smp.h
> +++ b/arch/x86/include/asm/smp.h
> @@ -114,6 +114,7 @@ void wbinvd_on_cpu(int cpu);
>  int wbinvd_on_all_cpus(void);
>  
>  void smp_kick_mwait_play_dead(void);
> +int mwait_play_dead_with_hint(unsigned long hint);
>  
>  void native_smp_send_reschedule(int cpu);
>  void native_send_call_func_ipi(const struct cpumask *mask);
> @@ -164,6 +165,11 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu)
>  {
>  	return (struct cpumask *)cpumask_of(0);
>  }
> +
> +static inline int mwait_play_dead_with_hint(unsigned long eax_hint)
> +{
> +	return 1;
> +}
>  #endif /* CONFIG_SMP */
>  
>  #ifdef CONFIG_DEBUG_NMI_SELFTEST
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index 0c35207320cb..44c40781bad6 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -1270,13 +1270,14 @@ void play_dead_common(void)
>  	local_irq_disable();
>  }
>  
> +int mwait_play_dead_with_hint(unsigned long eax_hint);

We generally prefer these get a declaration in a header or get moved
around in the file, not declared like this.

>  /*
>   * We need to flush the caches before going to sleep, lest we have
>   * dirty data in our caches when we come back up.
>   */
> -static inline void mwait_play_dead(void)
> +static inline int mwait_play_dead(void)
>  {
> -	struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
>  	unsigned int eax, ebx, ecx, edx;
>  	unsigned int highest_cstate = 0;
>  	unsigned int highest_subcstate = 0;
> @@ -1284,13 +1285,13 @@ static inline void mwait_play_dead(void)
>  
>  	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
>  	    boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
> -		return;
> +		return 1;
>  	if (!this_cpu_has(X86_FEATURE_MWAIT))
> -		return;
> +		return 1;
>  	if (!this_cpu_has(X86_FEATURE_CLFLUSH))
> -		return;
> +		return 1;
>  	if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
> -		return;
> +		return 1;

If you're going to use an 'int' for a fail/nofail return type, please
just use -ERRNO's. It makes it *MUCH* more clear that these are error
returns and not something else.

That's what cpuidle_play_dead() does, for instance,  Please follow its lead.

>  	eax = CPUID_MWAIT_LEAF;
>  	ecx = 0;
> @@ -1314,6 +1315,13 @@ static inline void mwait_play_dead(void)
>  			(highest_subcstate - 1);
>  	}
>  
> +	return mwait_play_dead_with_hint(eax);
> +}
> +
> +int mwait_play_dead_with_hint(unsigned long eax_hint)
> +{
> +	struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
> +
>  	/* Set up state for the kexec() hack below */
>  	md->status = CPUDEAD_MWAIT_WAIT;
>  	md->control = CPUDEAD_MWAIT_WAIT;
> @@ -1333,7 +1341,7 @@ static inline void mwait_play_dead(void)
>  		mb();
>  		__monitor(md, 0, 0);
>  		mb();
> -		__mwait(eax, 0);
> +		__mwait(eax_hint, 0);
>  
>  		if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
>  			/*
> @@ -1353,6 +1361,9 @@ static inline void mwait_play_dead(void)
>  				native_halt();
>  		}
>  	}
> +
> +	/* Never reached */
> +	return 0;
>  }

I think the preferred way to do this is to add a __noreturn annotation.
Patryk Wlazlyn Nov. 12, 2024, 10:54 a.m. UTC | #2
> First, it is not clear what code this refers to.  It needs to be more clear.

ACK. I thought that code changes serve that purpose, but I can explicitly quote
the code I am reffering to.

> Second, this is not clear about the bug.  Start with being crystal clear
> about the problem and the impact:
>
>     MWAIT needs different hints on different CPUs to reach the most
>     efficient idle states. The hint calculation* works in practice
>     on current hardware, but it fails on newer ones. Those newer
>     CPUs' battery life really suffers when the system is suspended
>     like when a laptop lid is closed.
>
>      * The current calculation is the for loop inspecting edx in
>        mwait_play_dead()
>
> Then you can go into detail about what the bad implementation does.

It would kind of get into the bussiness of documenting the mwait instruction
itself. I am modifying behavior of already existing code.

Should I be more clear in the commit message or you also meant adding more code
comments?

>> For example Intel's Sierra Forest report two C6 substates in cpuid leaf 5:
>>     C6S  (hint 0x22)
>>     C6SP (hint 0x23)
>>
>> Hints 0x20 and 0x21 are skipped entirely,
>
> Why does it start at 0x20?  Giving the example on new, broken hardware
> is also a bit weak without an example from old, fully functional hardware.

It's just how mwait hints work on Intel hardware:
    EAX[3:0] # Sub C-state
    EAX[7:4] # C-state

CPUID leaf 0x5 report how many sub c-states are there for any c-state.
Old code assumes that you can derive the actual hints from that, but you can
only do that if available hints are continuous, which is not guaranteed and is
not true on SRF and possibly future platforms.

Hint 0x20 (C6) was given as an example, because it's where the problem is.

Mwait hints themselves are processor-specific and so the same hint may mean
different thing on different hardware.

>> causing the current
>> implementation to compute the wrong hint, when looking for the deepest
>> cstate for offlined CPU to enter. As a result, package with an offlined
>> CPU can never reach PC6.
>
> I don't think the PC6 detail matters enough to mention here.

I can simplify the commit message, but the PC6 problem is the reason this change
was proposed.

>> Allow the idle driver to call mwait_play_dead() code with the forced
>> mwait hint, skipping the cpuid based computation.
>>
>> Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@linux.intel.com>
>> ---
>>  arch/x86/include/asm/smp.h |  6 ++++++
>>  arch/x86/kernel/smpboot.c  | 25 ++++++++++++++++++-------
>>  2 files changed, 24 insertions(+), 7 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
>> index ca073f40698f..fbd275d6661a 100644
>> --- a/arch/x86/include/asm/smp.h
>> +++ b/arch/x86/include/asm/smp.h
>> @@ -114,6 +114,7 @@ void wbinvd_on_cpu(int cpu);
>>  int wbinvd_on_all_cpus(void);
>>  
>>  void smp_kick_mwait_play_dead(void);
>> +int mwait_play_dead_with_hint(unsigned long hint);
>>  
>>  void native_smp_send_reschedule(int cpu);
>>  void native_send_call_func_ipi(const struct cpumask *mask);
>> @@ -164,6 +165,11 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu)
>>  {
>>      return (struct cpumask *)cpumask_of(0);
>>  }
>> +
>> +static inline int mwait_play_dead_with_hint(unsigned long eax_hint)
>> +{
>> +    return 1;
>> +}
>>  #endif /* CONFIG_SMP */
>>  
>>  #ifdef CONFIG_DEBUG_NMI_SELFTEST
>> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
>> index 0c35207320cb..44c40781bad6 100644
>> --- a/arch/x86/kernel/smpboot.c
>> +++ b/arch/x86/kernel/smpboot.c
>> @@ -1270,13 +1270,14 @@ void play_dead_common(void)
>>      local_irq_disable();
>>  }
>>  
>> +int mwait_play_dead_with_hint(unsigned long eax_hint);
>
> We generally prefer these get a declaration in a header or get moved
> around in the file, not declared like this.

If forward declared and defined after mwait_play_dead, it makes the git
diff/blame simpler. As an alternative I could define it above, but the resulting
diff is harder to read and I didn't want to include whole smp header just for
that, but perhaps I should.

>>  /*
>>   * We need to flush the caches before going to sleep, lest we have
>>   * dirty data in our caches when we come back up.
>>   */
>> -static inline void mwait_play_dead(void)
>> +static inline int mwait_play_dead(void)
>>  {
>> -    struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
>>      unsigned int eax, ebx, ecx, edx;
>>      unsigned int highest_cstate = 0;
>>      unsigned int highest_subcstate = 0;
>> @@ -1284,13 +1285,13 @@ static inline void mwait_play_dead(void)
>>  
>>      if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
>>          boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
>> -        return;
>> +        return 1;
>>      if (!this_cpu_has(X86_FEATURE_MWAIT))
>> -        return;
>> +        return 1;
>>      if (!this_cpu_has(X86_FEATURE_CLFLUSH))
>> -        return;
>> +        return 1;
>>      if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
>> -        return;
>> +        return 1;
>
> If you're going to use an 'int' for a fail/nofail return type, please
> just use -ERRNO's. It makes it *MUCH* more clear that these are error
> returns and not something else.
>
> That's what cpuidle_play_dead() does, for instance,  Please follow its lead.

ACK. Makes sense.

>>      eax = CPUID_MWAIT_LEAF;
>>      ecx = 0;
>> @@ -1314,6 +1315,13 @@ static inline void mwait_play_dead(void)
>>              (highest_subcstate - 1);
>>      }
>>  
>> +    return mwait_play_dead_with_hint(eax);
>> +}
>> +
>> +int mwait_play_dead_with_hint(unsigned long eax_hint)
>> +{
>> +    struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
>> +
>>      /* Set up state for the kexec() hack below */
>>      md->status = CPUDEAD_MWAIT_WAIT;
>>      md->control = CPUDEAD_MWAIT_WAIT;
>> @@ -1333,7 +1341,7 @@ static inline void mwait_play_dead(void)
>>          mb();
>>          __monitor(md, 0, 0);
>>          mb();
>> -        __mwait(eax, 0);
>> +        __mwait(eax_hint, 0);
>>  
>>          if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
>>              /*
>> @@ -1353,6 +1361,9 @@ static inline void mwait_play_dead(void)
>>                  native_halt();
>>          }
>>      }
>> +
>> +    /* Never reached */
>> +    return 0;
>>  }
>
> I think the preferred way to do this is to add a __noreturn annotation.

OK.
diff mbox series

Patch

diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index ca073f40698f..fbd275d6661a 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -114,6 +114,7 @@  void wbinvd_on_cpu(int cpu);
 int wbinvd_on_all_cpus(void);
 
 void smp_kick_mwait_play_dead(void);
+int mwait_play_dead_with_hint(unsigned long hint);
 
 void native_smp_send_reschedule(int cpu);
 void native_send_call_func_ipi(const struct cpumask *mask);
@@ -164,6 +165,11 @@  static inline struct cpumask *cpu_llc_shared_mask(int cpu)
 {
 	return (struct cpumask *)cpumask_of(0);
 }
+
+static inline int mwait_play_dead_with_hint(unsigned long eax_hint)
+{
+	return 1;
+}
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_DEBUG_NMI_SELFTEST
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 0c35207320cb..44c40781bad6 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1270,13 +1270,14 @@  void play_dead_common(void)
 	local_irq_disable();
 }
 
+int mwait_play_dead_with_hint(unsigned long eax_hint);
+
 /*
  * We need to flush the caches before going to sleep, lest we have
  * dirty data in our caches when we come back up.
  */
-static inline void mwait_play_dead(void)
+static inline int mwait_play_dead(void)
 {
-	struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
 	unsigned int eax, ebx, ecx, edx;
 	unsigned int highest_cstate = 0;
 	unsigned int highest_subcstate = 0;
@@ -1284,13 +1285,13 @@  static inline void mwait_play_dead(void)
 
 	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
 	    boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
-		return;
+		return 1;
 	if (!this_cpu_has(X86_FEATURE_MWAIT))
-		return;
+		return 1;
 	if (!this_cpu_has(X86_FEATURE_CLFLUSH))
-		return;
+		return 1;
 	if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
-		return;
+		return 1;
 
 	eax = CPUID_MWAIT_LEAF;
 	ecx = 0;
@@ -1314,6 +1315,13 @@  static inline void mwait_play_dead(void)
 			(highest_subcstate - 1);
 	}
 
+	return mwait_play_dead_with_hint(eax);
+}
+
+int mwait_play_dead_with_hint(unsigned long eax_hint)
+{
+	struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
+
 	/* Set up state for the kexec() hack below */
 	md->status = CPUDEAD_MWAIT_WAIT;
 	md->control = CPUDEAD_MWAIT_WAIT;
@@ -1333,7 +1341,7 @@  static inline void mwait_play_dead(void)
 		mb();
 		__monitor(md, 0, 0);
 		mb();
-		__mwait(eax, 0);
+		__mwait(eax_hint, 0);
 
 		if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
 			/*
@@ -1353,6 +1361,9 @@  static inline void mwait_play_dead(void)
 				native_halt();
 		}
 	}
+
+	/* Never reached */
+	return 0;
 }
 
 /*