diff mbox series

[v3,1/2] x86/mwait: Add support for idle via umwait

Message ID 20230610183518.4061159-2-dedekind1@gmail.com (mailing list archive)
State Superseded, archived
Delegated to: Rafael Wysocki
Headers show
Series Sapphire Rapids C0.x idle states support | expand

Commit Message

Artem Bityutskiy June 10, 2023, 6:35 p.m. UTC
From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>

On Intel platforms, C-states are requested using the 'monitor/mwait'
instructions pair, as implemented in 'mwait_idle_with_hints()'. This
mechanism allows for entering C1 and deeper C-states.

Sapphire Rapids Xeon supports new idle states - C0.1 and C0.2 (later C0.x).
These idle states have lower latency compared to C1, and can be requested
with either the 'tpause' or 'umwait' instruction.

Linux already uses the 'tpause' instruction in delay functions like
'udelay()'. This patch adds 'umwait' and 'umonitor' instructions support.

'umwait' and 'tpause' instructions are very similar - both send the CPU to
C0.x and have the same break out rules. But unlike 'tpause', 'umwait' works
together with 'umonitor' and exits the C0.x when the monitored memory
address is modified (similar idea as with 'monitor/mwait').

This patch implements the 'umwait_idle()' function, which works very
similarly to existing 'mwait_idle_with_hints()', but requests C0.x. The
intention is to use it from the 'intel_idle' driver.

Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
---
 arch/x86/include/asm/mwait.h | 65 ++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

Comments

Rafael J. Wysocki June 29, 2023, 7:03 p.m. UTC | #1
On Sat, Jun 10, 2023 at 8:35 PM Artem Bityutskiy <dedekind1@gmail.com> wrote:
>
> From: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
>
> On Intel platforms, C-states are requested using the 'monitor/mwait'
> instructions pair, as implemented in 'mwait_idle_with_hints()'. This
> mechanism allows for entering C1 and deeper C-states.
>
> Sapphire Rapids Xeon supports new idle states - C0.1 and C0.2 (later C0.x).
> These idle states have lower latency comparing to C1, and can be requested
> with either 'tpause' and 'umwait' instructions.
>
> Linux already uses the 'tpause' instruction in delay functions like
> 'udelay()'. This patch adds 'umwait' and 'umonitor' instructions support.
>
> 'umwait' and 'tpause' instructions are very similar - both send the CPU to
> C0.x and have the same break out rules. But unlike 'tpause', 'umwait' works
> together with 'umonitor' and exits the C0.x when the monitored memory
> address is modified (similar idea as with 'monitor/mwait').
>
> This patch implements the 'umwait_idle()' function, which works very
> similarly to existing 'mwait_idle_with_hints()', but requests C0.x. The
> intention is to use it from the 'intel_idle' driver.
>
> Signed-off-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>

x86 folks, any comments on this?

Barring any concerns, I would like to queue it up for 6.6 when the
merge is over.

> ---
>  arch/x86/include/asm/mwait.h | 65 ++++++++++++++++++++++++++++++++++++
>  1 file changed, 65 insertions(+)
>
> diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
> index 778df05f8539..681c281eeaa7 100644
> --- a/arch/x86/include/asm/mwait.h
> +++ b/arch/x86/include/asm/mwait.h
> @@ -141,4 +141,69 @@ static inline void __tpause(u32 ecx, u32 edx, u32 eax)
>         #endif
>  }
>
> +#ifdef CONFIG_X86_64
> +/*
> + * Monitor a memory address at 'rcx' using the 'umonitor' instruction.
> + */
> +static inline void __umonitor(const void *rcx)
> +{
> +       /* "umonitor %rcx" */
> +#ifdef CONFIG_AS_TPAUSE
> +       asm volatile("umonitor %%rcx\n"
> +                    :
> +                    : "c"(rcx));
> +#else
> +       asm volatile(".byte 0xf3, 0x0f, 0xae, 0xf1\t\n"
> +                    :
> +                    : "c"(rcx));
> +#endif
> +}
> +
> +/*
> + * Same as '__tpause()', but uses the 'umwait' instruction. It is very
> + * similar to 'tpause', but also breaks out if the data at the address
> + * monitored with 'umonitor' is modified.
> + */
> +static inline void __umwait(u32 ecx, u32 edx, u32 eax)
> +{
> +       /* "umwait %ecx, %edx, %eax;" */
> +#ifdef CONFIG_AS_TPAUSE
> +       asm volatile("umwait %%ecx\n"
> +                    :
> +                    : "c"(ecx), "d"(edx), "a"(eax));
> +#else
> +       asm volatile(".byte 0xf2, 0x0f, 0xae, 0xf1\t\n"
> +                    :
> +                    : "c"(ecx), "d"(edx), "a"(eax));
> +#endif
> +}
> +
> +/*
> + * Enter C0.1 or C0.2 state and stay there until an event happens (an interrupt
> + * or the 'need_resched()'), the explicit deadline is reached, or the implicit
> + * global limit is reached.
> + *
> + * The deadline is the absolute TSC value to exit the idle state at. If it
> + * exceeds the global limit in the 'IA32_UMWAIT_CONTROL' register, the global
> + * limit prevails, and the idle state is exited earlier than the deadline.
> + */
> +static inline void umwait_idle(u64 deadline, u32 state)
> +{
> +       if (!current_set_polling_and_test()) {
> +               u32 eax, edx;
> +
> +               eax = lower_32_bits(deadline);
> +               edx = upper_32_bits(deadline);
> +
> +               __umonitor(&current_thread_info()->flags);
> +               if (!need_resched())
> +                       __umwait(state, edx, eax);
> +       }
> +       current_clr_polling();
> +}
> +#else
> +#define umwait_idle(deadline, state) \
> +               WARN_ONCE(1, "umwait CPU instruction is not supported")
> +#endif /* CONFIG_X86_64 */
> +
>  #endif /* _ASM_X86_MWAIT_H */
> --
> 2.40.1
>
Thomas Gleixner June 29, 2023, 10:04 p.m. UTC | #2
On Sat, Jun 10 2023 at 21:35, Artem Bityutskiy wrote:
> On Intel platforms, C-states are requested using the 'monitor/mwait'
> instructions pair, as implemented in 'mwait_idle_with_hints()'. This
> mechanism allows for entering C1 and deeper C-states.
>
> Sapphire Rapids Xeon supports new idle states - C0.1 and C0.2 (later C0.x).
> These idle states have lower latency comparing to C1, and can be requested
> with either 'tpause' and 'umwait' instructions.

s/and/or/

>
> Linux already uses the 'tpause' instruction in delay functions like
> 'udelay()'. This patch adds 'umwait' and 'umonitor' instructions
> support.

# git grep 'This patch' Documentation/process/

Please fix it all over the place.

> +#ifdef CONFIG_X86_64
> +/*
> + * Monitor a memory address at 'rcx' using the 'umonitor' instruction.
> + */
> +static inline void __umonitor(const void *rcx)
> +{
> +	/* "umonitor %rcx" */
> +#ifdef CONFIG_AS_TPAUSE

Are you sure that the instruction check for TPAUSE is sufficient to also
include UMONITOR on all toolchains which support TPAUSE?

Also:

          if (IS_ENABLED(CONFIG_AS_TPAUSE)) {

> +	asm volatile("umonitor %%rcx\n"
> +		     :
> +		     : "c"(rcx));
          } else {

> +#else
> +	asm volatile(".byte 0xf3, 0x0f, 0xae, 0xf1\t\n"
> +		     :
> +		     : "c"(rcx));

        }

please.

> +/*
> + * Same as '__tpause()', but uses the 'umwait' instruction. It is very
> + * similar to 'tpause', but also breaks out if the data at the address
> + * monitored with 'umonitor' is modified.
> + */
> +static inline void __umwait(u32 ecx, u32 edx, u32 eax)
> +{
> +	/* "umwait %ecx, %edx, %eax;" */
> +#ifdef CONFIG_AS_TPAUSE
> +	asm volatile("umwait %%ecx\n"
> +		     :
> +		     : "c"(ecx), "d"(edx), "a"(eax));
> +#else
> +	asm volatile(".byte 0xf2, 0x0f, 0xae, 0xf1\t\n"
> +		     :
> +		     : "c"(ecx), "d"(edx), "a"(eax));
> +#endif

Ditto.

> +/*
> + * Enter C0.1 or C0.2 state and stay there until an event happens (an interrupt
> + * or the 'need_resched()'), the explicit deadline is reached, or the implicit
> + * global limit is reached.
> + *
> + * The deadline is the absolute TSC value to exit the idle state at. If it
> + * exceeds the global limit in the 'IA32_UMWAIT_CONTROL' register, the global
> + * limit prevails, and the idle state is exited earlier than the deadline.
> + */
> +static inline void umwait_idle(u64 deadline, u32 state)
> +{
> +	if (!current_set_polling_and_test()) {
> +		u32 eax, edx;
> +
> +		eax = lower_32_bits(deadline);
> +		edx = upper_32_bits(deadline);
> +
> +		__umonitor(&current_thread_info()->flags);
> +		if (!need_resched())
> +			__umwait(state, edx, eax);
> +	}
> +	current_clr_polling();
> +}
> +#else
> +#define umwait_idle(deadline, state) \
> +		WARN_ONCE(1, "umwait CPU instruction is not supported")

Please implement the stub as a proper inline.

> +#endif /* CONFIG_X86_64 */

This comment is wrong.

#ifdef CONFIG_X86_64
       ....
#else /* CONFIG_X86_64 */

#endif /* !CONFIG_X86_64 */

makes it clear what the scope is.

Thanks,

        tglx
Thomas Gleixner June 29, 2023, 10:36 p.m. UTC | #3
On Fri, Jun 30 2023 at 00:04, Thomas Gleixner wrote:
>> +static inline void __umonitor(const void *rcx)

These inlines must be __always_inline to prevent the compiler from
instrumenting this.

For further information see:

  0e985e9d2286 ("cpuidle: Add comments about noinstr/__cpuidle usage")
  2b5a0e425e6e ("objtool/idle: Validate __cpuidle code as noinstr")

Thanks,

        tglx
Artem Bityutskiy June 30, 2023, 4:54 p.m. UTC | #4
On Fri, 2023-06-30 at 00:04 +0200, Thomas Gleixner wrote:
> > +#ifdef CONFIG_X86_64
> > +/*
> > + * Monitor a memory address at 'rcx' using the 'umonitor' instruction.
> > + */
> > +static inline void __umonitor(const void *rcx)
> > +{
> > +       /* "umonitor %rcx" */
> > +#ifdef CONFIG_AS_TPAUSE
> 
> Are you sure that the instruction check for TPAUSE is sufficient to also
> include UMONITOR on all toolchains which support TPAUSE?

Good point, I checked only GNU compiler. I can check Clang/LLVM.


Will also address other issues that you pointed, thanks!

Artem.
Artem Bityutskiy July 7, 2023, 5:13 p.m. UTC | #5
On Fri, 2023-06-30 at 00:04 +0200, Thomas Gleixner wrote:
> > +#ifdef CONFIG_X86_64
> > +/*
> > + * Monitor a memory address at 'rcx' using the 'umonitor' instruction.
> > + */
> > +static inline void __umonitor(const void *rcx)
> > +{
> > +       /* "umonitor %rcx" */
> > +#ifdef CONFIG_AS_TPAUSE
> 
> Are you sure that the instruction check for TPAUSE is sufficient to also
> include UMONITOR on all toolchains which support TPAUSE?

I've verified by building the kernel with gcc/binutils and clang/LLVM.
Builds, boots, umwait works, C0.2 happens with both.

I inspected gcc, binutils, and clang/llvm git logs: support for
'tpause' and 'umwait' arrived in the same commit. Details below.

'tpause' and 'umwait' instructions are very similar, arrived together,
guarded together by CPUID.7's "MWAITPKG" bit. Based on this, I'd generally
expect toolchains to support both or none.

I can add a note about this in the commit message too.

Details on commits in the projects I checked.

1. binutils-gcc git tree:
   de89d0a34d5 Enable Intel WAITPKG instructions.

2. gcc git tree:
   55f31ed10fd i386-common.c (OPTION_MASK_ISA_WAITPKG_SET, [...]): New defines.

3. llvm-project git tree:
   2e02579a76cf [OpenMP] Add use of TPAUSE

It'll take some time to re-test and and partially re-measure power/perf,
so I'll send new version a bit later.

Thanks,
Artem.
diff mbox series

Patch

diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 778df05f8539..681c281eeaa7 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -141,4 +141,69 @@  static inline void __tpause(u32 ecx, u32 edx, u32 eax)
 	#endif
 }
 
+#ifdef CONFIG_X86_64
+/*
+ * Monitor a memory address at 'rcx' using the 'umonitor' instruction.
+ */
+static inline void __umonitor(const void *rcx)
+{
+	/* "umonitor %rcx" */
+#ifdef CONFIG_AS_TPAUSE
+	asm volatile("umonitor %%rcx\n"
+		     :
+		     : "c"(rcx));
+#else
+	asm volatile(".byte 0xf3, 0x0f, 0xae, 0xf1\t\n"
+		     :
+		     : "c"(rcx));
+#endif
+}
+
+/*
+ * Same as '__tpause()', but uses the 'umwait' instruction. It is very
+ * similar to 'tpause', but also breaks out if the data at the address
+ * monitored with 'umonitor' is modified.
+ */
+static inline void __umwait(u32 ecx, u32 edx, u32 eax)
+{
+	/* "umwait %ecx, %edx, %eax;" */
+#ifdef CONFIG_AS_TPAUSE
+	asm volatile("umwait %%ecx\n"
+		     :
+		     : "c"(ecx), "d"(edx), "a"(eax));
+#else
+	asm volatile(".byte 0xf2, 0x0f, 0xae, 0xf1\t\n"
+		     :
+		     : "c"(ecx), "d"(edx), "a"(eax));
+#endif
+}
+
+/*
+ * Enter C0.1 or C0.2 state and stay there until an event happens (an interrupt
+ * or the 'need_resched()'), the explicit deadline is reached, or the implicit
+ * global limit is reached.
+ *
+ * The deadline is the absolute TSC value to exit the idle state at. If it
+ * exceeds the global limit in the 'IA32_UMWAIT_CONTROL' register, the global
+ * limit prevails, and the idle state is exited earlier than the deadline.
+ */
+static inline void umwait_idle(u64 deadline, u32 state)
+{
+	if (!current_set_polling_and_test()) {
+		u32 eax, edx;
+
+		eax = lower_32_bits(deadline);
+		edx = upper_32_bits(deadline);
+
+		__umonitor(&current_thread_info()->flags);
+		if (!need_resched())
+			__umwait(state, edx, eax);
+	}
+	current_clr_polling();
+}
+#else
+#define umwait_idle(deadline, state) \
+		WARN_ONCE(1, "umwait CPU instruction is not supported")
+#endif /* CONFIG_X86_64 */
+
 #endif /* _ASM_X86_MWAIT_H */