
[RFC,v2,15/20] context-tracking: Introduce work deferral infrastructure

Message ID 20230720163056.2564824-16-vschneid@redhat.com (mailing list archive)
State Superseded
Series context_tracking,x86: Defer some IPIs until a user->kernel transition

Commit Message

Valentin Schneider July 20, 2023, 4:30 p.m. UTC
smp_call_function() & friends have the unfortunate habit of sending IPIs to
isolated, NOHZ_FULL, in-userspace CPUs, as they blindly target all online
CPUs.

Some callsites can be bent into doing the right thing, as done by commit:

  cc9e303c91f5 ("x86/cpu: Disable frequency requests via aperfmperf IPI for nohz_full CPUs")

Unfortunately, not all SMP callbacks can be omitted in this
fashion. However, some of them only affect execution in kernelspace, which
means they don't have to be executed *immediately* if the target CPU is in
userspace: stashing the callback and executing it upon the next kernel entry
would suffice. x86 kernel instruction patching or kernel TLB invalidation
are prime examples of this.

Reduce the RCU dynticks counter width to free up some bits to be used as a
deferred callback bitmask. Add some build-time checks to validate that
setup.

A target CPU whose ct_state is CONTEXT_KERNEL cannot have work deferred to
it; it has to be interrupted the usual way.

Later commits introduce the bit:callback mappings.
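
To illustrate the intended use, a rough sketch (CONTEXT_WORK_FOO, do_foo()
and the callsite below are placeholders, not part of this patch):

  /* A deferrable work item is a bit... */
  enum ct_work {
	  CONTEXT_WORK_FOO = BIT(CONTEXT_WORK_FOO_OFFSET),
  };

  /* ...the arch maps it to a callback run upon the next kernel entry... */
  static __always_inline void arch_context_tracking_work(int work)
  {
	  switch (work) {
	  case CONTEXT_WORK_FOO:
		  do_foo();
		  break;
	  }
  }

  /* ...and the IPI sender defers whenever the target isn't in the kernel. */
  if (!ct_set_cpu_work(cpu, CONTEXT_WORK_FOO))
	  smp_call_function_single(cpu, remote_foo, NULL, 1);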

Link: https://lore.kernel.org/all/20210929151723.162004989@infradead.org/
Signed-off-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
---
 arch/Kconfig                                 |  9 +++
 arch/x86/Kconfig                             |  1 +
 arch/x86/include/asm/context_tracking_work.h | 14 +++++
 include/linux/context_tracking.h             | 25 ++++++++
 include/linux/context_tracking_state.h       | 62 +++++++++++++++-----
 include/linux/context_tracking_work.h        | 26 ++++++++
 kernel/context_tracking.c                    | 51 +++++++++++++++-
 kernel/time/Kconfig                          |  5 ++
 8 files changed, 176 insertions(+), 17 deletions(-)
 create mode 100644 arch/x86/include/asm/context_tracking_work.h
 create mode 100644 include/linux/context_tracking_work.h

Comments

Frederic Weisbecker July 24, 2023, 2:52 p.m. UTC | #1
On Thu, Jul 20, 2023 at 05:30:51PM +0100, Valentin Schneider wrote:
> +enum ctx_state {
> +	/* Following are values */
> +	CONTEXT_DISABLED	= -1,	/* returned by ct_state() if unknown */
> +	CONTEXT_KERNEL		= 0,
> +	CONTEXT_IDLE		= 1,
> +	CONTEXT_USER		= 2,
> +	CONTEXT_GUEST		= 3,
> +	CONTEXT_MAX             = 4,
> +};
> +
> +/*
> + * We cram three different things within the same atomic variable:
> + *
> + *                CONTEXT_STATE_END                        RCU_DYNTICKS_END
> + *                         |       CONTEXT_WORK_END                |
> + *                         |               |                       |
> + *                         v               v                       v
> + *         [ context_state ][ context work ][ RCU dynticks counter ]
> + *         ^                ^               ^
> + *         |                |               |
> + *         |        CONTEXT_WORK_START      |
> + * CONTEXT_STATE_START              RCU_DYNTICKS_START

Should the layout be displayed in reverse? Well, at least I always picture
bitmaps in reverse; that's probably due to the direction of the shift arrows.
Not sure what the usual way to picture it is, though...

> + */
> +
> +#define CT_STATE_SIZE (sizeof(((struct context_tracking *)0)->state) * BITS_PER_BYTE)
> +
> +#define CONTEXT_STATE_START 0
> +#define CONTEXT_STATE_END   (bits_per(CONTEXT_MAX - 1) - 1)

Since you have non-overlapping *_START symbols, perhaps the *_END
are superfluous?

> +
> +#define RCU_DYNTICKS_BITS  (IS_ENABLED(CONFIG_CONTEXT_TRACKING_WORK) ? 16 : 31)
> +#define RCU_DYNTICKS_START (CT_STATE_SIZE - RCU_DYNTICKS_BITS)
> +#define RCU_DYNTICKS_END   (CT_STATE_SIZE - 1)
> +#define RCU_DYNTICKS_IDX   BIT(RCU_DYNTICKS_START)

Might be the right time to standardize and fix our naming:

CT_STATE_START,
CT_STATE_KERNEL,
CT_STATE_USER,
...
CT_WORK_START,
CT_WORK_*,
...
CT_RCU_DYNTICKS_START,
CT_RCU_DYNTICKS_IDX

> +bool ct_set_cpu_work(unsigned int cpu, unsigned int work)
> +{
> +	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
> +	unsigned int old;
> +	bool ret = false;
> +
> +	preempt_disable();
> +
> +	old = atomic_read(&ct->state);
> +	/*
> +	 * Try setting the work until either
> +	 * - the target CPU no longer accepts any more deferred work
> +	 * - the work has been set
> +	 *
> +	 * NOTE: CONTEXT_GUEST intersects with CONTEXT_USER and CONTEXT_IDLE
> +	 * as they are regular integers rather than bits, but that doesn't
> +	 * matter here: if any of the context state bits are set, the CPU isn't
> +	 * in kernel context.
> +	 */
> +	while ((old & (CONTEXT_GUEST | CONTEXT_USER | CONTEXT_IDLE)) && !ret)

That may still miss a recent entry to userspace due to the first plain read, ending
with an undesired interrupt.

You need at least one cmpxchg. Well, of course that stays racy by nature,
because between the cmpxchg() returning CONTEXT_KERNEL and the IPI being
raised and received, the remote CPU may have gone to userspace already. But
it still narrows the window a little.

Thanks.

> +		ret = atomic_try_cmpxchg(&ct->state, &old, old | (work << CONTEXT_WORK_START));
> +
> +	preempt_enable();
> +	return ret;
> +}
> +#else
> +static __always_inline void ct_work_flush(unsigned long work) { }
> +static __always_inline void ct_work_clear(struct context_tracking *ct) { }
> +#endif
> +
>  /*
>   * Record entry into an extended quiescent state.  This is only to be
>   * called when not already in an extended quiescent state, that is,
> @@ -88,7 +133,8 @@ static noinstr void ct_kernel_exit_state(int offset)
>  	 * next idle sojourn.
>  	 */
>  	rcu_dynticks_task_trace_enter();  // Before ->dynticks update!
> -	seq = ct_state_inc(offset);
> +	seq = ct_state_inc_clear_work(offset);
> +
>  	// RCU is no longer watching.  Better be in extended quiescent state!
>  	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICKS_IDX));
>  }
> @@ -100,7 +146,7 @@ static noinstr void ct_kernel_exit_state(int offset)
>   */
>  static noinstr void ct_kernel_enter_state(int offset)
>  {
> -	int seq;
> +	unsigned long seq;
>  
>  	/*
>  	 * CPUs seeing atomic_add_return() must see prior idle sojourns,
> @@ -108,6 +154,7 @@ static noinstr void ct_kernel_enter_state(int offset)
>  	 * critical section.
>  	 */
>  	seq = ct_state_inc(offset);
> +	ct_work_flush(seq);
>  	// RCU is now watching.  Better not be in an extended quiescent state!
>  	rcu_dynticks_task_trace_exit();  // After ->dynticks update!
>  	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICKS_IDX));
> diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
> index bae8f11070bef..fdb266f2d774b 100644
> --- a/kernel/time/Kconfig
> +++ b/kernel/time/Kconfig
> @@ -181,6 +181,11 @@ config CONTEXT_TRACKING_USER_FORCE
>  	  Say N otherwise, this option brings an overhead that you
>  	  don't want in production.
>  
> +config CONTEXT_TRACKING_WORK
> +	bool
> +	depends on HAVE_CONTEXT_TRACKING_WORK && CONTEXT_TRACKING_USER
> +	default y
> +
>  config NO_HZ
>  	bool "Old Idle dynticks config"
>  	help
> -- 
> 2.31.1
>
Valentin Schneider July 24, 2023, 4:55 p.m. UTC | #2
On 24/07/23 16:52, Frederic Weisbecker wrote:
> On Thu, Jul 20, 2023 at 05:30:51PM +0100, Valentin Schneider wrote:
>> +enum ctx_state {
>> +	/* Following are values */
>> +	CONTEXT_DISABLED	= -1,	/* returned by ct_state() if unknown */
>> +	CONTEXT_KERNEL		= 0,
>> +	CONTEXT_IDLE		= 1,
>> +	CONTEXT_USER		= 2,
>> +	CONTEXT_GUEST		= 3,
>> +	CONTEXT_MAX             = 4,
>> +};
>> +
>> +/*
>> + * We cram three different things within the same atomic variable:
>> + *
>> + *                CONTEXT_STATE_END                        RCU_DYNTICKS_END
>> + *                         |       CONTEXT_WORK_END                |
>> + *                         |               |                       |
>> + *                         v               v                       v
>> + *         [ context_state ][ context work ][ RCU dynticks counter ]
>> + *         ^                ^               ^
>> + *         |                |               |
>> + *         |        CONTEXT_WORK_START      |
>> + * CONTEXT_STATE_START              RCU_DYNTICKS_START
>
> Should the layout be displayed in reverse? Well, at least I always picture
> bitmaps in reverse; that's probably due to the direction of the shift arrows.
> Not sure what the usual way to picture it is, though...
>

Surprisingly, I managed to confuse myself with that comment :-)  I think I
am subconsciously more used to the reverse as well. I've flipped that and
put "MSB" / "LSB" at either end.

>> + */
>> +
>> +#define CT_STATE_SIZE (sizeof(((struct context_tracking *)0)->state) * BITS_PER_BYTE)
>> +
>> +#define CONTEXT_STATE_START 0
>> +#define CONTEXT_STATE_END   (bits_per(CONTEXT_MAX - 1) - 1)
>
> Since you have non-overlapping *_START symbols, perhaps the *_END
> are superfluous?
>

They're only really there to tidy up the GENMASK() further down - it keeps
the range and index definitions in one hunk. I tried defining those directly
within the GENMASK()s themselves, but it got too ugly IMO.
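
For reference, with a 32-bit ->state and CONFIG_CONTEXT_TRACKING_WORK=y, the
above expands to:

  CT_STATE_SIZE      = 32
  CONTEXT_STATE_END  = bits_per(4 - 1) - 1 = 1
  CONTEXT_WORK_START = 2,  CONTEXT_WORK_END = 15
  RCU_DYNTICKS_START = 16, RCU_DYNTICKS_END = 31

  CT_STATE_MASK    = GENMASK(1, 0)   = 0x00000003
  CT_WORK_MASK     = GENMASK(15, 2)  = 0x0000fffc
  CT_DYNTICKS_MASK = GENMASK(31, 16) = 0xffff0000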

>> +
>> +#define RCU_DYNTICKS_BITS  (IS_ENABLED(CONFIG_CONTEXT_TRACKING_WORK) ? 16 : 31)
>> +#define RCU_DYNTICKS_START (CT_STATE_SIZE - RCU_DYNTICKS_BITS)
>> +#define RCU_DYNTICKS_END   (CT_STATE_SIZE - 1)
>> +#define RCU_DYNTICKS_IDX   BIT(RCU_DYNTICKS_START)
>
> Might be the right time to standardize and fix our naming:
>
> CT_STATE_START,
> CT_STATE_KERNEL,
> CT_STATE_USER,
> ...
> CT_WORK_START,
> CT_WORK_*,
> ...
> CT_RCU_DYNTICKS_START,
> CT_RCU_DYNTICKS_IDX
>

Heh, I have actually already done this for v3, though I hadn't touched the
RCU_DYNTICKS* family. I'll fold that in.

>> +bool ct_set_cpu_work(unsigned int cpu, unsigned int work)
>> +{
>> +	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
>> +	unsigned int old;
>> +	bool ret = false;
>> +
>> +	preempt_disable();
>> +
>> +	old = atomic_read(&ct->state);
>> +	/*
>> +	 * Try setting the work until either
>> +	 * - the target CPU no longer accepts any more deferred work
>> +	 * - the work has been set
>> +	 *
>> +	 * NOTE: CONTEXT_GUEST intersects with CONTEXT_USER and CONTEXT_IDLE
>> +	 * as they are regular integers rather than bits, but that doesn't
>> +	 * matter here: if any of the context state bits are set, the CPU isn't
>> +	 * in kernel context.
>> +	 */
>> +	while ((old & (CONTEXT_GUEST | CONTEXT_USER | CONTEXT_IDLE)) && !ret)
>
> That may still miss a recent entry to userspace due to the first plain read, ending
> with an undesired interrupt.
>
> You need at least one cmpxchg. Well, of course that stays racy by nature,
> because between the cmpxchg() returning CONTEXT_KERNEL and the IPI being
> raised and received, the remote CPU may have gone to userspace already. But
> it still narrows the window a little.
>

I can make that a 'do {} while ()' instead to force at least one execution
of the cmpxchg().

This is only about reducing the race window, right? If we're executing this
just as the target CPU is about to enter userspace, we're going to be in
racy territory anyway. Regardless, I'm happy to do that change.
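
Something like so (untested sketch; note that a successful cmpxchg against a
kernel-context value still has to count as a failure, since the work would
only be run upon a later kernel entry):

  old = atomic_read(&ct->state);
  do {
	  ret = atomic_try_cmpxchg(&ct->state, &old,
				   old | (work << CONTEXT_WORK_START));
  } while (!ret && (old & (CONTEXT_GUEST | CONTEXT_USER | CONTEXT_IDLE)));

  /* Failed attempts refresh @old, so this now checks a fresh value */
  ret = ret && (old & (CONTEXT_GUEST | CONTEXT_USER | CONTEXT_IDLE));

Work bits spuriously set against CONTEXT_KERNEL would be cleared, unexecuted,
at the next kernel exit, which is benign since we send the IPI in that case
anyway.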
Frederic Weisbecker July 24, 2023, 7:18 p.m. UTC | #3
On Mon, Jul 24, 2023 at 05:55:44PM +0100, Valentin Schneider wrote:
> I can make that a 'do {} while ()' instead to force at least one execution
> of the cmpxchg().
> 
> This is only about reducing the race window, right? If we're executing this
> just as the target CPU is about to enter userspace, we're going to be in
> racy territory anyway. Regardless, I'm happy to do that change.

Right, it's only about narrowing down the race window. It probably doesn't matter
in practice, but it's one less thing to consider for the brain :-)

Also, why bothering with handling CONTEXT_IDLE?

Thanks.
Valentin Schneider July 25, 2023, 10:10 a.m. UTC | #4
On 24/07/23 21:18, Frederic Weisbecker wrote:
> On Mon, Jul 24, 2023 at 05:55:44PM +0100, Valentin Schneider wrote:
>> I can make that a 'do {} while ()' instead to force at least one execution
>> of the cmpxchg().
>>
>> This is only about reducing the race window, right? If we're executing this
>> just as the target CPU is about to enter userspace, we're going to be in
>> racy territory anyway. Regardless, I'm happy to do that change.
>
> Right, it's only about narrowing down the race window. It probably doesn't matter
> in practice, but it's one less thing to consider for the brain :-)
>

Ack

> Also, why bothering with handling CONTEXT_IDLE?
>

I have reasons! I just swept them under the rug and didn't mention them :D
Also looking at the config dependencies again I got it wrong, but
nevertheless that means I get to ramble about it.

With NO_HZ_IDLE, we get CONTEXT_TRACKING_IDLE, so we get these
transitions:

  ct_idle_enter()
    ct_kernel_exit()
      ct_state_inc_clear_work()

  ct_idle_exit()
    ct_kernel_enter()
      ct_work_flush()

Now, if we just make CONTEXT_TRACKING_WORK depend on CONTEXT_TRACKING_IDLE
rather than CONTEXT_TRACKING_USER, we get to leverage the IPI deferral for
NO_HZ_IDLE kernels - in other words, we get to keep idle CPUs idle longer.

It's a completely different argument than reducing interference for
NOHZ_FULL userspace applications and I should have at the very least
mentioned it in the cover letter, but it's the exact same backing
mechanism.

Looking at it again, I'll probably make the CONTEXT_IDLE thing a separate
patch with a proper changelog.
Frederic Weisbecker July 25, 2023, 11:22 a.m. UTC | #5
On Tue, Jul 25, 2023 at 11:10:31AM +0100, Valentin Schneider wrote:
> I have reasons! I just swept them under the rug and didn't mention them :D
> Also looking at the config dependencies again I got it wrong, but
> nevertheless that means I get to ramble about it.
> 
> With NO_HZ_IDLE, we get CONTEXT_TRACKING_IDLE, so we get these
> transitions:
> 
>   ct_idle_enter()
>     ct_kernel_exit()
>       ct_state_inc_clear_work()
> 
>   ct_idle_exit()
>     ct_kernel_enter()
>       ct_work_flush()
> 
> Now, if we just make CONTEXT_TRACKING_WORK depend on CONTEXT_TRACKING_IDLE
> rather than CONTEXT_TRACKING_USER, we get to leverage the IPI deferral for
> NO_HZ_IDLE kernels - in other words, we get to keep idle CPUs idle longer.
> 
> It's a completely different argument than reducing interference for
> NOHZ_FULL userspace applications and I should have at the very least
> mentioned it in the cover letter, but it's the exact same backing
> mechanism.
> 
> Looking at it again, I'll probably make the CONTEXT_IDLE thing a separate
> patch with a proper changelog.

Ok, should that be a separate Kconfig? This can indeed bring a power improvement,
but at the cost of more overhead on the sender's side. A balance to be measured...
Valentin Schneider July 25, 2023, 1:05 p.m. UTC | #6
On 25/07/23 13:22, Frederic Weisbecker wrote:
> On Tue, Jul 25, 2023 at 11:10:31AM +0100, Valentin Schneider wrote:
>> I have reasons! I just swept them under the rug and didn't mention them :D
>> Also looking at the config dependencies again I got it wrong, but
>> nevertheless that means I get to ramble about it.
>>
>> With NO_HZ_IDLE, we get CONTEXT_TRACKING_IDLE, so we get these
>> transitions:
>>
>>   ct_idle_enter()
>>     ct_kernel_exit()
>>       ct_state_inc_clear_work()
>>
>>   ct_idle_exit()
>>     ct_kernel_enter()
>>       ct_work_flush()
>>
>> Now, if we just make CONTEXT_TRACKING_WORK depend on CONTEXT_TRACKING_IDLE
>> rather than CONTEXT_TRACKING_USER, we get to leverage the IPI deferral for
>> NO_HZ_IDLE kernels - in other words, we get to keep idle CPUs idle longer.
>>
>> It's a completely different argument than reducing interference for
>> NOHZ_FULL userspace applications and I should have at the very least
>> mentioned it in the cover letter, but it's the exact same backing
>> mechanism.
>>
>> Looking at it again, I'll probably make the CONTEXT_IDLE thing a separate
>> patch with a proper changelog.
>
> Ok, should that be a separate Kconfig? This can indeed bring a power improvement,
> but at the cost of more overhead on the sender's side. A balance to be measured...

Yep agreed, I'll make that an optional config.
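
Something like so, perhaps (a sketch, name subject to bikeshedding):

  config CONTEXT_TRACKING_WORK_IDLE
	  bool "Defer kernel-only IPIs targeting idle CPUs"
	  depends on HAVE_CONTEXT_TRACKING_WORK && CONTEXT_TRACKING_IDLE
	  help
	    Queue work-carrying IPIs aimed at idle CPUs rather than waking
	    them up, and run the work when they next enter the kernel. This
	    keeps idle CPUs idle longer at the cost of extra overhead on the
	    IPI sender's side.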

Patch

diff --git a/arch/Kconfig b/arch/Kconfig
index aff2746c8af28..1bcb3bbdddaad 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -871,6 +871,15 @@  config HAVE_CONTEXT_TRACKING_USER_OFFSTACK
 	  - No use of instrumentation, unless instrumentation_begin() got
 	    called.
 
+config HAVE_CONTEXT_TRACKING_WORK
+	bool
+	help
+	  Architecture supports deferring work while not in kernel context.
+	  This is especially useful on setups with isolated CPUs that might
+	  want to avoid being interrupted to perform housekeeping tasks (for
+	  ex. TLB invalidation or icache invalidation). The housekeeping
+	  operations are performed upon re-entering the kernel.
+
 config HAVE_TIF_NOHZ
 	bool
 	help
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7422db4097701..71481a80774f6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -198,6 +198,7 @@  config X86
 	select HAVE_CMPXCHG_LOCAL
 	select HAVE_CONTEXT_TRACKING_USER		if X86_64
 	select HAVE_CONTEXT_TRACKING_USER_OFFSTACK	if HAVE_CONTEXT_TRACKING_USER
+	select HAVE_CONTEXT_TRACKING_WORK		if X86_64
 	select HAVE_C_RECORDMCOUNT
 	select HAVE_OBJTOOL_MCOUNT		if HAVE_OBJTOOL
 	select HAVE_OBJTOOL_NOP_MCOUNT		if HAVE_OBJTOOL_MCOUNT
diff --git a/arch/x86/include/asm/context_tracking_work.h b/arch/x86/include/asm/context_tracking_work.h
new file mode 100644
index 0000000000000..5bc29e6b2ed38
--- /dev/null
+++ b/arch/x86/include/asm/context_tracking_work.h
@@ -0,0 +1,14 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CONTEXT_TRACKING_WORK_H
+#define _ASM_X86_CONTEXT_TRACKING_WORK_H
+
+static __always_inline void arch_context_tracking_work(int work)
+{
+	switch (work) {
+	case CONTEXT_WORK_n:
+		// Do work...
+		break;
+	}
+}
+
+#endif
diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index 6e76b9dba00e7..8aee086d0a25f 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -5,10 +5,15 @@ 
 #include <linux/sched.h>
 #include <linux/vtime.h>
 #include <linux/context_tracking_state.h>
+#include <linux/context_tracking_work.h>
 #include <linux/instrumentation.h>
 
 #include <asm/ptrace.h>
 
+#ifdef CONFIG_CONTEXT_TRACKING_WORK
+static_assert(CONTEXT_WORK_MAX_OFFSET <= CONTEXT_WORK_END + 1 - CONTEXT_WORK_START,
+	      "Not enough bits for CONTEXT_WORK");
+#endif
 
 #ifdef CONFIG_CONTEXT_TRACKING_USER
 extern void ct_cpu_track_user(int cpu);
@@ -131,6 +136,26 @@  static __always_inline unsigned long ct_state_inc(int incby)
 	return raw_atomic_add_return(incby, this_cpu_ptr(&context_tracking.state));
 }
 
+#ifdef CONFIG_CONTEXT_TRACKING_WORK
+static __always_inline unsigned long ct_state_inc_clear_work(int incby)
+{
+	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+	unsigned long new, old, state;
+
+	state = arch_atomic_read(&ct->state);
+	do {
+		old = state;
+		new = old & ~CONTEXT_WORK_MASK;
+		new += incby;
+		state = arch_atomic_cmpxchg(&ct->state, old, new);
+	} while (old != state);
+
+	return new;
+}
+#else
+#define ct_state_inc_clear_work(x) ct_state_inc(x)
+#endif
+
 static __always_inline bool warn_rcu_enter(void)
 {
 	bool ret = false;
diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index bbff5f7f88030..828fcdb801f73 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -9,21 +9,6 @@ 
 /* Offset to allow distinguishing irq vs. task-based idle entry/exit. */
 #define DYNTICK_IRQ_NONIDLE	((LONG_MAX / 2) + 1)
 
-enum ctx_state {
-	CONTEXT_DISABLED	= -1,	/* returned by ct_state() if unknown */
-	CONTEXT_KERNEL		= 0,
-	CONTEXT_IDLE		= 1,
-	CONTEXT_USER		= 2,
-	CONTEXT_GUEST		= 3,
-	CONTEXT_MAX		= 4,
-};
-
-/* Even value for idle, else odd. */
-#define RCU_DYNTICKS_IDX CONTEXT_MAX
-
-#define CT_STATE_MASK (CONTEXT_MAX - 1)
-#define CT_DYNTICKS_MASK (~CT_STATE_MASK)
-
 struct context_tracking {
 #ifdef CONFIG_CONTEXT_TRACKING_USER
 	/*
@@ -44,6 +29,53 @@  struct context_tracking {
 #endif
 };
 
+enum ctx_state {
+	/* Following are values */
+	CONTEXT_DISABLED	= -1,	/* returned by ct_state() if unknown */
+	CONTEXT_KERNEL		= 0,
+	CONTEXT_IDLE		= 1,
+	CONTEXT_USER		= 2,
+	CONTEXT_GUEST		= 3,
+	CONTEXT_MAX             = 4,
+};
+
+/*
+ * We cram three different things within the same atomic variable:
+ *
+ *                CONTEXT_STATE_END                        RCU_DYNTICKS_END
+ *                         |       CONTEXT_WORK_END                |
+ *                         |               |                       |
+ *                         v               v                       v
+ *         [ context_state ][ context work ][ RCU dynticks counter ]
+ *         ^                ^               ^
+ *         |                |               |
+ *         |        CONTEXT_WORK_START      |
+ * CONTEXT_STATE_START              RCU_DYNTICKS_START
+ */
+
+#define CT_STATE_SIZE (sizeof(((struct context_tracking *)0)->state) * BITS_PER_BYTE)
+
+#define CONTEXT_STATE_START 0
+#define CONTEXT_STATE_END   (bits_per(CONTEXT_MAX - 1) - 1)
+
+#define RCU_DYNTICKS_BITS  (IS_ENABLED(CONFIG_CONTEXT_TRACKING_WORK) ? 16 : 31)
+#define RCU_DYNTICKS_START (CT_STATE_SIZE - RCU_DYNTICKS_BITS)
+#define RCU_DYNTICKS_END   (CT_STATE_SIZE - 1)
+#define RCU_DYNTICKS_IDX   BIT(RCU_DYNTICKS_START)
+
+#define	CONTEXT_WORK_START (CONTEXT_STATE_END + 1)
+#define CONTEXT_WORK_END   (RCU_DYNTICKS_START - 1)
+
+/* Make sure all our bits are accounted for */
+static_assert((CONTEXT_STATE_END + 1 - CONTEXT_STATE_START) +
+	      (CONTEXT_WORK_END  + 1 - CONTEXT_WORK_START) +
+	      (RCU_DYNTICKS_END  + 1 - RCU_DYNTICKS_START) ==
+	      CT_STATE_SIZE);
+
+#define CT_STATE_MASK GENMASK(CONTEXT_STATE_END, CONTEXT_STATE_START)
+#define CT_WORK_MASK  GENMASK(CONTEXT_WORK_END, CONTEXT_WORK_START)
+#define CT_DYNTICKS_MASK GENMASK(RCU_DYNTICKS_END, RCU_DYNTICKS_START)
+
 #ifdef CONFIG_CONTEXT_TRACKING
 DECLARE_PER_CPU(struct context_tracking, context_tracking);
 #endif
diff --git a/include/linux/context_tracking_work.h b/include/linux/context_tracking_work.h
new file mode 100644
index 0000000000000..fb74db8876dd2
--- /dev/null
+++ b/include/linux/context_tracking_work.h
@@ -0,0 +1,26 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_CONTEXT_TRACKING_WORK_H
+#define _LINUX_CONTEXT_TRACKING_WORK_H
+
+#include <linux/bitops.h>
+
+enum {
+	CONTEXT_WORK_n_OFFSET,
+	CONTEXT_WORK_MAX_OFFSET
+};
+
+enum ct_work {
+	CONTEXT_WORK_n        = BIT(CONTEXT_WORK_n_OFFSET),
+	CONTEXT_WORK_MAX      = BIT(CONTEXT_WORK_MAX_OFFSET)
+};
+
+#include <asm/context_tracking_work.h>
+
+#ifdef CONFIG_CONTEXT_TRACKING_WORK
+extern bool ct_set_cpu_work(unsigned int cpu, unsigned int work);
+#else
+static inline bool
+ct_set_cpu_work(unsigned int cpu, unsigned int work) { return false; }
+#endif
+
+#endif
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index cc4f3a57f848c..1a3f6e355826d 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -72,6 +72,51 @@  static __always_inline void rcu_dynticks_task_trace_exit(void)
 #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
 }
 
+#ifdef CONFIG_CONTEXT_TRACKING_WORK
+static noinstr void ct_work_flush(unsigned long seq)
+{
+	int bit;
+
+	seq = (seq & CT_WORK_MASK) >> CONTEXT_WORK_START;
+
+	/*
+	 * arch_context_tracking_work() must be noinstr, non-blocking,
+	 * and NMI safe.
+	 */
+	for_each_set_bit(bit, &seq, CONTEXT_WORK_MAX)
+		arch_context_tracking_work(BIT(bit));
+}
+
+bool ct_set_cpu_work(unsigned int cpu, unsigned int work)
+{
+	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
+	unsigned int old;
+	bool ret = false;
+
+	preempt_disable();
+
+	old = atomic_read(&ct->state);
+	/*
+	 * Try setting the work until either
+	 * - the target CPU no longer accepts any more deferred work
+	 * - the work has been set
+	 *
+	 * NOTE: CONTEXT_GUEST intersects with CONTEXT_USER and CONTEXT_IDLE
+	 * as they are regular integers rather than bits, but that doesn't
+	 * matter here: if any of the context state bits are set, the CPU isn't
+	 * in kernel context.
+	 */
+	while ((old & (CONTEXT_GUEST | CONTEXT_USER | CONTEXT_IDLE)) && !ret)
+		ret = atomic_try_cmpxchg(&ct->state, &old, old | (work << CONTEXT_WORK_START));
+
+	preempt_enable();
+	return ret;
+}
+#else
+static __always_inline void ct_work_flush(unsigned long work) { }
+static __always_inline void ct_work_clear(struct context_tracking *ct) { }
+#endif
+
 /*
  * Record entry into an extended quiescent state.  This is only to be
  * called when not already in an extended quiescent state, that is,
@@ -88,7 +133,8 @@  static noinstr void ct_kernel_exit_state(int offset)
 	 * next idle sojourn.
 	 */
 	rcu_dynticks_task_trace_enter();  // Before ->dynticks update!
-	seq = ct_state_inc(offset);
+	seq = ct_state_inc_clear_work(offset);
+
 	// RCU is no longer watching.  Better be in extended quiescent state!
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICKS_IDX));
 }
@@ -100,7 +146,7 @@  static noinstr void ct_kernel_exit_state(int offset)
  */
 static noinstr void ct_kernel_enter_state(int offset)
 {
-	int seq;
+	unsigned long seq;
 
 	/*
 	 * CPUs seeing atomic_add_return() must see prior idle sojourns,
@@ -108,6 +154,7 @@  static noinstr void ct_kernel_enter_state(int offset)
 	 * critical section.
 	 */
 	seq = ct_state_inc(offset);
+	ct_work_flush(seq);
 	// RCU is now watching.  Better not be in an extended quiescent state!
 	rcu_dynticks_task_trace_exit();  // After ->dynticks update!
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICKS_IDX));
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index bae8f11070bef..fdb266f2d774b 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -181,6 +181,11 @@  config CONTEXT_TRACKING_USER_FORCE
 	  Say N otherwise, this option brings an overhead that you
 	  don't want in production.
 
+config CONTEXT_TRACKING_WORK
+	bool
+	depends on HAVE_CONTEXT_TRACKING_WORK && CONTEXT_TRACKING_USER
+	default y
+
 config NO_HZ
 	bool "Old Idle dynticks config"
 	help