@@ -952,6 +952,15 @@ config HAVE_CONTEXT_TRACKING_USER_OFFSTACK
- No use of instrumentation, unless instrumentation_begin() got
called.
+config HAVE_CONTEXT_TRACKING_WORK
+ bool
+ help
+ Architecture supports deferring work while not in kernel context.
+ This is especially useful on setups with isolated CPUs that might
+ want to avoid being interrupted to perform housekeeping tasks (e.g.
+ TLB invalidation or icache invalidation). The housekeeping
+ operations are performed upon re-entering the kernel.
+
config HAVE_TIF_NOHZ
bool
help
@@ -216,6 +216,7 @@ config X86
select HAVE_CMPXCHG_LOCAL
select HAVE_CONTEXT_TRACKING_USER if X86_64
select HAVE_CONTEXT_TRACKING_USER_OFFSTACK if HAVE_CONTEXT_TRACKING_USER
+ select HAVE_CONTEXT_TRACKING_WORK if X86_64
select HAVE_C_RECORDMCOUNT
select HAVE_OBJTOOL_MCOUNT if HAVE_OBJTOOL
select HAVE_OBJTOOL_NOP_MCOUNT if HAVE_OBJTOOL_MCOUNT
new file mode 100644
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CONTEXT_TRACKING_WORK_H
+#define _ASM_X86_CONTEXT_TRACKING_WORK_H
+
+static __always_inline void arch_context_tracking_work(enum ct_work work)
+{
+ switch (work) { /* ct_work_flush() passes one work bit per call */
+ case CT_WORK_n:
+ /* Placeholder: the handler for this work bit goes here. */
+ break;
+ case CT_WORK_MAX: /* NOTE(review): looks like an end marker, not real work - confirm */
+ WARN_ON_ONCE(true);
+ }
+}
+
+#endif
@@ -5,6 +5,7 @@
#include <linux/sched.h>
#include <linux/vtime.h>
#include <linux/context_tracking_state.h>
+#include <linux/context_tracking_work.h>
#include <linux/instrumentation.h>
#include <asm/ptrace.h>
@@ -137,6 +138,26 @@ static __always_inline unsigned long ct_state_inc(int incby)
return raw_atomic_add_return(incby, this_cpu_ptr(&context_tracking.state));
}
+#ifdef CONFIG_CONTEXT_TRACKING_WORK
+/* Add @incby to this CPU's state, dropping any work bits; returns the value stored. */
+static __always_inline unsigned long ct_state_inc_clear_work(int incby)
+{
+	struct context_tracking *ct = this_cpu_ptr(&context_tracking);
+	unsigned long seen = arch_atomic_read(&ct->state);
+	unsigned long expected, updated;
+
+	for (;;) {
+		expected = seen;
+		updated = (expected & ~CT_WORK_MASK) + incby;
+		seen = arch_atomic_cmpxchg(&ct->state, expected, updated);
+		if (seen == expected)	/* state did not change under us */
+			return updated;
+	}
+}
+#else /* !CONFIG_CONTEXT_TRACKING_WORK: falls back to the plain increment */
+#define ct_state_inc_clear_work(x) ct_state_inc(x)
+#endif /* CONFIG_CONTEXT_TRACKING_WORK */
+
static __always_inline bool warn_rcu_enter(void)
{
bool ret = false;
@@ -5,6 +5,7 @@
#include <linux/percpu.h>
#include <linux/static_key.h>
#include <linux/context_tracking_irq.h>
+#include <linux/context_tracking_work.h>
/* Offset to allow distinguishing irq vs. task-based idle entry/exit. */
#define CT_NESTING_IRQ_NONIDLE ((LONG_MAX / 2) + 1)
@@ -39,16 +40,19 @@ struct context_tracking {
};
/*
- * We cram two different things within the same atomic variable:
+ * We cram up to three different things within the same atomic variable:
*
- * CT_RCU_WATCHING_START CT_STATE_START
- * | |
- * v v
- * MSB [ RCU watching counter ][ context_state ] LSB
- * ^ ^
- * | |
- * CT_RCU_WATCHING_END CT_STATE_END
+ * CT_RCU_WATCHING_START CT_STATE_START
+ * | CT_WORK_START |
+ * | | |
+ * v v v
+ * MSB [ RCU watching counter ][ context work ][ context_state ] LSB
+ * ^ ^ ^
+ * | | |
+ * | CT_WORK_END |
+ * CT_RCU_WATCHING_END CT_STATE_END
*
+ * The [ context work ] region spans 0 bits if CONFIG_CONTEXT_TRACKING_WORK=n.
* Bits are used from the LSB upwards, so unused bits (if any) will always be in
* upper bits of the variable.
*/
@@ -59,18 +63,24 @@ struct context_tracking {
#define CT_STATE_START 0
#define CT_STATE_END (CT_STATE_START + CT_STATE_WIDTH - 1)
-#define CT_RCU_WATCHING_MAX_WIDTH (CT_SIZE - CT_STATE_WIDTH)
+#define CT_WORK_WIDTH (IS_ENABLED(CONFIG_CONTEXT_TRACKING_WORK) ? CT_WORK_MAX_OFFSET : 0)
+#define CT_WORK_START (CT_STATE_END + 1)
+#define CT_WORK_END (CT_WORK_START + CT_WORK_WIDTH - 1)
+
+#define CT_RCU_WATCHING_MAX_WIDTH (CT_SIZE - CT_WORK_WIDTH - CT_STATE_WIDTH)
#define CT_RCU_WATCHING_WIDTH (IS_ENABLED(CONFIG_RCU_DYNTICKS_TORTURE) ? 2 : CT_RCU_WATCHING_MAX_WIDTH)
-#define CT_RCU_WATCHING_START (CT_STATE_END + 1)
+#define CT_RCU_WATCHING_START (CT_WORK_END + 1)
#define CT_RCU_WATCHING_END (CT_RCU_WATCHING_START + CT_RCU_WATCHING_WIDTH - 1)
#define CT_RCU_WATCHING BIT(CT_RCU_WATCHING_START)
#define CT_STATE_MASK GENMASK(CT_STATE_END, CT_STATE_START)
+#define CT_WORK_MASK GENMASK(CT_WORK_END, CT_WORK_START)
#define CT_RCU_WATCHING_MASK GENMASK(CT_RCU_WATCHING_END, CT_RCU_WATCHING_START)
#define CT_UNUSED_WIDTH (CT_RCU_WATCHING_MAX_WIDTH - CT_RCU_WATCHING_WIDTH)
static_assert(CT_STATE_WIDTH +
+ CT_WORK_WIDTH +
CT_RCU_WATCHING_WIDTH +
CT_UNUSED_WIDTH ==
CT_SIZE);
new file mode 100644
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_CONTEXT_TRACKING_WORK_H
+#define _LINUX_CONTEXT_TRACKING_WORK_H
+
+#include <linux/bitops.h>
+
+enum { /* bit positions inside the context work field */
+ CT_WORK_n_OFFSET, /* NOTE(review): looks like a placeholder entry - confirm */
+ CT_WORK_MAX_OFFSET /* also used as the work field width (CT_WORK_WIDTH) */
+};
+
+enum ct_work { /* work flags, as passed to ct_set_cpu_work() */
+ CT_WORK_n = BIT(CT_WORK_n_OFFSET),
+ CT_WORK_MAX = BIT(CT_WORK_MAX_OFFSET) /* first bit past the last work flag */
+};
+
+#include <asm/context_tracking_work.h>
+
+#ifdef CONFIG_CONTEXT_TRACKING_WORK
+extern bool ct_set_cpu_work(unsigned int cpu, enum ct_work work);
+#else /* !CONFIG_CONTEXT_TRACKING_WORK: stub with the same prototype, always false */
+static inline bool
+ct_set_cpu_work(unsigned int cpu, enum ct_work work) { return false; }
+#endif /* CONFIG_CONTEXT_TRACKING_WORK */
+
+#endif
@@ -72,6 +72,67 @@ static __always_inline void rcu_task_trace_heavyweight_exit(void)
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}
+#ifdef CONFIG_CONTEXT_TRACKING_WORK
+/* Run each deferred work item whose bit is set in the work field of @seq. */
+static noinstr void ct_work_flush(unsigned long seq)
+{
+	unsigned long pending = (seq & CT_WORK_MASK) >> CT_WORK_START;
+	int bit;
+
+	/*
+	 * Size is a bit count, not the CT_WORK_MAX mask. The arch callback
+	 * must be noinstr, non-blocking, and NMI safe.
+	 */
+	for_each_set_bit(bit, &pending, CT_WORK_MAX_OFFSET)
+		arch_context_tracking_work(BIT(bit));
+}
+
+/**
+ * ct_set_cpu_work - set work to be run at next kernel context entry
+ * @cpu: CPU whose context tracking state is targeted
+ * @work: work bit to set, see arch_context_tracking_work()
+ *
+ * If @cpu is not currently executing in kernelspace, it will execute the
+ * callback mapped to @work at its next transition to CT_STATE_KERNEL.
+ * If it is already in CT_STATE_KERNEL, this will be a no-op.
+ *
+ * Return: true if the work bit was set; false if context tracking is not
+ * active on @cpu or @cpu was observed in kernelspace.
+ */
+bool ct_set_cpu_work(unsigned int cpu, enum ct_work work)
+{
+	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
+	int old;	/* int, as atomic_try_cmpxchg() expects */
+	bool ret = false;
+
+	if (!ct->active)
+		return false;
+
+	preempt_disable();
+
+	old = atomic_read(&ct->state);
+
+	/*
+	 * We only want to set the work bit if the target CPU is not in
+	 * kernelspace, so we clear the KERNEL bit here and let the cmpxchg do
+	 * the check for us - the state could change between the atomic_read()
+	 * and the cmpxchg().
+	 * NOTE(review): this mask is a no-op if CT_STATE_KERNEL is 0 - confirm.
+	 */
+	old &= ~CT_STATE_KERNEL;
+	/* Retry until the work is set or the CPU has entered kernelspace. */
+	do {
+		ret = atomic_try_cmpxchg(&ct->state, &old, old | (work << CT_WORK_START));
+	} while (!ret && ((old & CT_STATE_MASK) != CT_STATE_KERNEL));
+
+	preempt_enable();
+	return ret;
+}
+#else /* !CONFIG_CONTEXT_TRACKING_WORK: empty stubs */
+static __always_inline void ct_work_flush(unsigned long work) { }
+static __always_inline void ct_work_clear(struct context_tracking *ct) { } /* NOTE(review): no user visible here - still needed? */
+#endif /* CONFIG_CONTEXT_TRACKING_WORK */
+
/*
* Record entry into an extended quiescent state. This is only to be
* called when not already in an extended quiescent state, that is,
@@ -88,7 +149,7 @@ static noinstr void ct_kernel_exit_state(int offset)
* next idle sojourn.
*/
rcu_task_trace_heavyweight_enter(); // Before CT state update!
- seq = ct_state_inc(offset);
+ seq = ct_state_inc_clear_work(offset);
// RCU is no longer watching. Better be in extended quiescent state!
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & CT_RCU_WATCHING));
}
@@ -100,7 +161,7 @@ static noinstr void ct_kernel_exit_state(int offset)
*/
static noinstr void ct_kernel_enter_state(int offset)
{
- int seq;
+ unsigned long seq;
/*
* CPUs seeing atomic_add_return() must see prior idle sojourns,
@@ -108,6 +169,7 @@ static noinstr void ct_kernel_enter_state(int offset)
* critical section.
*/
seq = ct_state_inc(offset);
+ ct_work_flush(seq);
// RCU is now watching. Better not be in an extended quiescent state!
rcu_task_trace_heavyweight_exit(); // After CT state update!
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & CT_RCU_WATCHING));
@@ -181,6 +181,11 @@ config CONTEXT_TRACKING_USER_FORCE
Say N otherwise, this option brings an overhead that you
don't want in production.
+config CONTEXT_TRACKING_WORK
+ bool
+ depends on HAVE_CONTEXT_TRACKING_WORK && CONTEXT_TRACKING_USER
+ default y
+
config NO_HZ
bool "Old Idle dynticks config"
help