Message ID | 20230705181256.3539027-12-vschneid@redhat.com (mailing list archive) |
---|---|
State | RFC |
Headers | show |
Series | context_tracking,x86: Defer some IPIs until a user->kernel transition | expand |
Context | Check | Description |
---|---|---|
netdev/tree_selection | success | Not a local patch |
Le Wed, Jul 05, 2023 at 07:12:53PM +0100, Valentin Schneider a écrit : > +bool ct_set_cpu_work(unsigned int cpu, unsigned int work) > +{ > + struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); > + unsigned int old_work; > + bool ret = false; > + > + preempt_disable(); > + > + old_work = atomic_read(&ct->work); > + /* > + * Try setting the work until either > + * - the target CPU no longer accepts any more deferred work > + * - the work has been set > + */ > + while (!(old_work & CONTEXT_WORK_DISABLED) && !ret) Isn't there a race here where you may have missed a CPU that just entered in user and you eventually disturb it? > + ret = atomic_try_cmpxchg(&ct->work, &old_work, old_work | work); > + > + preempt_enable(); > + return ret; > +} [...] > @@ -100,14 +158,19 @@ static noinstr void ct_kernel_exit_state(int offset) > */ > static noinstr void ct_kernel_enter_state(int offset) > { > + struct context_tracking *ct = this_cpu_ptr(&context_tracking); > int seq; > + unsigned int work; > > + work = ct_work_fetch(ct); So this adds another fully ordered operation on user <-> kernel transition. How many such IPIs can we expect? If this is just about a dozen, can we stuff them in the state like in the following? We can potentially add more of them especially on 64 bits we could afford 30 different works, this is just shrinking the RCU extended quiescent state counter space. Worst case that can happen is that RCU misses 65535 idle/user <-> kernel transitions and delays a grace period... diff --git a/arch/Kconfig b/arch/Kconfig index 205fd23e0cad..e453e9fb864b 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -851,6 +851,15 @@ config HAVE_CONTEXT_TRACKING_USER_OFFSTACK - No use of instrumentation, unless instrumentation_begin() got called. +config HAVE_CONTEXT_TRACKING_WORK + bool + help + Architecture supports deferring work while not in kernel context. + This is especially useful on setups with isolated CPUs that might + want to avoid being interrupted to perform housekeeping tasks (for + ex. TLB invalidation or icache invalidation). The housekeeping + operations are performed upon re-entering the kernel. + config HAVE_TIF_NOHZ bool help diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 53bab123a8ee..490c773105c0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -197,6 +197,7 @@ config X86 select HAVE_CMPXCHG_LOCAL select HAVE_CONTEXT_TRACKING_USER if X86_64 select HAVE_CONTEXT_TRACKING_USER_OFFSTACK if HAVE_CONTEXT_TRACKING_USER + select HAVE_CONTEXT_TRACKING_WORK if X86_64 select HAVE_C_RECORDMCOUNT select HAVE_OBJTOOL_MCOUNT if HAVE_OBJTOOL select HAVE_OBJTOOL_NOP_MCOUNT if HAVE_OBJTOOL_MCOUNT diff --git a/arch/x86/include/asm/context_tracking_work.h b/arch/x86/include/asm/context_tracking_work.h new file mode 100644 index 000000000000..5bc29e6b2ed3 --- /dev/null +++ b/arch/x86/include/asm/context_tracking_work.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_CONTEXT_TRACKING_WORK_H +#define _ASM_X86_CONTEXT_TRACKING_WORK_H + +static __always_inline void arch_context_tracking_work(int work) +{ + switch (work) { + case CONTEXT_WORK_n: + // Do work... + break; + } +} + +#endif diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index d3cbb6c16bab..333b26d7cbe5 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -5,6 +5,7 @@ #include <linux/sched.h> #include <linux/vtime.h> #include <linux/context_tracking_state.h> +#include <linux/context_tracking_work.h> #include <linux/instrumentation.h> #include <asm/ptrace.h> @@ -75,7 +76,7 @@ static inline void exception_exit(enum ctx_state prev_ctx) static __always_inline bool context_tracking_guest_enter(void) { if (context_tracking_enabled()) - __ct_user_enter(CONTEXT_GUEST); + __ct_user_enter(CONTEXT_USER); return context_tracking_enabled_this_cpu(); } @@ -83,7 +84,7 @@ static __always_inline bool context_tracking_guest_enter(void) static __always_inline void context_tracking_guest_exit(void) { if (context_tracking_enabled()) - __ct_user_exit(CONTEXT_GUEST); + __ct_user_exit(CONTEXT_USER); } #define CT_WARN_ON(cond) WARN_ON(context_tracking_enabled() && (cond)) @@ -122,6 +123,26 @@ static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void) return !(arch_atomic_read(this_cpu_ptr(&context_tracking.state)) & RCU_DYNTICKS_IDX); } +/* + * Increment the current CPU's context_tracking structure's ->state field + * with ordering and clear the work bits. Return the new value. + */ +static __always_inline unsigned long ct_state_inc_clear_work(int incby) +{ + struct context_tracking *ct = this_cpu_ptr(&context_tracking); + unsigned long new, old, state; + + state = arch_atomic_read(&ct->state); + do { + old = state; + new = old & ~CONTEXT_WORK_MASK; + new += incby; + state = arch_atomic_cmpxchg(&ct->state, old, new); + } while (old != state); + + return state; +} + /* * Increment the current CPU's context_tracking structure's ->state field * with ordering. Return the new value. diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index fdd537ea513f..ec3d172601c5 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -10,14 +10,19 @@ #define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1) enum ctx_state { + /* Following are values */ CONTEXT_DISABLED = -1, /* returned by ct_state() if unknown */ CONTEXT_KERNEL = 0, CONTEXT_IDLE = 1, CONTEXT_USER = 2, - CONTEXT_GUEST = 3, - CONTEXT_MAX = 4, + /* Following are bit numbers */ + CONTEXT_WORK = 2, + CONTEXT_MAX = 16, }; +#define CONTEXT_MASK (BIT(CONTEXT_WORK) - 1) +#define CONTEXT_WORK_MASK ((BIT(CONTEXT_MAX) - 1) & ~(BIT(CONTEXT_WORK) - 1)) + /* Even value for idle, else odd. */ #define RCU_DYNTICKS_IDX CONTEXT_MAX diff --git a/include/linux/context_tracking_work.h b/include/linux/context_tracking_work.h new file mode 100644 index 000000000000..fb74db8876dd --- /dev/null +++ b/include/linux/context_tracking_work.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CONTEXT_TRACKING_WORK_H +#define _LINUX_CONTEXT_TRACKING_WORK_H + +#include <linux/bitops.h> + +enum { + CONTEXT_WORK_n_OFFSET, + CONTEXT_WORK_MAX_OFFSET +}; + +enum ct_work { + CONTEXT_WORK_n = BIT(CONTEXT_WORK_n_OFFSET), + CONTEXT_WORK_MAX = BIT(CONTEXT_WORK_MAX_OFFSET) +}; + +#include <asm/context_tracking_work.h> + +#ifdef CONFIG_CONTEXT_TRACKING_WORK +extern bool ct_set_cpu_work(unsigned int cpu, unsigned int work); +#else +static inline bool +ct_set_cpu_work(unsigned int cpu, unsigned int work) { return false; } +#endif + +#endif diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index a09f1c19336a..732042b9a7b7 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -72,6 +72,58 @@ static __always_inline void rcu_dynticks_task_trace_exit(void) #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } +#ifdef CONFIG_CONTEXT_TRACKING_WORK +static noinstr void ct_work_flush(unsigned long seq) +{ + unsigned int bit; + /* + * arch_context_tracking_work() must be noinstr, non-blocking, + * and NMI safe. + */ + for_each_set_bit(bit, &seq, CONTEXT_MAX) + arch_context_tracking_work(BIT(bit) >> CONTEXT_WORK); +} + +bool ct_set_cpu_work(unsigned int cpu, unsigned int work) +{ + struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); + unsigned int old, new, state; + bool ret = false; + + preempt_disable(); + + work <<= CONTEXT_WORK; + state = atomic_read(&ct->state); + /* + * Try setting the work until either + * - the target CPU is on the kernel + * - the work has been set + */ + for (;;) { + /* Only set if running in user/guest */ + old = state; + old &= ~CONTEXT_MASK; + old |= CONTEXT_USER; + + new = old | work; + + state = atomic_cmpxchg(&ct->state, old, new); + if (state & work) { + ret = true; + break; + } + + if ((state & CONTEXT_MASK) != CONTEXT_USER) + break; + } + + preempt_enable(); + return ret; +} +#else +static __always_inline void ct_work_flush(unsigned long seq) { } +#endif + /* * Record entry into an extended quiescent state. This is only to be * called when not already in an extended quiescent state, that is, @@ -100,14 +152,18 @@ static noinstr void ct_kernel_exit_state(int offset) */ static noinstr void ct_kernel_enter_state(int offset) { - int seq; + struct context_tracking *ct = this_cpu_ptr(&context_tracking); + unsigned long seq; /* * CPUs seeing atomic_add_return() must see prior idle sojourns, * and we also must force ordering with the next RCU read-side * critical section. */ - seq = ct_state_inc(offset); + seq = ct_state_inc_clear_work(offset); + if (seq & CONTEXT_WORK_MASK) + ct_work_flush(seq & CONTEXT_WORK_MASK); + // RCU is now watching. Better not be in an extended quiescent state! rcu_dynticks_task_trace_exit(); // After ->dynticks update! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICKS_IDX)); diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index bae8f11070be..fdb266f2d774 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -181,6 +181,11 @@ config CONTEXT_TRACKING_USER_FORCE Say N otherwise, this option brings an overhead that you don't want in production. +config CONTEXT_TRACKING_WORK + bool + depends on HAVE_CONTEXT_TRACKING_WORK && CONTEXT_TRACKING_USER + default y + config NO_HZ bool "Old Idle dynticks config" help
On Wed, Jul 05, 2023 at 07:12:53PM +0100, Valentin Schneider wrote: > Note: A previous approach by PeterZ [1] used an extra bit in > context_tracking.state to flag the presence of deferred callbacks to > execute, and the actual callbacks were stored in a separate atomic > variable. > > This meant that the atomic read of context_tracking.state was sufficient to > determine whether there are any deferred callbacks to execute. > Unfortunately, it presents a race window. Consider the work setting > function as: > > preempt_disable(); > seq = atomic_read(&ct->seq); > if (__context_tracking_seq_in_user(seq)) { > /* ctrl-dep */ > atomic_or(work, &ct->work); > ret = atomic_try_cmpxchg(&ct->seq, &seq, seq|CT_SEQ_WORK); > } > preempt_enable(); > > return ret; > > Then the following can happen: > > CPUx CPUy > CT_SEQ_WORK \in context_tracking.state > atomic_or(WORK_N, &ct->work); > ct_kernel_enter() > ct_state_inc(); > atomic_try_cmpxchg(&ct->seq, &seq, seq|CT_SEQ_WORK); > > The cmpxchg() would fail, ultimately causing an IPI for WORK_N to be > sent. Unfortunately, the work bit would remain set, and it can't be sanely > cleared in case another CPU set it concurrently - this would ultimately > lead to a double execution of the callback, one as a deferred callback and > one in the IPI. As not all IPI callbacks are idempotent, this is > undesirable. So adding another atomic is arguably worse. The thing is, if the NOHZ_FULL CPU is actually doing context transitions (SYSCALLs etc..) then everything is fundamentally racy, there is no winning that game, we could find the remote CPU is in-kernel, send an IPI, the remote CPU does return-to-user and receives the IPI. And then the USER is upset... because he got an IPI. The whole NOHZ_FULL thing really only works if userspace does not do SYSCALLs. But the sad sad state of affairs is that some people think it is acceptable to do SYSCALLs while NOHZ_FULL and cry about how slow stuff is.
On Thu, Jul 06, 2023 at 12:23:57AM +0200, Frederic Weisbecker wrote: > If this is just about a dozen, can we stuff them in the state like in the > following? We can potentially add more of them especially on 64 bits we could > afford 30 different works, this is just shrinking the RCU extended quiescent > state counter space. Worst case that can happen is that RCU misses 65535 > idle/user <-> kernel transitions and delays a grace period... We can make all this a 64bit only feature and use atomic_long_t :-)
On Thu, Jul 06, 2023 at 12:41:04AM +0200, Peter Zijlstra wrote: > On Thu, Jul 06, 2023 at 12:23:57AM +0200, Frederic Weisbecker wrote: > > If this is just about a dozen, can we stuff them in the state like in the > > following? We can potentially add more of them especially on 64 bits we could > > afford 30 different works, this is just shrinking the RCU extended quiescent > > state counter space. Worst case that can happen is that RCU misses 65535 > > idle/user <-> kernel transitions and delays a grace period... > > We can make all this a 64bit only feature and use atomic_long_t :-) Works for me :)
On Thu, Jul 06, 2023 at 12:23:57AM +0200, Frederic Weisbecker wrote: > diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h > index fdd537ea513f..ec3d172601c5 100644 > --- a/include/linux/context_tracking_state.h > +++ b/include/linux/context_tracking_state.h > @@ -10,14 +10,19 @@ > #define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1) > > enum ctx_state { > + /* Following are values */ > CONTEXT_DISABLED = -1, /* returned by ct_state() if unknown */ > CONTEXT_KERNEL = 0, > CONTEXT_IDLE = 1, > CONTEXT_USER = 2, > - CONTEXT_GUEST = 3, > - CONTEXT_MAX = 4, > + /* Following are bit numbers */ > + CONTEXT_WORK = 2, > + CONTEXT_MAX = 16, > }; > > +#define CONTEXT_MASK (BIT(CONTEXT_WORK) - 1) > +#define CONTEXT_WORK_MASK ((BIT(CONTEXT_MAX) - 1) & ~(BIT(CONTEXT_WORK) - 1)) > + > /* Even value for idle, else odd. */ > #define RCU_DYNTICKS_IDX CONTEXT_MAX And that should be: #define RCU_DYNTICKS_IDX BIT(CONTEXT_MAX) Did I mention it's not even build tested? :o)
On 06/07/23 00:23, Frederic Weisbecker wrote: > Le Wed, Jul 05, 2023 at 07:12:53PM +0100, Valentin Schneider a écrit : >> +bool ct_set_cpu_work(unsigned int cpu, unsigned int work) >> +{ >> + struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); >> + unsigned int old_work; >> + bool ret = false; >> + >> + preempt_disable(); >> + >> + old_work = atomic_read(&ct->work); >> + /* >> + * Try setting the work until either >> + * - the target CPU no longer accepts any more deferred work >> + * - the work has been set >> + */ >> + while (!(old_work & CONTEXT_WORK_DISABLED) && !ret) > > Isn't there a race here where you may have missed a CPU that just entered in > user and you eventually disturb it? > Yes, unfortunately. >> + ret = atomic_try_cmpxchg(&ct->work, &old_work, old_work | work); >> + >> + preempt_enable(); >> + return ret; >> +} > [...] >> @@ -100,14 +158,19 @@ static noinstr void ct_kernel_exit_state(int offset) >> */ >> static noinstr void ct_kernel_enter_state(int offset) >> { >> + struct context_tracking *ct = this_cpu_ptr(&context_tracking); >> int seq; >> + unsigned int work; >> >> + work = ct_work_fetch(ct); > > So this adds another fully ordered operation on user <-> kernel transition. > How many such IPIs can we expect? > Despite having spent quite a lot of time on that question, I think I still only have a hunch. Poking around RHEL systems, I'd say 99% of the problematic IPIs are instruction patching and TLB flushes. Staring at the code, there's quite a lot of smp_calls for which it's hard to say whether the target CPUs can actually be isolated or not (e.g. the CPU comes from a cpumask shoved in a struct that was built using data from another struct of uncertain origins), but then again some of them don't need to hook into context_tracking. Long story short: I /think/ we can consider that number to be fairly small, but there could be more lurking in the shadows. > If this is just about a dozen, can we stuff them in the state like in the > following? We can potentially add more of them especially on 64 bits we could > afford 30 different works, this is just shrinking the RCU extended quiescent > state counter space. Worst case that can happen is that RCU misses 65535 > idle/user <-> kernel transitions and delays a grace period... > I'm trying to grok how this impacts RCU, IIUC most of RCU mostly cares about the even/odd-ness of the thing, and rcu_gp_fqs() cares about the actual value but only to check if it has changed over time (rcu_dynticks_in_eqs_since() only does a !=). I'm rephrasing here to make sure I get it - is it then that the worst case here is 2^(dynticks_counter_size) transitions happen between saving the dynticks snapshot and checking it again, so RCU waits some more?
On 06/07/23 00:39, Peter Zijlstra wrote: > On Wed, Jul 05, 2023 at 07:12:53PM +0100, Valentin Schneider wrote: > >> Note: A previous approach by PeterZ [1] used an extra bit in >> context_tracking.state to flag the presence of deferred callbacks to >> execute, and the actual callbacks were stored in a separate atomic >> variable. >> >> This meant that the atomic read of context_tracking.state was sufficient to >> determine whether there are any deferred callbacks to execute. >> Unfortunately, it presents a race window. Consider the work setting >> function as: >> >> preempt_disable(); >> seq = atomic_read(&ct->seq); >> if (__context_tracking_seq_in_user(seq)) { >> /* ctrl-dep */ >> atomic_or(work, &ct->work); >> ret = atomic_try_cmpxchg(&ct->seq, &seq, seq|CT_SEQ_WORK); >> } >> preempt_enable(); >> >> return ret; >> >> Then the following can happen: >> >> CPUx CPUy >> CT_SEQ_WORK \in context_tracking.state >> atomic_or(WORK_N, &ct->work); >> ct_kernel_enter() >> ct_state_inc(); >> atomic_try_cmpxchg(&ct->seq, &seq, seq|CT_SEQ_WORK); >> >> The cmpxchg() would fail, ultimately causing an IPI for WORK_N to be >> sent. Unfortunately, the work bit would remain set, and it can't be sanely >> cleared in case another CPU set it concurrently - this would ultimately >> lead to a double execution of the callback, one as a deferred callback and >> one in the IPI. As not all IPI callbacks are idempotent, this is >> undesirable. > > So adding another atomic is arguably worse. > > The thing is, if the NOHZ_FULL CPU is actually doing context transitions > (SYSCALLs etc..) then everything is fundamentally racy, there is no > winning that game, we could find the remote CPU is in-kernel, send an > IPI, the remote CPU does return-to-user and receives the IPI. > > And then the USER is upset... because he got an IPI. > Yeah, that part is inevitably racy. The thing I was especially worried about was the potential double executions (once in IPI, again in deferred work). It's not /too/ bad as the only two deferred callbacks I'm introducing here are costly-but-stateless, but IMO is a bad foundation. But it seems like we can reuse the existing atomic and squeeze some bits in there, so let's see how that goes :-)
On Thu, Jul 06, 2023 at 12:30:46PM +0100, Valentin Schneider wrote: > >> + ret = atomic_try_cmpxchg(&ct->work, &old_work, old_work | work); > >> + > >> + preempt_enable(); > >> + return ret; > >> +} > > [...] > >> @@ -100,14 +158,19 @@ static noinstr void ct_kernel_exit_state(int offset) > >> */ > >> static noinstr void ct_kernel_enter_state(int offset) > >> { > >> + struct context_tracking *ct = this_cpu_ptr(&context_tracking); > >> int seq; > >> + unsigned int work; > >> > >> + work = ct_work_fetch(ct); > > > > So this adds another fully ordered operation on user <-> kernel transition. > > How many such IPIs can we expect? > > > > Despite having spent quite a lot of time on that question, I think I still > only have a hunch. > > Poking around RHEL systems, I'd say 99% of the problematic IPIs are > instruction patching and TLB flushes. > > Staring at the code, there's quite a lot of smp_calls for which it's hard > to say whether the target CPUs can actually be isolated or not (e.g. the > CPU comes from a cpumask shoved in a struct that was built using data from > another struct of uncertain origins), but then again some of them don't > need to hook into context_tracking. > > Long story short: I /think/ we can consider that number to be fairly small, > but there could be more lurking in the shadows. I guess it will still be time to reconsider the design if we ever reach such size. > > > If this is just about a dozen, can we stuff them in the state like in the > > following? We can potentially add more of them especially on 64 bits we could > > afford 30 different works, this is just shrinking the RCU extended quiescent > > state counter space. Worst case that can happen is that RCU misses 65535 > > idle/user <-> kernel transitions and delays a grace period... > > > > I'm trying to grok how this impacts RCU, IIUC most of RCU mostly cares about the > even/odd-ness of the thing, and rcu_gp_fqs() cares about the actual value > but only to check if it has changed over time (rcu_dynticks_in_eqs_since() > only does a !=). > > I'm rephrasing here to make sure I get it - is it then that the worst case > here is 2^(dynticks_counter_size) transitions happen between saving the > dynticks snapshot and checking it again, so RCU waits some more? That's my understanding as well but I have to defer on Paul to make sure I'm not overlooking something. Thanks.
On Thu, Jul 06, 2023 at 12:23:57AM +0200, Frederic Weisbecker wrote: > Le Wed, Jul 05, 2023 at 07:12:53PM +0100, Valentin Schneider a écrit : > +bool ct_set_cpu_work(unsigned int cpu, unsigned int work) > +{ > + struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); > + unsigned int old, new, state; > + bool ret = false; > + > + preempt_disable(); > + > + work <<= CONTEXT_WORK; > + state = atomic_read(&ct->state); > + /* > + * Try setting the work until either > + * - the target CPU is on the kernel > + * - the work has been set > + */ > + for (;;) { > + /* Only set if running in user/guest */ > + old = state; > + old &= ~CONTEXT_MASK; > + old |= CONTEXT_USER; > + > + new = old | work; > + > + state = atomic_cmpxchg(&ct->state, old, new); > + if (state & work) { And this should be "if (state == old)", otherwise there is a risk that someone else had set the work but atomic_cmpxchg() failed due to other modifications is the meantime. It's then dangerous in that case to defer the work because atomic_cmpxchg() failures don't imply full ordering. So there is a risk that the target executes the work but doesn't see the most recent data. Thanks.
On Thu, Jul 06, 2023 at 01:40:14PM +0200, Frederic Weisbecker wrote: > On Thu, Jul 06, 2023 at 12:30:46PM +0100, Valentin Schneider wrote: > > >> + ret = atomic_try_cmpxchg(&ct->work, &old_work, old_work | work); > > >> + > > >> + preempt_enable(); > > >> + return ret; > > >> +} > > > [...] > > >> @@ -100,14 +158,19 @@ static noinstr void ct_kernel_exit_state(int offset) > > >> */ > > >> static noinstr void ct_kernel_enter_state(int offset) > > >> { > > >> + struct context_tracking *ct = this_cpu_ptr(&context_tracking); > > >> int seq; > > >> + unsigned int work; > > >> > > >> + work = ct_work_fetch(ct); > > > > > > So this adds another fully ordered operation on user <-> kernel transition. > > > How many such IPIs can we expect? > > > > > > > Despite having spent quite a lot of time on that question, I think I still > > only have a hunch. > > > > Poking around RHEL systems, I'd say 99% of the problematic IPIs are > > instruction patching and TLB flushes. > > > > Staring at the code, there's quite a lot of smp_calls for which it's hard > > to say whether the target CPUs can actually be isolated or not (e.g. the > > CPU comes from a cpumask shoved in a struct that was built using data from > > another struct of uncertain origins), but then again some of them don't > > need to hook into context_tracking. > > > > Long story short: I /think/ we can consider that number to be fairly small, > > but there could be more lurking in the shadows. > > I guess it will still be time to reconsider the design if we ever reach such size. > > > > If this is just about a dozen, can we stuff them in the state like in the > > > following? We can potentially add more of them especially on 64 bits we could > > > afford 30 different works, this is just shrinking the RCU extended quiescent > > > state counter space. Worst case that can happen is that RCU misses 65535 > > > idle/user <-> kernel transitions and delays a grace period... > > > > > > > I'm trying to grok how this impacts RCU, IIUC most of RCU mostly cares about the > > even/odd-ness of the thing, and rcu_gp_fqs() cares about the actual value > > but only to check if it has changed over time (rcu_dynticks_in_eqs_since() > > only does a !=). > > > > I'm rephrasing here to make sure I get it - is it then that the worst case > > here is 2^(dynticks_counter_size) transitions happen between saving the > > dynticks snapshot and checking it again, so RCU waits some more? > > That's my understanding as well but I have to defer on Paul to make sure I'm > not overlooking something. That does look plausible to me. And yes, RCU really cares about whether its part of this counter has been a multiple of two during a given interval of time, because this indicates that the CPU has no pre-existing RCU readers still active. One way that this can happen is for that value to be a multiple of two at some point in time. The other way that this can happen is for the value to have changed. No matter what the start and end values, if they are different, the counter must necessarily have at least passed through multiple of two in the meantime, again guaranteeing that any RCU readers that around when the count was first fetched have now finished. But we should take the machine's opinions much more seriously than we take any of our own opinions. Why not adjust RCU_DYNTICKS_IDX so as to crank RCU's portion of this counter down to (say) two or three bits and let rcutorture have at it on TREE04 or TREE07, both of which have nohz_full CPUs? Maybe also adjust mkinitrd.sh to make the user/kernel transitions more frequent? Please note that I do -not- recommend production use of a three-bit (let alone a two-bit) RCU portion because this has a high probability of excessively extending grace periods. But it might be good to keep a tiny counter as a debug option so that we regularly rcutorture it. Thanx, Paul
On 06/07/23 09:39, Paul E. McKenney wrote: > On Thu, Jul 06, 2023 at 01:40:14PM +0200, Frederic Weisbecker wrote: >> On Thu, Jul 06, 2023 at 12:30:46PM +0100, Valentin Schneider wrote: >> > I'm trying to grok how this impacts RCU, IIUC most of RCU mostly cares about the >> > even/odd-ness of the thing, and rcu_gp_fqs() cares about the actual value >> > but only to check if it has changed over time (rcu_dynticks_in_eqs_since() >> > only does a !=). >> > >> > I'm rephrasing here to make sure I get it - is it then that the worst case >> > here is 2^(dynticks_counter_size) transitions happen between saving the >> > dynticks snapshot and checking it again, so RCU waits some more? >> >> That's my understanding as well but I have to defer on Paul to make sure I'm >> not overlooking something. > > That does look plausible to me. > > And yes, RCU really cares about whether its part of this counter has > been a multiple of two during a given interval of time, because this > indicates that the CPU has no pre-existing RCU readers still active. > One way that this can happen is for that value to be a multiple of two > at some point in time. The other way that this can happen is for the > value to have changed. No matter what the start and end values, if they > are different, the counter must necessarily have at least passed through > multiple of two in the meantime, again guaranteeing that any RCU readers > that around when the count was first fetched have now finished. > Thank you for the demystification! > But we should take the machine's opinions much more seriously than we > take any of our own opinions. Heh :-) > Why not adjust RCU_DYNTICKS_IDX so as > to crank RCU's portion of this counter down to (say) two or three bits > and let rcutorture have at it on TREE04 or TREE07, both of which have > nohz_full CPUs? > > Maybe also adjust mkinitrd.sh to make the user/kernel transitions more > frequent? > > Please note that I do -not- recommend production use of a three-bit > (let alone a two-bit) RCU portion because this has a high probability > of excessively extending grace periods. But it might be good to keep > a tiny counter as a debug option so that we regularly rcutorture it. > Sounds sensible, I'll add that to my v2 todolist. Thanks! > Thanx, Paul
diff --git a/arch/Kconfig b/arch/Kconfig index 205fd23e0cada..e453e9fb864be 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -851,6 +851,15 @@ config HAVE_CONTEXT_TRACKING_USER_OFFSTACK - No use of instrumentation, unless instrumentation_begin() got called. +config HAVE_CONTEXT_TRACKING_WORK + bool + help + Architecture supports deferring work while not in kernel context. + This is especially useful on setups with isolated CPUs that might + want to avoid being interrupted to perform housekeeping tasks (for + ex. TLB invalidation or icache invalidation). The housekeeping + operations are performed upon re-entering the kernel. + config HAVE_TIF_NOHZ bool help diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 53bab123a8ee4..490c773105c0c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -197,6 +197,7 @@ config X86 select HAVE_CMPXCHG_LOCAL select HAVE_CONTEXT_TRACKING_USER if X86_64 select HAVE_CONTEXT_TRACKING_USER_OFFSTACK if HAVE_CONTEXT_TRACKING_USER + select HAVE_CONTEXT_TRACKING_WORK if X86_64 select HAVE_C_RECORDMCOUNT select HAVE_OBJTOOL_MCOUNT if HAVE_OBJTOOL select HAVE_OBJTOOL_NOP_MCOUNT if HAVE_OBJTOOL_MCOUNT diff --git a/arch/x86/include/asm/context_tracking_work.h b/arch/x86/include/asm/context_tracking_work.h new file mode 100644 index 0000000000000..5bc29e6b2ed38 --- /dev/null +++ b/arch/x86/include/asm/context_tracking_work.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_CONTEXT_TRACKING_WORK_H +#define _ASM_X86_CONTEXT_TRACKING_WORK_H + +static __always_inline void arch_context_tracking_work(int work) +{ + switch (work) { + case CONTEXT_WORK_n: + // Do work... + break; + } +} + +#endif diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index d3cbb6c16babf..80d571ddfc3a4 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -5,6 +5,7 @@ #include <linux/sched.h> #include <linux/vtime.h> #include <linux/context_tracking_state.h> +#include <linux/context_tracking_work.h> #include <linux/instrumentation.h> #include <asm/ptrace.h> diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index fdd537ea513ff..5af06ed26f858 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -36,6 +36,7 @@ struct context_tracking { int recursion; #endif #ifdef CONFIG_CONTEXT_TRACKING + atomic_t work; atomic_t state; #endif #ifdef CONFIG_CONTEXT_TRACKING_IDLE diff --git a/include/linux/context_tracking_work.h b/include/linux/context_tracking_work.h new file mode 100644 index 0000000000000..0b06c3dab58c7 --- /dev/null +++ b/include/linux/context_tracking_work.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CONTEXT_TRACKING_WORK_H +#define _LINUX_CONTEXT_TRACKING_WORK_H + +#include <linux/bitops.h> + +enum { + CONTEXT_WORK_DISABLED_OFFSET, + CONTEXT_WORK_n_OFFSET, + CONTEXT_WORK_MAX_OFFSET +}; + +enum ct_work { + CONTEXT_WORK_DISABLED = BIT(CONTEXT_WORK_DISABLED_OFFSET), + CONTEXT_WORK_n = BIT(CONTEXT_WORK_n_OFFSET), + CONTEXT_WORK_MAX = BIT(CONTEXT_WORK_MAX_OFFSET) +}; + +#include <asm/context_tracking_work.h> + +#ifdef CONFIG_CONTEXT_TRACKING_WORK +extern bool ct_set_cpu_work(unsigned int cpu, unsigned int work); +#else +static inline bool +ct_set_cpu_work(unsigned int cpu, unsigned int work) { return false; } +#endif + +#endif diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 4e6cb14272fcb..b6aee3d0c0528 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -32,6 +32,9 @@ DEFINE_PER_CPU(struct context_tracking, context_tracking) = { .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, #endif .state = ATOMIC_INIT(RCU_DYNTICKS_IDX), +#ifdef CONFIG_CONTEXT_TRACKING_WORK + .work = ATOMIC_INIT(CONTEXT_WORK_DISABLED), +#endif }; EXPORT_SYMBOL_GPL(context_tracking); @@ -72,6 +75,57 @@ static __always_inline void rcu_dynticks_task_trace_exit(void) #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } +#ifdef CONFIG_CONTEXT_TRACKING_WORK +static __always_inline unsigned int ct_work_fetch(struct context_tracking *ct) +{ + return arch_atomic_fetch_or(CONTEXT_WORK_DISABLED, &ct->work); +} +static __always_inline void ct_work_clear(struct context_tracking *ct) +{ + arch_atomic_set(&ct->work, 0); +} + +static noinstr void ct_work_flush(unsigned long work) +{ + int bit; + + /* DISABLED is never set while there are deferred works */ + WARN_ON_ONCE(work & CONTEXT_WORK_DISABLED); + + /* + * arch_context_tracking_work() must be noinstr, non-blocking, + * and NMI safe. + */ + for_each_set_bit(bit, &work, CONTEXT_WORK_MAX) + arch_context_tracking_work(BIT(bit)); +} + +bool ct_set_cpu_work(unsigned int cpu, unsigned int work) +{ + struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu); + unsigned int old_work; + bool ret = false; + + preempt_disable(); + + old_work = atomic_read(&ct->work); + /* + * Try setting the work until either + * - the target CPU no longer accepts any more deferred work + * - the work has been set + */ + while (!(old_work & CONTEXT_WORK_DISABLED) && !ret) + ret = atomic_try_cmpxchg(&ct->work, &old_work, old_work | work); + + preempt_enable(); + return ret; +} +#else +static __always_inline void ct_work_flush(unsigned long work) { } +static __always_inline unsigned int ct_work_fetch(struct context_tracking *ct) { return 0; } +static __always_inline void ct_work_clear(struct context_tracking *ct) { } +#endif + /* * Record entry into an extended quiescent state. This is only to be * called when not already in an extended quiescent state, that is, @@ -89,6 +143,10 @@ static noinstr void ct_kernel_exit_state(int offset) */ rcu_dynticks_task_trace_enter(); // Before ->dynticks update! seq = ct_state_inc(offset); + + /* Let this CPU allow deferred callbacks again */ + ct_work_clear(this_cpu_ptr(&context_tracking)); + // RCU is no longer watching. Better be in extended quiescent state! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICKS_IDX)); } @@ -100,14 +158,19 @@ static noinstr void ct_kernel_exit_state(int offset) */ static noinstr void ct_kernel_enter_state(int offset) { + struct context_tracking *ct = this_cpu_ptr(&context_tracking); int seq; + unsigned int work; + work = ct_work_fetch(ct); /* * CPUs seeing atomic_add_return() must see prior idle sojourns, * and we also must force ordering with the next RCU read-side * critical section. */ seq = ct_state_inc(offset); + if (work) + ct_work_flush(work); // RCU is now watching. Better not be in an extended quiescent state! rcu_dynticks_task_trace_exit(); // After ->dynticks update! WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICKS_IDX)); diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index bae8f11070bef..fdb266f2d774b 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -181,6 +181,11 @@ config CONTEXT_TRACKING_USER_FORCE Say N otherwise, this option brings an overhead that you don't want in production. +config CONTEXT_TRACKING_WORK + bool + depends on HAVE_CONTEXT_TRACKING_WORK && CONTEXT_TRACKING_USER + default y + config NO_HZ bool "Old Idle dynticks config" help