[22/30] sched_ext: Implement tickless support

Message ID	20230128001639.3510083-23-tj@kernel.org (mailing list archive)
State	Not Applicable
Headers	show Return-Path: <bpf-owner@vger.kernel.org> Sender: Tejun Heo <htejun@gmail.com> From: Tejun Heo <tj@kernel.org> To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo <tj@kernel.org> Subject: [PATCH 22/30] sched_ext: Implement tickless support Date: Fri, 27 Jan 2023 14:16:31 -1000 Message-Id: <20230128001639.3510083-23-tj@kernel.org> In-Reply-To: <20230128001639.3510083-1-tj@kernel.org> References: <20230128001639.3510083-1-tj@kernel.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Precedence: bulk
Series	[01/30] cgroup: Implement cgroup_show_cftypes() \| expand [01/30] cgroup: Implement cgroup_show_cftypes() [02/30] sched: Encapsulate task attribute change sequence into a helper macro [03/30] sched: Restructure sched_class order sanity checks in sched_init() [04/30] sched: Allow sched_cgroup_fork() to fail and introduce sched_cancel_fork() [05/30] sched: Add sched_class->reweight_task() [06/30] sched: Add sched_class->switching_to() and expose check_class_changing/changed() [07/30] sched: Factor out cgroup weight conversion functions [08/30] sched: Expose css_tg(), __setscheduler_prio() and SCHED_CHANGE_BLOCK() [09/30] sched: Enumerate CPU cgroup file types [10/30] sched: Add @reason to sched_class->rq_{on\|off}line() [11/30] sched: Add normal_policy() [12/30] sched_ext: Add boilerplate for extensible scheduler class [14/30] sched_ext: Add scx_example_dummy and scx_example_qmap example schedulers [15/30] sched_ext: Add sysrq-S which disables the BPF scheduler [16/30] sched_ext: Implement runnable task stall watchdog [17/30] sched_ext: Allow BPF schedulers to disallow specific tasks from joining SCHED_EXT [18/30] sched_ext: Allow BPF schedulers to switch all eligible tasks into sched_ext [19/30] sched_ext: Implement scx_bpf_kick_cpu() and task preemption support [20/30] sched_ext: Make watchdog handle ops.dispatch() looping stall [21/30] sched_ext: Add task state tracking operations [22/30] sched_ext: Implement tickless support [23/30] sched_ext: Add cgroup support [24/30] sched_ext: Implement SCX_KICK_WAIT [25/30] sched_ext: Implement sched_ext_ops.cpu_acquire/release() [26/30] sched_ext: Implement sched_ext_ops.cpu_online/offline() [27/30] sched_ext: Implement core-sched support [28/30] sched_ext: Documentation: scheduler: Document extensible scheduler class [29/30] sched_ext: Add a basic, userland vruntime scheduler [30/30] sched_ext: Add a rust userspace hybrid example scheduler

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 338b41cd79fa..8f19ee7e5433 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -19,6 +19,7 @@ enum scx_consts { SCX_EXIT_MSG_LEN = 1024, SCX_SLICE_DFL = 20 * NSEC_PER_MSEC, + SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */ }; /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5b68b822312b..d2419bd4fb5e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1200,13 +1200,16 @@ bool sched_can_stop_tick(struct rq *rq) return true; /* - * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left; - * if there's more than one we need the tick for involuntary - * preemption. + * If there are no DL,RR/FIFO tasks, there must only be CFS or SCX tasks + * left. For CFS, if there's more than one we need the tick for + * involuntary preemption. For SCX, ask. */ if (!scx_switched_all() && rq->nr_running > 1) return false; + if (scx_enabled() && !scx_can_stop_tick(rq)) + return false; + return true; } #endif /* CONFIG_NO_HZ_FULL */ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 4acaf39ea879..5e94f9604e23 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -447,7 +447,8 @@ static void update_curr_scx(struct rq *rq) account_group_exec_runtime(curr, delta_exec); cgroup_account_cputime(curr, delta_exec); - curr->scx.slice -= min(curr->scx.slice, delta_exec); + if (curr->scx.slice != SCX_SLICE_INF) + curr->scx.slice -= min(curr->scx.slice, delta_exec); } static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, @@ -1356,6 +1357,20 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) scx_ops.running(p); watchdog_unwatch_task(p, true); + + /* + * @p is getting newly scheduled or got kicked after someone updated its + * slice. Refresh whether tick can be stopped. See can_stop_tick_scx(). + */ + if ((p->scx.slice == SCX_SLICE_INF) != + (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) { + if (p->scx.slice == SCX_SLICE_INF) + rq->scx.flags |= SCX_RQ_CAN_STOP_TICK; + else + rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK; + + sched_update_tick_dependency(rq); + } } static void put_prev_task_scx(struct rq *rq, struct task_struct *p) @@ -1891,6 +1906,26 @@ int scx_check_setscheduler(struct task_struct *p, int policy) return 0; } +#ifdef CONFIG_NO_HZ_FULL +bool scx_can_stop_tick(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + if (scx_ops_disabling()) + return false; + + if (p->sched_class != &ext_sched_class) + return true; + + /* + * @rq can dispatch from different DSQs, so we can't tell whether it + * needs the tick or not by looking at nr_running. Allow stopping ticks + * iff the BPF scheduler indicated so. See set_next_task_scx(). + */ + return rq->scx.flags & SCX_RQ_CAN_STOP_TICK; +} +#endif + /* * Omitted operations: * @@ -2051,7 +2086,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) struct rhashtable_iter rht_iter; struct scx_dispatch_q *dsq; const char *reason; - int i, type; + int i, cpu, type; type = atomic_read(&scx_exit_type); while (true) { @@ -2148,6 +2183,10 @@ static void scx_ops_disable_workfn(struct kthread_work *work) scx_task_iter_exit(&sti); spin_unlock_irq(&scx_tasks_lock); + /* kick all CPUs to restore ticks */ + for_each_possible_cpu(cpu) + resched_cpu(cpu); + forward_progress_guaranteed: /* * Here, every runnable task is guaranteed to make forward progress and diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h index 0b04626e8ca2..9c9284f91e38 100644 --- a/kernel/sched/ext.h +++ b/kernel/sched/ext.h @@ -82,6 +82,7 @@ int scx_fork(struct task_struct *p); void scx_post_fork(struct task_struct *p); void scx_cancel_fork(struct task_struct *p); int scx_check_setscheduler(struct task_struct *p, int policy); +bool scx_can_stop_tick(struct rq *rq); void init_sched_ext_class(void); __printf(2, 3) void scx_ops_error_type(enum scx_exit_type type, @@ -141,6 +142,7 @@ static inline void scx_post_fork(struct task_struct *p) {} static inline void scx_cancel_fork(struct task_struct *p) {} static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; } +static inline bool scx_can_stop_tick(struct rq *rq) { return true; } static inline void init_sched_ext_class(void) {} static inline void scx_notify_sched_tick(void) {} diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3c16caecd3a5..2447af22783c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -685,11 +685,17 @@ struct cfs_rq { }; #ifdef CONFIG_SCHED_CLASS_EXT +/* scx_rq->flags, protected by the rq lock */ +enum scx_rq_flags { + SCX_RQ_CAN_STOP_TICK = 1 << 0, +}; + struct scx_rq { struct scx_dispatch_q local_dsq; struct list_head watchdog_list; u64 ops_qseq; u32 nr_running; + u32 flags; cpumask_var_t cpus_to_kick; cpumask_var_t cpus_to_preempt; struct irq_work kick_cpus_irq_work; diff --git a/tools/sched_ext/scx_example_central.bpf.c b/tools/sched_ext/scx_example_central.bpf.c index 6b246bf308d0..376486674ddb 100644 --- a/tools/sched_ext/scx_example_central.bpf.c +++ b/tools/sched_ext/scx_example_central.bpf.c @@ -13,7 +13,26 @@ * through per-CPU BPF queues. The current design is chosen to maximally * utilize and verify various scx mechanisms such as LOCAL_ON dispatching. * - * b. Preemption + * b. Tickless operation + * + * All tasks are dispatched with the infinite slice which allows stopping the + * ticks on CONFIG_NO_HZ_FULL kernels running with the proper nohz_full + * parameter. The tickless operation can be observed through + * /proc/interrupts. + * + * Periodic switching is enforced by a periodic timer checking all CPUs and + * preempting them as necessary. Unfortunately, BPF timer currently doesn't + * have a way to pin to a specific CPU, so the periodic timer isn't pinned to + * the central CPU. + * + * c. Preemption + * + * Kthreads are unconditionally queued to the head of a matching local dsq + * and dispatched with SCX_DSQ_PREEMPT. This ensures that a kthread is always + * prioritized over user threads, which is required for ensuring forward + * progress as e.g. the periodic timer may run on a ksoftirqd and if the + * ksoftirqd gets starved by a user thread, there may not be anything else to + * vacate that user thread. * * SCX_KICK_PREEMPT is used to trigger scheduling and CPUs to move to the * next tasks. @@ -38,7 +57,7 @@ const volatile s32 central_cpu; const volatile u32 nr_cpu_ids; u64 nr_total, nr_locals, nr_queued, nr_lost_pids; -u64 nr_dispatches, nr_mismatches, nr_retries; +u64 nr_timers, nr_dispatches, nr_mismatches, nr_retries; u64 nr_overflows; struct user_exit_info uei; @@ -51,6 +70,7 @@ struct { /* can't use percpu map due to bad lookups */ static bool cpu_gimme_task[MAX_CPUS]; +static u64 cpu_started_at[MAX_CPUS]; struct central_timer { struct bpf_timer timer; @@ -86,9 +106,22 @@ void BPF_STRUCT_OPS(central_enqueue, struct task_struct *p, u64 enq_flags) __sync_fetch_and_add(&nr_total, 1); + /* + * Push per-cpu kthreads at the head of local dsq's and preempt the + * corresponding CPU. This ensures that e.g. ksoftirqd isn't blocked + * behind other threads which is necessary for forward progress + * guarantee as we depend on the BPF timer which may run from ksoftirqd. + */ + if ((p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) { + __sync_fetch_and_add(&nr_locals, 1); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_INF, + enq_flags | SCX_ENQ_PREEMPT); + return; + } + if (bpf_map_push_elem(&central_q, &pid, 0)) { __sync_fetch_and_add(&nr_overflows, 1); - scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_DFL, enq_flags); + scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, enq_flags); return; } @@ -122,13 +155,13 @@ static int dispatch_a_task_loopfn(u32 idx, void *data) */ if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { __sync_fetch_and_add(&nr_mismatches, 1); - scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_DFL, 0); + scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0); bpf_task_release(p); return 0; } /* dispatch to the local and mark that @cpu doesn't need more tasks */ - scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, 0); + scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_INF, 0); if (cpu != central_cpu) scx_bpf_kick_cpu(cpu, 0); @@ -200,12 +233,83 @@ void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev) } } +void BPF_STRUCT_OPS(central_running, struct task_struct *p) +{ + s32 cpu = scx_bpf_task_cpu(p); + u64 *started_at = MEMBER_VPTR(cpu_started_at, [cpu]); + if (started_at) + *started_at = bpf_ktime_get_ns() ?: 1; /* 0 indicates idle */ +} + +void BPF_STRUCT_OPS(central_stopping, struct task_struct *p, bool runnable) +{ + s32 cpu = scx_bpf_task_cpu(p); + u64 *started_at = MEMBER_VPTR(cpu_started_at, [cpu]); + if (started_at) + *started_at = 0; +} + +static int kick_cpus_loopfn(u32 idx, void *data) +{ + s32 cpu = (nr_timers + idx) % nr_cpu_ids; + u64 *nr_to_kick = data; + u64 now = bpf_ktime_get_ns(); + u64 *started_at; + s32 pid; + + if (cpu == central_cpu) + goto kick; + + /* kick iff there's something pending */ + if (scx_bpf_dsq_nr_queued(FALLBACK_DSQ_ID) || + scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | cpu)) + ; + else if (*nr_to_kick) + (*nr_to_kick)--; + else + return 0; + + /* and the current one exhausted its slice */ + started_at = MEMBER_VPTR(cpu_started_at, [cpu]); + if (started_at && *started_at && + vtime_before(now, *started_at + SCX_SLICE_DFL)) + return 0; +kick: + scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT); + return 0; +} + +static int central_timerfn(void *map, int *key, struct bpf_timer *timer) +{ + u64 nr_to_kick = nr_queued; + + bpf_loop(nr_cpu_ids, kick_cpus_loopfn, &nr_to_kick, 0); + bpf_timer_start(timer, TIMER_INTERVAL_NS, 0); + __sync_fetch_and_add(&nr_timers, 1); + return 0; +} + int BPF_STRUCT_OPS_SLEEPABLE(central_init) { + u32 key = 0; + struct bpf_timer *timer; + int ret; + if (switch_all) scx_bpf_switch_all(); - return scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1); + ret = scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1); + if (ret) + return ret; + + timer = bpf_map_lookup_elem(&central_timer, &key); + if (!timer) + return -ESRCH; + + bpf_timer_init(timer, &central_timer, CLOCK_MONOTONIC); + bpf_timer_set_callback(timer, central_timerfn); + ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, 0); + return ret; } void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei) @@ -225,6 +329,8 @@ struct sched_ext_ops central_ops = { .select_cpu = (void *)central_select_cpu, .enqueue = (void *)central_enqueue, .dispatch = (void *)central_dispatch, + .running = (void *)central_running, + .stopping = (void *)central_stopping, .init = (void *)central_init, .exit = (void *)central_exit, .name = "central", diff --git a/tools/sched_ext/scx_example_central.c b/tools/sched_ext/scx_example_central.c index 14f6598c03df..fc01e5149371 100644 --- a/tools/sched_ext/scx_example_central.c +++ b/tools/sched_ext/scx_example_central.c @@ -76,7 +76,8 @@ int main(int argc, char **argv) skel->bss->nr_locals, skel->bss->nr_queued, skel->bss->nr_lost_pids); - printf(" dispatch:%10lu mismatch:%10lu retry:%10lu\n", + printf("timer :%10lu dispatch:%10lu mismatch:%10lu retry:%10lu\n", + skel->bss->nr_timers, skel->bss->nr_dispatches, skel->bss->nr_mismatches, skel->bss->nr_retries);

[22/30] sched_ext: Implement tickless support

Commit Message

Patch