@@ -19,6 +19,7 @@ enum scx_consts {
SCX_EXIT_MSG_LEN = 1024,
SCX_SLICE_DFL = 20 * NSEC_PER_MSEC,
+ SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */
};
/*
@@ -1200,13 +1200,16 @@ bool sched_can_stop_tick(struct rq *rq)
return true;
/*
- * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
- * if there's more than one we need the tick for involuntary
- * preemption.
+ * If there are no DL,RR/FIFO tasks, there must only be CFS or SCX tasks
+ * left. For CFS, if there's more than one we need the tick for
+ * involuntary preemption. For SCX, ask.
*/
if (!scx_switched_all() && rq->nr_running > 1)
return false;
+ if (scx_enabled() && !scx_can_stop_tick(rq))
+ return false;
+
return true;
}
#endif /* CONFIG_NO_HZ_FULL */
@@ -383,7 +383,8 @@ static void update_curr_scx(struct rq *rq)
account_group_exec_runtime(curr, delta_exec);
cgroup_account_cputime(curr, delta_exec);
- curr->scx.slice -= min(curr->scx.slice, delta_exec);
+ if (curr->scx.slice != SCX_SLICE_INF)
+ curr->scx.slice -= min(curr->scx.slice, delta_exec);
}
static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
@@ -1251,6 +1252,20 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
scx_ops.running(p);
watchdog_unwatch_task(p, true);
+
+ /*
+ * @p is getting newly scheduled or got kicked after someone updated its
+ * slice. Refresh whether tick can be stopped. See can_stop_tick_scx().
+ */
+ if ((p->scx.slice == SCX_SLICE_INF) !=
+ (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) {
+ if (p->scx.slice == SCX_SLICE_INF)
+ rq->scx.flags |= SCX_RQ_CAN_STOP_TICK;
+ else
+ rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK;
+
+ sched_update_tick_dependency(rq);
+ }
}
static void put_prev_task_scx(struct rq *rq, struct task_struct *p)
@@ -1742,6 +1757,26 @@ int scx_check_setscheduler(struct task_struct *p, int policy)
return 0;
}
+#ifdef CONFIG_NO_HZ_FULL
+bool scx_can_stop_tick(struct rq *rq)
+{
+ struct task_struct *p = rq->curr;
+
+ if (scx_ops_disabling())
+ return false;
+
+ if (p->sched_class != &ext_sched_class)
+ return true;
+
+ /*
+ * @rq can consume from different dsq's, so we can't tell whether it
+ * needs the tick or not by looking at nr_running. Allow stopping ticks
+ * iff the BPF scheduler indicated so. See set_next_task_scx().
+ */
+ return rq->scx.flags & SCX_RQ_CAN_STOP_TICK;
+}
+#endif
+
/*
* Omitted operations:
*
@@ -1923,7 +1958,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
struct scx_task_iter sti;
struct task_struct *p;
const char *reason;
- int i, type;
+ int i, cpu, type;
type = atomic_read(&scx_exit_type);
while (true) {
@@ -2021,6 +2056,10 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
scx_task_iter_exit(&sti);
spin_unlock_irq(&scx_tasks_lock);
+ /* kick all CPUs to restore ticks */
+ for_each_possible_cpu(cpu)
+ resched_cpu(cpu);
+
forward_progress_guaranteed:
/*
* Here, every runnable task is guaranteed to make forward progress and
@@ -94,6 +94,7 @@ void scx_post_fork(struct task_struct *p);
void scx_cancel_fork(struct task_struct *p);
int balance_scx(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
int scx_check_setscheduler(struct task_struct *p, int policy);
+bool scx_can_stop_tick(struct rq *rq);
void init_sched_ext_class(void);
__printf(2, 3) void scx_ops_error_type(enum scx_exit_type type,
@@ -156,6 +157,7 @@ static inline int balance_scx(struct rq *rq, struct task_struct *prev,
struct rq_flags *rf) { return 0; }
static inline int scx_check_setscheduler(struct task_struct *p,
int policy) { return 0; }
+static inline bool scx_can_stop_tick(struct rq *rq) { return true; }
static inline void init_sched_ext_class(void) {}
static inline void scx_notify_sched_tick(void) {}
@@ -690,11 +690,17 @@ struct cfs_rq {
};
#ifdef CONFIG_SCHED_CLASS_EXT
+/* scx_rq->flags, protected by the rq lock */
+enum scx_rq_flags {
+ SCX_RQ_CAN_STOP_TICK = 1 << 0,
+};
+
struct scx_rq {
struct scx_dispatch_q local_dsq;
struct list_head watchdog_list;
u64 ops_qseq;
u32 nr_running;
+ u32 flags;
#ifdef CONFIG_SMP
cpumask_var_t cpus_to_kick;
cpumask_var_t cpus_to_preempt;
@@ -14,7 +14,26 @@
* utilize and verify various scx mechanisms such as LOCAL_ON dispatching and
* consume_final().
*
- * b. Preemption
+ * b. Tickless operation
+ *
+ * All tasks are dispatched with the infinite slice which allows stopping the
+ * ticks on CONFIG_NO_HZ_FULL kernels running with the proper nohz_full
+ * parameter. The tickless operation can be observed through
+ * /proc/interrupts.
+ *
+ * Periodic switching is enforced by a periodic timer checking all CPUs and
+ * preempting them as necessary. Unfortunately, BPF timer currently doesn't
+ * have a way to pin to a specific CPU, so the periodic timer isn't pinned to
+ * the central CPU.
+ *
+ * c. Preemption
+ *
+ * Kthreads are unconditionally queued to the head of a matching local dsq
+ * and dispatched with SCX_DSQ_PREEMPT. This ensures that a kthread is always
+ * prioritized over user threads, which is required for ensuring forward
+ * progress as e.g. the periodic timer may run on a ksoftirqd and if the
+ * ksoftirqd gets starved by a user thread, there may not be anything else to
+ * vacate that user thread.
*
* SCX_KICK_PREEMPT is used to trigger scheduling and CPUs to move to the
* next tasks.
@@ -38,8 +57,18 @@ const volatile bool switch_all;
const volatile s32 central_cpu;
const volatile u32 nr_cpu_ids;
+/*
+ * XXX - kernel should be able to shut down the associated timers. For now,
+ * implement it manually. They should be bool but the verifier gets confused
+ * about the value range of bool variables when verifying the return value of
+ * the loopfns. Also, they can't be static because verification fails with BTF
+ * error message for some reason.
+ */
+int timer_running;
+int timer_kill;
+
u64 nr_total, nr_locals, nr_queued, nr_lost_pids;
-u64 nr_dispatches, nr_mismatches, nr_overflows;
+u64 nr_timers, nr_dispatches, nr_mismatches, nr_overflows;
struct user_exit_info uei;
@@ -51,6 +80,7 @@ struct {
/* can't use percpu map due to bad lookups */
static bool cpu_gimme_task[MAX_CPUS];
+static u64 cpu_started_at[MAX_CPUS];
struct central_timer {
struct bpf_timer timer;
@@ -86,9 +116,22 @@ void BPF_STRUCT_OPS(central_enqueue, struct task_struct *p, u64 enq_flags)
__sync_fetch_and_add(&nr_total, 1);
+ /*
+ * Push per-cpu kthreads at the head of local dsq's and preempt the
+ * corresponding CPU. This ensures that e.g. ksoftirqd isn't blocked
+ * behind other threads which is necessary for forward progress
+ * guarantee as we depend on the BPF timer which may run from ksoftirqd.
+ */
+ if ((p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) {
+ __sync_fetch_and_add(&nr_locals, 1);
+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_INF,
+ enq_flags | SCX_ENQ_PREEMPT);
+ return;
+ }
+
if (bpf_map_push_elem(¢ral_q, &pid, 0)) {
__sync_fetch_and_add(&nr_overflows, 1);
- scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_DFL, enq_flags);
+ scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, enq_flags);
return;
}
@@ -122,12 +165,12 @@ static int dispatch_a_task_loopfn(u32 idx, void *data)
*/
if (!scx_bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) {
__sync_fetch_and_add(&nr_mismatches, 1);
- scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_DFL, 0);
+ scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0);
return 0;
}
/* dispatch to the local and mark that @cpu doesn't need more tasks */
- scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, 0);
+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_INF, 0);
if (cpu != central_cpu)
scx_bpf_kick_cpu(cpu, 0);
@@ -196,16 +239,119 @@ void BPF_STRUCT_OPS(central_consume_final, s32 cpu)
scx_bpf_consume(FALLBACK_DSQ_ID);
}
+void BPF_STRUCT_OPS(central_running, struct task_struct *p)
+{
+ s32 cpu = scx_bpf_task_cpu(p);
+ u64 *started_at = MEMBER_VPTR(cpu_started_at, [cpu]);
+ if (started_at)
+ *started_at = bpf_ktime_get_ns() ?: 1; /* 0 indicates idle */
+}
+
+void BPF_STRUCT_OPS(central_stopping, struct task_struct *p, bool runnable)
+{
+ s32 cpu = scx_bpf_task_cpu(p);
+ u64 *started_at = MEMBER_VPTR(cpu_started_at, [cpu]);
+ if (started_at)
+ *started_at = 0;
+}
+
+static int kick_cpus_loopfn(u32 idx, void *data)
+{
+ s32 cpu = (nr_timers + idx) % nr_cpu_ids;
+ u64 *nr_to_kick = data;
+ u64 now = bpf_ktime_get_ns();
+ u64 *started_at;
+ s32 pid;
+
+ if (cpu == central_cpu)
+ goto kick;
+
+ /* kick iff there's something pending */
+ if (scx_bpf_dsq_nr_queued(FALLBACK_DSQ_ID) ||
+ scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | cpu))
+ ;
+ else if (*nr_to_kick)
+ (*nr_to_kick)--;
+ else
+ return 0;
+
+ /* and the current one exhausted its slice */
+ started_at = MEMBER_VPTR(cpu_started_at, [cpu]);
+ if (started_at && *started_at &&
+ vtime_before(now, *started_at + SCX_SLICE_DFL))
+ return 0;
+kick:
+ scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT);
+ return 0;
+}
+
+static int central_timerfn(void *map, int *key, struct bpf_timer *timer)
+{
+ u64 nr_to_kick = nr_queued;
+
+ if (timer_kill) {
+ timer_running = 0;
+ return 0;
+ }
+
+ bpf_loop(nr_cpu_ids, kick_cpus_loopfn, &nr_to_kick, 0);
+ bpf_timer_start(timer, TIMER_INTERVAL_NS, 0);
+ __sync_fetch_and_add(&nr_timers, 1);
+ return 0;
+}
+
int BPF_STRUCT_OPS(central_init)
{
+ u32 key = 0;
+ struct bpf_timer *timer;
+ int ret;
+
if (switch_all)
scx_bpf_switch_all();
- return scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1);
+ ret = scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1);
+ if (ret)
+ return ret;
+
+ timer = bpf_map_lookup_elem(¢ral_timer, &key);
+ if (!timer)
+ return -ESRCH;
+
+ bpf_timer_init(timer, ¢ral_timer, CLOCK_MONOTONIC);
+ bpf_timer_set_callback(timer, central_timerfn);
+ ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, 0);
+ timer_running = !ret;
+ return ret;
+}
+
+static int exit_wait_timer_nested_loopfn(u32 idx, void *data)
+{
+ u64 expiration = *(u64 *)data;
+
+ return !timer_running || vtime_before(expiration, bpf_ktime_get_ns());
+}
+
+static int exit_wait_timer_loopfn(u32 idx, void *data)
+{
+ u64 expiration = *(u64 *)data;
+
+ bpf_loop(1 << 23, exit_wait_timer_nested_loopfn, data, 0);
+ return !timer_running || vtime_before(expiration, bpf_ktime_get_ns());
}
void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei)
{
+ u64 expiration = bpf_ktime_get_ns() + 1000 * MS_TO_NS;
+
+ /*
+ * XXX - We just need to make sure that the timer body isn't running on
+ * exit. If we catch the timer while waiting, great. If not, it's still
+ * highly likely that the timer body won't run in the future. Once bpf
+ * can shut down associated timers, this hackery should go away.
+ */
+ timer_kill = 1;
+ bpf_loop(1 << 23, exit_wait_timer_loopfn, &expiration, 0);
+
uei_record(&uei, ei);
}
@@ -223,6 +369,8 @@ struct sched_ext_ops central_ops = {
.dispatch = (void *)central_dispatch,
.consume = (void *)central_consume,
.consume_final = (void *)central_consume_final,
+ .running = (void *)central_running,
+ .stopping = (void *)central_stopping,
.init = (void *)central_init,
.exit = (void *)central_exit,
.name = "central",
@@ -76,7 +76,8 @@ int main(int argc, char **argv)
skel->bss->nr_locals,
skel->bss->nr_queued,
skel->bss->nr_lost_pids);
- printf(" dispatch:%10lu mismatch:%10lu overflow:%10lu\n",
+ printf("timer:%10lu dispatch:%10lu mismatch:%10lu overflow:%10lu\n",
+ skel->bss->nr_timers,
skel->bss->nr_dispatches,
skel->bss->nr_mismatches,
skel->bss->nr_overflows);