@@ -19,6 +19,7 @@ enum scx_consts {
SCX_EXIT_MSG_LEN = 1024,
SCX_SLICE_DFL = 20 * NSEC_PER_MSEC,
+ SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */
};
/*
@@ -1232,13 +1232,16 @@ bool sched_can_stop_tick(struct rq *rq)
return true;
/*
- * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
- * if there's more than one we need the tick for involuntary
- * preemption.
+ * If there are no DL,RR/FIFO tasks, there must only be CFS or SCX tasks
+ * left. For CFS, if there's more than one we need the tick for
+ * involuntary preemption. For SCX, ask.
*/
if (!scx_switched_all() && rq->nr_running > 1)
return false;
+ if (scx_enabled() && !scx_can_stop_tick(rq))
+ return false;
+
/*
* If there is one task and it has CFS runtime bandwidth constraints
* and it's on the cpu now we don't want to stop the tick.
@@ -488,7 +488,8 @@ static void update_curr_scx(struct rq *rq)
account_group_exec_runtime(curr, delta_exec);
cgroup_account_cputime(curr, delta_exec);
- curr->scx.slice -= min(curr->scx.slice, delta_exec);
+ if (curr->scx.slice != SCX_SLICE_INF)
+ curr->scx.slice -= min(curr->scx.slice, delta_exec);
}
static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
@@ -1411,6 +1412,20 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
SCX_CALL_OP(SCX_KF_REST, running, p);
watchdog_unwatch_task(p, true);
+
+ /*
+ * @p is getting newly scheduled or got kicked after someone updated its
+ * slice. Refresh whether tick can be stopped. See scx_can_stop_tick().
+ */
+ if ((p->scx.slice == SCX_SLICE_INF) !=
+ (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) {
+ if (p->scx.slice == SCX_SLICE_INF)
+ rq->scx.flags |= SCX_RQ_CAN_STOP_TICK;
+ else
+ rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK;
+
+ sched_update_tick_dependency(rq);
+ }
}
static void put_prev_task_scx(struct rq *rq, struct task_struct *p)
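Both hooks above key off nothing but the slice value that the BPF scheduler passes to scx_bpf_dispatch(). For illustration, a hypothetical enqueue-only scheduler that opts every task into tickless operation could look like the sketch below (the header path, the SEC name and the "tickless_ops" naming are assumptions for the example, not part of this series; scx_central further down shows the real usage):

#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

void BPF_STRUCT_OPS(tickless_enqueue, struct task_struct *p, u64 enq_flags)
{
	/*
	 * With SCX_SLICE_INF, update_curr_scx() leaves the slice alone and
	 * set_next_task_scx() allows the tick to be stopped while @p runs.
	 * CPUs consume from the global DSQ whenever their local DSQ is empty.
	 */
	scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_INF, enq_flags);
}

SEC(".struct_ops.link")
struct sched_ext_ops tickless_ops = {
	.enqueue	= (void *)tickless_enqueue,
	.name		= "tickless",
};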
@@ -1993,6 +2008,26 @@ int scx_check_setscheduler(struct task_struct *p, int policy)
return 0;
}
+#ifdef CONFIG_NO_HZ_FULL
+bool scx_can_stop_tick(struct rq *rq)
+{
+ struct task_struct *p = rq->curr;
+
+ if (scx_ops_disabling())
+ return false;
+
+ if (p->sched_class != &ext_sched_class)
+ return true;
+
+ /*
+ * @rq can dispatch from different DSQs, so we can't tell whether it
+ * needs the tick or not by looking at nr_running. Allow stopping ticks
+ * iff the BPF scheduler indicated so. See set_next_task_scx().
+ */
+ return rq->scx.flags & SCX_RQ_CAN_STOP_TICK;
+}
+#endif
+
/*
* Omitted operations:
*
@@ -2152,7 +2187,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
struct rhashtable_iter rht_iter;
struct scx_dispatch_q *dsq;
const char *reason;
- int i, kind;
+ int i, cpu, kind;
kind = atomic_read(&scx_exit_kind);
while (true) {
@@ -2250,6 +2285,10 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
scx_task_iter_exit(&sti);
spin_unlock_irq(&scx_tasks_lock);
+ /* kick all CPUs to restore ticks */
+ for_each_possible_cpu(cpu)
+ resched_cpu(cpu);
+
forward_progress_guaranteed:
/*
* Here, every runnable task is guaranteed to make forward progress and
@@ -102,6 +102,7 @@ int scx_fork(struct task_struct *p);
void scx_post_fork(struct task_struct *p);
void scx_cancel_fork(struct task_struct *p);
int scx_check_setscheduler(struct task_struct *p, int policy);
+bool scx_can_stop_tick(struct rq *rq);
void init_sched_ext_class(void);
__printf(2, 3) void scx_ops_error_kind(enum scx_exit_kind kind,
@@ -162,6 +163,7 @@ static inline void scx_post_fork(struct task_struct *p) {}
static inline void scx_cancel_fork(struct task_struct *p) {}
static inline int scx_check_setscheduler(struct task_struct *p,
int policy) { return 0; }
+static inline bool scx_can_stop_tick(struct rq *rq) { return true; }
static inline void init_sched_ext_class(void) {}
static inline void scx_notify_sched_tick(void) {}
@@ -673,12 +673,18 @@ struct cfs_rq {
};
#ifdef CONFIG_SCHED_CLASS_EXT
+/* scx_rq->flags, protected by the rq lock */
+enum scx_rq_flags {
+ SCX_RQ_CAN_STOP_TICK = 1 << 0,
+};
+
struct scx_rq {
struct scx_dispatch_q local_dsq;
struct list_head watchdog_list;
unsigned long ops_qseq;
u64 extra_enq_flags; /* see move_task_to_local_dsq() */
u32 nr_running;
+ u32 flags;
cpumask_var_t cpus_to_kick;
cpumask_var_t cpus_to_preempt;
struct irq_work kick_cpus_irq_work;
@@ -13,7 +13,26 @@
* through per-CPU BPF queues. The current design is chosen to maximally
* utilize and verify various SCX mechanisms such as LOCAL_ON dispatching.
*
- * b. Preemption
+ * b. Tickless operation
+ *
+ * All tasks are dispatched with an infinite slice, which allows stopping
+ * ticks on CONFIG_NO_HZ_FULL kernels booted with the appropriate nohz_full
+ * parameter. The tickless operation can be observed through
+ * /proc/interrupts.
+ *
+ * Periodic switching is enforced by a periodic timer checking all CPUs and
+ * preempting them as necessary. The timer is armed from ops.init(), which
+ * must run on the central CPU, and is pinned there with BPF_F_TIMER_CPU_PIN
+ * so that it keeps firing on that CPU.
+ *
+ * c. Preemption
+ *
+ * Kthreads are unconditionally queued to the head of a matching local dsq
+ * and dispatched with SCX_ENQ_PREEMPT. This ensures that a kthread is always
+ * prioritized over user threads, which is required for forward progress:
+ * e.g. the periodic timer may run on a ksoftirqd, and if that ksoftirqd is
+ * starved by a user thread, there may be nothing else left that can preempt
+ * that user thread.
*
* SCX_KICK_PREEMPT is used to trigger scheduling and CPUs to move to the
* next tasks.
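As the comment above says, the effect is visible in /proc/interrupts: on a properly configured nohz_full CPU running a single task with an infinite slice, the "Local timer interrupts" (LOC) count should stay nearly flat. A rough standalone userspace sketch for eyeballing that, not part of the patch:

#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* print the "Local timer interrupts" (LOC) row from /proc/interrupts */
static void print_loc_row(void)
{
	char line[4096];
	FILE *f = fopen("/proc/interrupts", "r");

	if (!f) {
		perror("/proc/interrupts");
		return;
	}
	while (fgets(line, sizeof(line), f)) {
		if (strstr(line, "Local timer interrupts")) {
			fputs(line, stdout);
			break;
		}
	}
	fclose(f);
}

int main(void)
{
	/* take two samples a few seconds apart; compare the per-CPU deltas */
	print_loc_row();
	sleep(5);
	print_loc_row();
	return 0;
}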
@@ -32,6 +51,8 @@ char _license[] SEC("license") = "GPL";
enum {
FALLBACK_DSQ_ID = 0,
+ MS_TO_NS = 1000LLU * 1000,
+ TIMER_INTERVAL_NS = 1 * MS_TO_NS,
};
const volatile bool switch_partial;
@@ -40,7 +61,7 @@ const volatile u32 nr_cpu_ids = 1; /* !0 for veristat, set during init */
const volatile u64 slice_ns = SCX_SLICE_DFL;
u64 nr_total, nr_locals, nr_queued, nr_lost_pids;
-u64 nr_dispatches, nr_mismatches, nr_retries;
+u64 nr_timers, nr_dispatches, nr_mismatches, nr_retries;
u64 nr_overflows;
struct user_exit_info uei;
@@ -53,6 +74,23 @@ struct {
/* can't use percpu map due to bad lookups */
bool RESIZABLE_ARRAY(data, cpu_gimme_task);
+u64 RESIZABLE_ARRAY(data, cpu_started_at);
+
+struct central_timer {
+ struct bpf_timer timer;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, u32);
+ __type(value, struct central_timer);
+} central_timer SEC(".maps");
+
+static bool vtime_before(u64 a, u64 b)
+{
+ return (s64)(a - b) < 0;
+}
s32 BPF_STRUCT_OPS(central_select_cpu, struct task_struct *p,
s32 prev_cpu, u64 wake_flags)
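vtime_before() is the usual wraparound-safe ordering idiom (compare the kernel's time_before()): the unsigned difference is reinterpreted as signed, so the ordering stays correct even if the 64-bit values wrap relative to each other. A tiny self-contained illustration in plain userspace C:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* same comparison as vtime_before() above */
static int before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	uint64_t near_wrap = UINT64_MAX - 5;

	assert(before(100, 200));		/* ordinary ordering */
	assert(!before(200, 100));
	/* near_wrap + 20 wraps past zero but is still "later" */
	assert(before(near_wrap, near_wrap + 20));
	printf("ok\n");
	return 0;
}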
@@ -72,9 +110,22 @@ void BPF_STRUCT_OPS(central_enqueue, struct task_struct *p, u64 enq_flags)
__sync_fetch_and_add(&nr_total, 1);
+ /*
+ * Push per-cpu kthreads to the head of their local dsq's and preempt the
+ * corresponding CPU. This ensures that e.g. ksoftirqd isn't blocked behind
+ * other threads, which is necessary for the forward progress guarantee as
+ * we depend on the BPF timer which may run from ksoftirqd.
+ */
+ if ((p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) {
+ __sync_fetch_and_add(&nr_locals, 1);
+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_INF,
+ enq_flags | SCX_ENQ_PREEMPT);
+ return;
+ }
+
if (bpf_map_push_elem(&central_q, &pid, 0)) {
__sync_fetch_and_add(&nr_overflows, 1);
- scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_DFL, enq_flags);
+ scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, enq_flags);
return;
}
@@ -107,13 +158,13 @@ static bool dispatch_to_cpu(s32 cpu)
*/
if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) {
__sync_fetch_and_add(&nr_mismatches, 1);
- scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_DFL, 0);
+ scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0);
bpf_task_release(p);
continue;
}
/* dispatch to local and mark that @cpu doesn't need more */
- scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, 0);
+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_INF, 0);
if (cpu != central_cpu)
scx_bpf_kick_cpu(cpu, 0);
@@ -181,12 +232,89 @@ void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev)
}
}
+void BPF_STRUCT_OPS(central_running, struct task_struct *p)
+{
+ s32 cpu = scx_bpf_task_cpu(p);
+ u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids);
+ if (started_at)
+ *started_at = bpf_ktime_get_ns() ?: 1; /* 0 indicates idle */
+}
+
+void BPF_STRUCT_OPS(central_stopping, struct task_struct *p, bool runnable)
+{
+ s32 cpu = scx_bpf_task_cpu(p);
+ u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids);
+ if (started_at)
+ *started_at = 0;
+}
+
+static int central_timerfn(void *map, int *key, struct bpf_timer *timer)
+{
+ u64 now = bpf_ktime_get_ns();
+ u64 nr_to_kick = nr_queued;
+ s32 i, curr_cpu;
+
+ curr_cpu = bpf_get_smp_processor_id();
+ if (curr_cpu != central_cpu) {
+ scx_bpf_error("Central timer ran on CPU %d, not central CPU %d",
+ curr_cpu, central_cpu);
+ return 0;
+ }
+
+ bpf_for(i, 0, nr_cpu_ids) {
+ s32 cpu = (nr_timers + i) % nr_cpu_ids;
+ u64 *started_at;
+
+ if (cpu == central_cpu)
+ continue;
+
+ /* kick iff the current one exhausted its slice */
+ started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids);
+ if (started_at && *started_at &&
+ vtime_before(now, *started_at + slice_ns))
+ continue;
+
+ /* and there's something pending */
+ if (scx_bpf_dsq_nr_queued(FALLBACK_DSQ_ID) ||
+ scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | cpu))
+ ;
+ else if (nr_to_kick)
+ nr_to_kick--;
+ else
+ continue;
+
+ scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT);
+ }
+
+ bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN);
+ __sync_fetch_and_add(&nr_timers, 1);
+ return 0;
+}
+
int BPF_STRUCT_OPS_SLEEPABLE(central_init)
{
+ u32 key = 0;
+ struct bpf_timer *timer;
+ int ret;
+
if (!switch_partial)
scx_bpf_switch_all();
- return scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1);
+ ret = scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1);
+ if (ret)
+ return ret;
+
+ timer = bpf_map_lookup_elem(&central_timer, &key);
+ if (!timer)
+ return -ESRCH;
+
+ if (bpf_get_smp_processor_id() != central_cpu)
+ return -EINVAL;
+
+ bpf_timer_init(timer, &central_timer, CLOCK_MONOTONIC);
+ bpf_timer_set_callback(timer, central_timerfn);
+ ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN);
+ return ret;
}
void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei)
@@ -206,6 +334,8 @@ struct sched_ext_ops central_ops = {
.select_cpu = (void *)central_select_cpu,
.enqueue = (void *)central_enqueue,
.dispatch = (void *)central_dispatch,
+ .running = (void *)central_running,
+ .stopping = (void *)central_stopping,
.init = (void *)central_init,
.exit = (void *)central_exit,
.name = "central",
@@ -39,6 +39,7 @@ int main(int argc, char **argv)
struct bpf_link *link;
__u64 seq = 0;
__s32 opt;
+ cpu_set_t *cpuset;
signal(SIGINT, sigint_handler);
signal(SIGTERM, sigint_handler);
@@ -70,9 +71,30 @@ int main(int argc, char **argv)
/* Resize arrays so their element count is equal to cpu count. */
RESIZE_ARRAY(data, cpu_gimme_task, skel->rodata->nr_cpu_ids);
+ RESIZE_ARRAY(data, cpu_started_at, skel->rodata->nr_cpu_ids);
SCX_BUG_ON(scx_central__load(skel), "Failed to load skel");
+ /*
+ * Affinitize the loading thread to the central CPU, as:
+ * - That's where the BPF timer is first invoked in the BPF program.
+ * - We probably don't want this user space component to take up a core
+ * from a task that would benefit from avoiding preemption on one of
+ * the tickless cores.
+ *
+ * central_init() arms the timer with BPF_F_TIMER_CPU_PIN and fails if it
+ * isn't running on the central CPU, so the affinity must be set before the
+ * scheduler is attached.
+ */
+ cpuset = CPU_ALLOC(skel->rodata->nr_cpu_ids);
+ SCX_BUG_ON(!cpuset, "Failed to allocate cpuset");
+ CPU_ZERO(cpuset);
+ CPU_SET(skel->rodata->central_cpu, cpuset);
+ SCX_BUG_ON(sched_setaffinity(0, CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids), cpuset),
+ "Failed to affinitize to central CPU %d (max %d)",
+ skel->rodata->central_cpu, skel->rodata->nr_cpu_ids - 1);
+ CPU_FREE(cpuset);
+
link = bpf_map__attach_struct_ops(skel->maps.central_ops);
SCX_BUG_ON(!link, "Failed to attach struct_ops");
@@ -83,7 +105,8 @@ int main(int argc, char **argv)
skel->bss->nr_locals,
skel->bss->nr_queued,
skel->bss->nr_lost_pids);
- printf(" dispatch:%10lu mismatch:%10lu retry:%10lu\n",
+ printf("timer :%10lu dispatch:%10lu mismatch:%10lu retry:%10lu\n",
+ skel->bss->nr_timers,
skel->bss->nr_dispatches,
skel->bss->nr_mismatches,
skel->bss->nr_retries);