@@ -19,6 +19,7 @@ enum scx_consts {
SCX_EXIT_MSG_LEN = 1024,
SCX_SLICE_DFL = 20 * NSEC_PER_MSEC,
+ SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */
};
/*
@@ -1222,13 +1222,16 @@ bool sched_can_stop_tick(struct rq *rq)
return true;
/*
- * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
- * if there's more than one we need the tick for involuntary
- * preemption.
+ * If there are no DL,RR/FIFO tasks, there must only be CFS or SCX tasks
+ * left. For CFS, if there's more than one we need the tick for
+ * involuntary preemption. For SCX, ask.
*/
if (!scx_switched_all() && rq->nr_running > 1)
return false;
+ if (scx_enabled() && !scx_can_stop_tick(rq))
+ return false;
+
return true;
}
#endif /* CONFIG_NO_HZ_FULL */
@@ -477,7 +477,8 @@ static void update_curr_scx(struct rq *rq)
account_group_exec_runtime(curr, delta_exec);
cgroup_account_cputime(curr, delta_exec);
- curr->scx.slice -= min(curr->scx.slice, delta_exec);
+ if (curr->scx.slice != SCX_SLICE_INF)
+ curr->scx.slice -= min(curr->scx.slice, delta_exec);
}
static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
@@ -1399,6 +1400,20 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
SCX_CALL_OP(SCX_KF_REST, running, p);
watchdog_unwatch_task(p, true);
+
+ /*
+ * @p is getting newly scheduled or got kicked after someone updated its
+ * slice. Refresh whether tick can be stopped. See can_stop_tick_scx().
+ */
+ if ((p->scx.slice == SCX_SLICE_INF) !=
+ (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) {
+ if (p->scx.slice == SCX_SLICE_INF)
+ rq->scx.flags |= SCX_RQ_CAN_STOP_TICK;
+ else
+ rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK;
+
+ sched_update_tick_dependency(rq);
+ }
}
static void put_prev_task_scx(struct rq *rq, struct task_struct *p)
@@ -1981,6 +1996,26 @@ int scx_check_setscheduler(struct task_struct *p, int policy)
return 0;
}
+#ifdef CONFIG_NO_HZ_FULL
+bool scx_can_stop_tick(struct rq *rq)
+{
+ struct task_struct *p = rq->curr;
+
+ if (scx_ops_disabling())
+ return false;
+
+ if (p->sched_class != &ext_sched_class)
+ return true;
+
+ /*
+ * @rq can dispatch from different DSQs, so we can't tell whether it
+ * needs the tick or not by looking at nr_running. Allow stopping ticks
+ * iff the BPF scheduler indicated so. See set_next_task_scx().
+ */
+ return rq->scx.flags & SCX_RQ_CAN_STOP_TICK;
+}
+#endif
+
/*
* Omitted operations:
*
@@ -2141,7 +2176,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
struct rhashtable_iter rht_iter;
struct scx_dispatch_q *dsq;
const char *reason;
- int i, type;
+ int i, cpu, type;
type = atomic_read(&scx_exit_type);
while (true) {
@@ -2239,6 +2274,10 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
scx_task_iter_exit(&sti);
spin_unlock_irq(&scx_tasks_lock);
+ /* kick all CPUs to restore ticks */
+ for_each_possible_cpu(cpu)
+ resched_cpu(cpu);
+
forward_progress_guaranteed:
/*
* Here, every runnable task is guaranteed to make forward progress and
@@ -102,6 +102,7 @@ int scx_fork(struct task_struct *p);
void scx_post_fork(struct task_struct *p);
void scx_cancel_fork(struct task_struct *p);
int scx_check_setscheduler(struct task_struct *p, int policy);
+bool scx_can_stop_tick(struct rq *rq);
void init_sched_ext_class(void);
__printf(2, 3) void scx_ops_error_type(enum scx_exit_type type,
@@ -162,6 +163,7 @@ static inline void scx_post_fork(struct task_struct *p) {}
static inline void scx_cancel_fork(struct task_struct *p) {}
static inline int scx_check_setscheduler(struct task_struct *p,
int policy) { return 0; }
+static inline bool scx_can_stop_tick(struct rq *rq) { return true; }
static inline void init_sched_ext_class(void) {}
static inline void scx_notify_sched_tick(void) {}
@@ -686,12 +686,18 @@ struct cfs_rq {
};
#ifdef CONFIG_SCHED_CLASS_EXT
+/* scx_rq->flags, protected by the rq lock */
+enum scx_rq_flags {
+ SCX_RQ_CAN_STOP_TICK = 1 << 0,
+};
+
struct scx_rq {
struct scx_dispatch_q local_dsq;
struct list_head watchdog_list;
u64 ops_qseq;
u64 extra_enq_flags; /* see move_task_to_local_dsq() */
u32 nr_running;
+ u32 flags;
cpumask_var_t cpus_to_kick;
cpumask_var_t cpus_to_preempt;
struct irq_work kick_cpus_irq_work;
@@ -13,7 +13,26 @@
* through per-CPU BPF queues. The current design is chosen to maximally
* utilize and verify various SCX mechanisms such as LOCAL_ON dispatching.
*
- * b. Preemption
+ * b. Tickless operation
+ *
+ * All tasks are dispatched with the infinite slice which allows stopping the
+ * ticks on CONFIG_NO_HZ_FULL kernels running with the proper nohz_full
+ * parameter. The tickless operation can be observed through
+ * /proc/interrupts.
+ *
+ * Periodic switching is enforced by a periodic timer checking all CPUs and
+ * preempting them as necessary. Unfortunately, BPF timer currently doesn't
+ * have a way to pin to a specific CPU, so the periodic timer isn't pinned to
+ * the central CPU.
+ *
+ * c. Preemption
+ *
+ * Kthreads are unconditionally queued to the head of a matching local dsq
+ * and dispatched with SCX_DSQ_PREEMPT. This ensures that a kthread is always
+ * prioritized over user threads, which is required for ensuring forward
+ * progress as e.g. the periodic timer may run on a ksoftirqd and if the
+ * ksoftirqd gets starved by a user thread, there may not be anything else to
+ * vacate that user thread.
*
* SCX_KICK_PREEMPT is used to trigger scheduling and CPUs to move to the
* next tasks.
@@ -42,7 +61,7 @@ const volatile s32 central_cpu;
const volatile u32 nr_cpu_ids = 64; /* !0 for veristat, set during init */
u64 nr_total, nr_locals, nr_queued, nr_lost_pids;
-u64 nr_dispatches, nr_mismatches, nr_retries;
+u64 nr_timers, nr_dispatches, nr_mismatches, nr_retries;
u64 nr_overflows;
struct user_exit_info uei;
@@ -55,6 +74,7 @@ struct {
/* can't use percpu map due to bad lookups */
static bool cpu_gimme_task[MAX_CPUS];
+static u64 cpu_started_at[MAX_CPUS];
struct central_timer {
struct bpf_timer timer;
@@ -67,6 +87,11 @@ struct {
__type(value, struct central_timer);
} central_timer SEC(".maps");
+static bool vtime_before(u64 a, u64 b)
+{
+ return (s64)(a - b) < 0;
+}
+
s32 BPF_STRUCT_OPS(central_select_cpu, struct task_struct *p,
s32 prev_cpu, u64 wake_flags)
{
@@ -85,9 +110,22 @@ void BPF_STRUCT_OPS(central_enqueue, struct task_struct *p, u64 enq_flags)
__sync_fetch_and_add(&nr_total, 1);
+ /*
+ * Push per-cpu kthreads at the head of local dsq's and preempt the
+ * corresponding CPU. This ensures that e.g. ksoftirqd isn't blocked
+ * behind other threads which is necessary for forward progress
+ * guarantee as we depend on the BPF timer which may run from ksoftirqd.
+ */
+ if ((p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) {
+ __sync_fetch_and_add(&nr_locals, 1);
+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_INF,
+ enq_flags | SCX_ENQ_PREEMPT);
+ return;
+ }
+
if (bpf_map_push_elem(¢ral_q, &pid, 0)) {
__sync_fetch_and_add(&nr_overflows, 1);
- scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_DFL, enq_flags);
+ scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, enq_flags);
return;
}
@@ -120,13 +158,13 @@ static bool dispatch_to_cpu(s32 cpu)
*/
if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) {
__sync_fetch_and_add(&nr_mismatches, 1);
- scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_DFL, 0);
+ scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0);
bpf_task_release(p);
continue;
}
/* dispatch to local and mark that @cpu doesn't need more */
- scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, 0);
+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_INF, 0);
if (cpu != central_cpu)
scx_bpf_kick_cpu(cpu, 0);
@@ -194,12 +232,81 @@ void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev)
}
}
+void BPF_STRUCT_OPS(central_running, struct task_struct *p)
+{
+ s32 cpu = scx_bpf_task_cpu(p);
+ u64 *started_at = MEMBER_VPTR(cpu_started_at, [cpu]);
+ if (started_at)
+ *started_at = bpf_ktime_get_ns() ?: 1; /* 0 indicates idle */
+}
+
+void BPF_STRUCT_OPS(central_stopping, struct task_struct *p, bool runnable)
+{
+ s32 cpu = scx_bpf_task_cpu(p);
+ u64 *started_at = MEMBER_VPTR(cpu_started_at, [cpu]);
+ if (started_at)
+ *started_at = 0;
+}
+
+static int central_timerfn(void *map, int *key, struct bpf_timer *timer)
+{
+ u64 now = bpf_ktime_get_ns();
+ u64 nr_to_kick = nr_queued;
+ s32 i;
+
+ bpf_for(i, 0, nr_cpu_ids) {
+ s32 cpu = (nr_timers + i) % nr_cpu_ids;
+ u64 *started_at;
+
+ if (cpu == central_cpu)
+ continue;
+
+ /* kick iff the current one exhausted its slice */
+ started_at = MEMBER_VPTR(cpu_started_at, [cpu]);
+ if (started_at && *started_at &&
+ vtime_before(now, *started_at + SCX_SLICE_DFL))
+ continue;
+
+ /* and there's something pending */
+ if (scx_bpf_dsq_nr_queued(FALLBACK_DSQ_ID) ||
+ scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | cpu))
+ ;
+ else if (nr_to_kick)
+ nr_to_kick--;
+ else
+ continue;
+
+ scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT);
+ }
+
+ scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT);
+
+ bpf_timer_start(timer, TIMER_INTERVAL_NS, 0);
+ __sync_fetch_and_add(&nr_timers, 1);
+ return 0;
+}
+
int BPF_STRUCT_OPS_SLEEPABLE(central_init)
{
+ u32 key = 0;
+ struct bpf_timer *timer;
+ int ret;
+
if (!switch_partial)
scx_bpf_switch_all();
- return scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1);
+ ret = scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1);
+ if (ret)
+ return ret;
+
+ timer = bpf_map_lookup_elem(¢ral_timer, &key);
+ if (!timer)
+ return -ESRCH;
+
+ bpf_timer_init(timer, ¢ral_timer, CLOCK_MONOTONIC);
+ bpf_timer_set_callback(timer, central_timerfn);
+ ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, 0);
+ return ret;
}
void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei)
@@ -219,6 +326,8 @@ struct sched_ext_ops central_ops = {
.select_cpu = (void *)central_select_cpu,
.enqueue = (void *)central_enqueue,
.dispatch = (void *)central_dispatch,
+ .running = (void *)central_running,
+ .stopping = (void *)central_stopping,
.init = (void *)central_init,
.exit = (void *)central_exit,
.name = "central",
@@ -76,7 +76,8 @@ int main(int argc, char **argv)
skel->bss->nr_locals,
skel->bss->nr_queued,
skel->bss->nr_lost_pids);
- printf(" dispatch:%10lu mismatch:%10lu retry:%10lu\n",
+ printf("timer :%10lu dispatch:%10lu mismatch:%10lu retry:%10lu\n",
+ skel->bss->nr_timers,
skel->bss->nr_dispatches,
skel->bss->nr_mismatches,
skel->bss->nr_retries);