@@ -583,6 +583,7 @@ struct sched_ext_ops {
struct scx_dispatch_q {
raw_spinlock_t lock;
struct list_head fifo; /* processed in dispatching order */
+ struct rb_root_cached priq; /* processed in p->scx.dsq_vtime order */
u32 nr;
u64 id;
struct rhash_head hash_node;
@@ -595,6 +596,7 @@ enum scx_ent_flags {
SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */
SCX_TASK_BAL_KEEP = 1 << 1, /* balance decided to keep current */
SCX_TASK_ENQ_LOCAL = 1 << 2, /* used by scx_select_cpu_dfl() to set SCX_ENQ_LOCAL */
+ SCX_TASK_ON_DSQ_PRIQ = 1 << 3, /* task is queued on the priority queue of a dsq */
SCX_TASK_OPS_PREPPED = 1 << 8, /* prepared for BPF scheduler enable */
SCX_TASK_OPS_ENABLED = 1 << 9, /* task has BPF scheduler enabled */
@@ -636,7 +638,10 @@ enum scx_kf_mask {
*/
struct sched_ext_entity {
struct scx_dispatch_q *dsq;
- struct list_head dsq_node;
+ struct {
+ struct list_head fifo; /* dispatch order */
+ struct rb_node priq; /* p->scx.dsq_vtime order */
+ } dsq_node;
struct list_head watchdog_node;
u32 flags; /* protected by rq lock */
u32 weight;
@@ -664,6 +669,15 @@ struct sched_ext_entity {
*/
u64 slice;
+ /*
+ * Used to order tasks when dispatching to the vtime-ordered priority
+ * queue of a dsq. This is usually set through scx_bpf_dispatch_vtime()
+ * but can also be modified directly by the BPF scheduler. Modifying it
+ * while a task is queued on a dsq may mangle the ordering and is not
+ * recommended.
+ */
+ u64 dsq_vtime;
+
/*
* If set, reject future sched_setscheduler(2) calls updating the policy
* to %SCHED_EXT with -%EACCES.
@@ -105,7 +105,7 @@ struct task_struct init_task
#endif
#ifdef CONFIG_SCHED_CLASS_EXT
.scx = {
- .dsq_node = LIST_HEAD_INIT(init_task.scx.dsq_node),
+ .dsq_node.fifo = LIST_HEAD_INIT(init_task.scx.dsq_node.fifo),
.watchdog_node = LIST_HEAD_INIT(init_task.scx.watchdog_node),
.sticky_cpu = -1,
.holding_cpu = -1,
@@ -4495,7 +4495,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
#ifdef CONFIG_SCHED_CLASS_EXT
p->scx.dsq = NULL;
- INIT_LIST_HEAD(&p->scx.dsq_node);
+ INIT_LIST_HEAD(&p->scx.dsq_node.fifo);
+ RB_CLEAR_NODE(&p->scx.dsq_node.priq);
INIT_LIST_HEAD(&p->scx.watchdog_node);
p->scx.flags = 0;
p->scx.weight = 0;
@@ -595,12 +595,25 @@ static void update_curr_scx(struct rq *rq)
}
}
+static bool scx_dsq_priq_less(struct rb_node *node_a,
+ const struct rb_node *node_b)
+{
+ const struct task_struct *a =
+ container_of(node_a, struct task_struct, scx.dsq_node.priq);
+ const struct task_struct *b =
+ container_of(node_b, struct task_struct, scx.dsq_node.priq);
+
+ return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime);
+}
+
static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
u64 enq_flags)
{
bool is_local = dsq->id == SCX_DSQ_LOCAL;
- WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node));
+ WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node.fifo));
+ WARN_ON_ONCE((p->scx.flags & SCX_TASK_ON_DSQ_PRIQ) ||
+ !RB_EMPTY_NODE(&p->scx.dsq_node.priq));
if (!is_local) {
raw_spin_lock(&dsq->lock);
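
scx_dsq_priq_less() orders tasks by dsq_vtime through time_before64(), which compares the signed difference of the two values and therefore stays correct across u64 wraparound as long as the vtimes are within 2^63 of each other. A minimal userspace sketch of that property (illustration only, not part of the patch):

    #include <stdint.h>
    #include <stdio.h>

    /* mirrors the kernel's time_before64(): true when (s64)(a - b) < 0 */
    static int vtime_before(uint64_t a, uint64_t b)
    {
            return (int64_t)(a - b) < 0;
    }

    int main(void)
    {
            uint64_t near_wrap = UINT64_MAX - 5;    /* vtime about to wrap */
            uint64_t wrapped = near_wrap + 10;      /* wraps around to 4 */

            /* the numerically huge value still sorts first: prints 1 then 0 */
            printf("%d\n", vtime_before(near_wrap, wrapped));
            printf("%d\n", vtime_before(wrapped, near_wrap));
            return 0;
    }
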
@@ -613,10 +626,16 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
}
}
- if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
- list_add(&p->scx.dsq_node, &dsq->fifo);
- else
- list_add_tail(&p->scx.dsq_node, &dsq->fifo);
+ if (enq_flags & SCX_ENQ_DSQ_PRIQ) {
+ p->scx.flags |= SCX_TASK_ON_DSQ_PRIQ;
+ rb_add_cached(&p->scx.dsq_node.priq, &dsq->priq,
+ scx_dsq_priq_less);
+ } else {
+ if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
+ list_add(&p->scx.dsq_node.fifo, &dsq->fifo);
+ else
+ list_add_tail(&p->scx.dsq_node.fifo, &dsq->fifo);
+ }
dsq->nr++;
p->scx.dsq = dsq;
@@ -645,13 +664,31 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
}
}
+static void task_unlink_from_dsq(struct task_struct *p,
+ struct scx_dispatch_q *dsq)
+{
+ if (p->scx.flags & SCX_TASK_ON_DSQ_PRIQ) {
+ rb_erase_cached(&p->scx.dsq_node.priq, &dsq->priq);
+ RB_CLEAR_NODE(&p->scx.dsq_node.priq);
+ p->scx.flags &= ~SCX_TASK_ON_DSQ_PRIQ;
+ } else {
+ list_del_init(&p->scx.dsq_node.fifo);
+ }
+}
+
+static bool task_linked_on_dsq(struct task_struct *p)
+{
+ return !list_empty(&p->scx.dsq_node.fifo) ||
+ !RB_EMPTY_NODE(&p->scx.dsq_node.priq);
+}
+
static void dispatch_dequeue(struct scx_rq *scx_rq, struct task_struct *p)
{
struct scx_dispatch_q *dsq = p->scx.dsq;
bool is_local = dsq == &scx_rq->local_dsq;
if (!dsq) {
- WARN_ON_ONCE(!list_empty(&p->scx.dsq_node));
+ WARN_ON_ONCE(task_linked_on_dsq(p));
/*
* When dispatching directly from the BPF scheduler to a local
* DSQ, the task isn't associated with any DSQ but
@@ -672,8 +709,8 @@ static void dispatch_dequeue(struct scx_rq *scx_rq, struct task_struct *p)
*/
if (p->scx.holding_cpu < 0) {
/* @p must still be on @dsq, dequeue */
- WARN_ON_ONCE(list_empty(&p->scx.dsq_node));
- list_del_init(&p->scx.dsq_node);
+ WARN_ON_ONCE(!task_linked_on_dsq(p));
+ task_unlink_from_dsq(p, dsq);
dsq->nr--;
} else {
/*
@@ -682,7 +719,7 @@ static void dispatch_dequeue(struct scx_rq *scx_rq, struct task_struct *p)
* holding_cpu which tells dispatch_to_local_dsq() that it lost
* the race.
*/
- WARN_ON_ONCE(!list_empty(&p->scx.dsq_node));
+ WARN_ON_ONCE(task_linked_on_dsq(p));
p->scx.holding_cpu = -1;
}
p->scx.dsq = NULL;
@@ -1146,33 +1183,52 @@ static void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq_flags *rf,
#endif /* CONFIG_SMP */
+static bool task_can_run_on_rq(struct task_struct *p, struct rq *rq)
+{
+ return likely(test_rq_online(rq)) && !is_migration_disabled(p) &&
+ cpumask_test_cpu(cpu_of(rq), p->cpus_ptr);
+}
+
static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf,
struct scx_dispatch_q *dsq)
{
struct scx_rq *scx_rq = &rq->scx;
struct task_struct *p;
+ struct rb_node *rb_node;
struct rq *task_rq;
bool moved = false;
retry:
- if (list_empty(&dsq->fifo))
+ if (list_empty(&dsq->fifo) && !rb_first_cached(&dsq->priq))
return false;
raw_spin_lock(&dsq->lock);
- list_for_each_entry(p, &dsq->fifo, scx.dsq_node) {
+
+ list_for_each_entry(p, &dsq->fifo, scx.dsq_node.fifo) {
+ task_rq = task_rq(p);
+ if (rq == task_rq)
+ goto this_rq;
+ if (task_can_run_on_rq(p, rq))
+ goto remote_rq;
+ }
+
+ for (rb_node = rb_first_cached(&dsq->priq); rb_node;
+ rb_node = rb_next(rb_node)) {
+ p = container_of(rb_node, struct task_struct, scx.dsq_node.priq);
task_rq = task_rq(p);
if (rq == task_rq)
goto this_rq;
- if (likely(test_rq_online(rq)) && !is_migration_disabled(p) &&
- cpumask_test_cpu(cpu_of(rq), p->cpus_ptr))
+ if (task_can_run_on_rq(p, rq))
goto remote_rq;
}
+
raw_spin_unlock(&dsq->lock);
return false;
this_rq:
/* @dsq is locked and @p is on this rq */
WARN_ON_ONCE(p->scx.holding_cpu >= 0);
- list_move_tail(&p->scx.dsq_node, &scx_rq->local_dsq.fifo);
+ task_unlink_from_dsq(p, dsq);
+ list_add_tail(&p->scx.dsq_node.fifo, &scx_rq->local_dsq.fifo);
dsq->nr--;
scx_rq->local_dsq.nr++;
p->scx.dsq = &scx_rq->local_dsq;
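
Because the FIFO list is scanned before the rbtree, tasks dispatched with scx_bpf_dispatch() to a DSQ are always consumed ahead of tasks dispatched with scx_bpf_dispatch_vtime() to the same DSQ, and the latter are consumed in ascending dsq_vtime order. A hedged BPF-side sketch of mixing the two (MY_DSQ and is_batch_task() are illustrative assumptions; the DSQ would be created with scx_bpf_create_dsq() during init):

    void BPF_STRUCT_OPS(mixed_enqueue, struct task_struct *p, u64 enq_flags)
    {
            if (is_batch_task(p))
                    /* vtime-ordered, consumed only once the FIFO is empty */
                    scx_bpf_dispatch_vtime(p, MY_DSQ, SCX_SLICE_DFL,
                                           p->scx.dsq_vtime, enq_flags);
            else
                    /* FIFO, consumed first in dispatch order */
                    scx_bpf_dispatch(p, MY_DSQ, SCX_SLICE_DFL, enq_flags);
    }

    void BPF_STRUCT_OPS(mixed_dispatch, s32 cpu, struct task_struct *prev)
    {
            /* pulls FIFO entries first, then priq entries, onto the local DSQ */
            scx_bpf_consume(MY_DSQ);
    }
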
@@ -1189,7 +1245,7 @@ static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf,
* move_task_to_local_dsq().
*/
WARN_ON_ONCE(p->scx.holding_cpu >= 0);
- list_del_init(&p->scx.dsq_node);
+ task_unlink_from_dsq(p, dsq);
dsq->nr--;
p->scx.holding_cpu = raw_smp_processor_id();
raw_spin_unlock(&dsq->lock);
@@ -1692,8 +1748,18 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p)
static struct task_struct *first_local_task(struct rq *rq)
{
- return list_first_entry_or_null(&rq->scx.local_dsq.fifo,
- struct task_struct, scx.dsq_node);
+ struct rb_node *rb_node;
+
+ if (!list_empty(&rq->scx.local_dsq.fifo))
+ return list_first_entry(&rq->scx.local_dsq.fifo,
+ struct task_struct, scx.dsq_node.fifo);
+
+ rb_node = rb_first_cached(&rq->scx.local_dsq.priq);
+ if (rb_node)
+ return container_of(rb_node,
+ struct task_struct, scx.dsq_node.priq);
+
+ return NULL;
}
static struct task_struct *pick_next_task_scx(struct rq *rq)
@@ -3360,6 +3426,9 @@ static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log,
if (off >= offsetof(struct task_struct, scx.slice) &&
off + size <= offsetofend(struct task_struct, scx.slice))
return SCALAR_VALUE;
+ if (off >= offsetof(struct task_struct, scx.dsq_vtime) &&
+ off + size <= offsetofend(struct task_struct, scx.dsq_vtime))
+ return SCALAR_VALUE;
if (off >= offsetof(struct task_struct, scx.disallow) &&
off + size <= offsetofend(struct task_struct, scx.disallow))
return SCALAR_VALUE;
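
With scx.dsq_vtime whitelisted above alongside scx.slice and scx.disallow, BPF schedulers can also read and write the field directly rather than only through scx_bpf_dispatch_vtime(). One plausible use is starting a task that has just come under the BPF scheduler at the current virtual time so it begins with neither a deficit nor a surplus; the helper name and the global vtime_now clock below are assumptions modeled on the updated sample schedulers:

    /* assumed helper, invoked when @p first comes under the BPF scheduler */
    static void init_task_vtime(struct task_struct *p)
    {
            p->scx.dsq_vtime = vtime_now;   /* vtime_now: assumed global clock */
    }
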
@@ -3745,8 +3814,42 @@ void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice,
scx_dispatch_commit(p, dsq_id, enq_flags);
}
+/**
+ * scx_bpf_dispatch_vtime - Dispatch a task into the vtime priority queue of a DSQ
+ * @p: task_struct to dispatch
+ * @dsq_id: DSQ to dispatch to
+ * @slice: duration @p can run for in nsecs
+ * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
+ * @enq_flags: SCX_ENQ_*
+ *
+ * Dispatch @p into the vtime priority queue of the DSQ identified by @dsq_id.
+ * Tasks queued into the priority queue are ordered by @vtime and always
+ * consumed after the tasks in the FIFO queue. All other aspects are identical
+ * to scx_bpf_dispatch().
+ *
+ * @vtime ordering is according to time_before64() which considers wrapping. A
+ * numerically larger vtime may indicate an earlier position in the ordering and
+ * vice-versa.
+ */
+void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice,
+ u64 vtime, u64 enq_flags)
+{
+ if (!scx_dispatch_preamble(p, enq_flags))
+ return;
+
+ if (slice)
+ p->scx.slice = slice;
+ else
+ p->scx.slice = p->scx.slice ?: 1;
+
+ p->scx.dsq_vtime = vtime;
+
+ scx_dispatch_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
+}
+
BTF_SET8_START(scx_kfunc_ids_enqueue_dispatch)
BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU)
BTF_SET8_END(scx_kfunc_ids_enqueue_dispatch)
static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = {
@@ -63,6 +63,7 @@ enum scx_enq_flags {
__SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56,
SCX_ENQ_CLEAR_OPSS = 1LLU << 56,
+ SCX_ENQ_DSQ_PRIQ = 1LLU << 57,
};
enum scx_deq_flags {
@@ -57,6 +57,7 @@ s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym;
bool scx_bpf_consume(u64 dsq_id) __ksym;
u32 scx_bpf_dispatch_nr_slots(void) __ksym;
void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym;
+void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym;
void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym;
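
For reference, the smallest possible caller of the new kfunc mirrors what the updated scx_example_simple below does for global queueing: dispatch to SCX_DSQ_GLOBAL with the task's current dsq_vtime, leaving any clamping or weight scaling to the scheduler. A minimal sketch:

    void BPF_STRUCT_OPS(vtime_enqueue, struct task_struct *p, u64 enq_flags)
    {
            scx_bpf_dispatch_vtime(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL,
                                   p->scx.dsq_vtime, enq_flags);
    }
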
@@ -38,6 +38,10 @@
* this isn't a real concern especially given the performance gain. Also, there
* are ways to mitigate the problem further by e.g. introducing an extra
* scheduling layer on cgroup delegation boundaries.
+ *
+ * The scheduler first picks the cgroup to run and then schedules the tasks
+ * within it using nested weighted vtime scheduling by default. The
+ * cgroup-internal scheduling can be switched to FIFO with the -f option.
*/
#include "scx_common.bpf.h"
#include "user_exit_info.h"
@@ -47,6 +51,7 @@ char _license[] SEC("license") = "GPL";
const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */
const volatile u64 cgrp_slice_ns = SCX_SLICE_DFL;
+const volatile bool fifo_sched;
const volatile bool switch_partial;
u64 cvtime_now;
@@ -350,7 +355,21 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
if (!cgc)
goto out_release;
- scx_bpf_dispatch(p, cgrp->kn->id, SCX_SLICE_DFL, enq_flags);
+ if (fifo_sched) {
+ scx_bpf_dispatch(p, cgrp->kn->id, SCX_SLICE_DFL, enq_flags);
+ } else {
+ u64 tvtime = p->scx.dsq_vtime;
+
+ /*
+ * Limit the amount of budget that an idling task can accumulate
+ * to one slice.
+ */
+ if (vtime_before(tvtime, cgc->tvtime_now - SCX_SLICE_DFL))
+ tvtime = cgc->tvtime_now - SCX_SLICE_DFL;
+
+ scx_bpf_dispatch_vtime(p, cgrp->kn->id, SCX_SLICE_DFL,
+ tvtime, enq_flags);
+ }
cgrp_enqueued(cgrp, cgc);
out_release:
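
The clamp above keeps a task that has been sleeping from banking unlimited vtime credit. A worked illustration with toy numbers (treating SCX_SLICE_DFL as 20 units for readability):

    /*
     * cgc->tvtime_now = 1000, p->scx.dsq_vtime = 100 (task slept a while).
     * 100 is before 1000 - 20 = 980, so tvtime is clamped to 980: the woken
     * task still sorts ahead of its queued peers, but only by one slice
     * worth of vtime instead of 900, so it cannot starve the rest of the
     * cgroup while its vtime catches up.
     */
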
@@ -462,12 +481,40 @@ void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags)
bpf_cgroup_release(cgrp);
}
+void BPF_STRUCT_OPS(fcg_running, struct task_struct *p)
+{
+ struct cgroup *cgrp;
+ struct fcg_cgrp_ctx *cgc;
+
+ if (fifo_sched)
+ return;
+
+ cgrp = scx_bpf_task_cgroup(p);
+ cgc = find_cgrp_ctx(cgrp);
+ if (cgc) {
+ /*
+ * @cgc->tvtime_now always progresses forward as tasks start
+ * executing. The test and update can be performed concurrently
+ * from multiple CPUs and are thus racy. Any error should be
+ * contained and temporary. Let's just live with it.
+ */
+ if (vtime_before(cgc->tvtime_now, p->scx.dsq_vtime))
+ cgc->tvtime_now = p->scx.dsq_vtime;
+ }
+ bpf_cgroup_release(cgrp);
+}
+
void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable)
{
struct fcg_task_ctx *taskc;
struct cgroup *cgrp;
struct fcg_cgrp_ctx *cgc;
+ /* scale the execution time by the inverse of the weight and charge */
+ if (!fifo_sched)
+ p->scx.dsq_vtime +=
+ (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight;
+
taskc = bpf_task_storage_get(&task_ctx, p, 0, 0);
if (!taskc) {
scx_bpf_error("task_ctx lookup failed");
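
The charge added to fcg_stopping() above scales the consumed slice by the inverse of the task's weight. Since p->scx.slice counts down while the task runs, (SCX_SLICE_DFL - p->scx.slice) is the service actually received; a short worked illustration:

    /*
     * For 10ms of service:
     *   weight 100 (default) -> dsq_vtime advances by 10ms
     *   weight 200           -> advances by  5ms, so the task sorts earlier next time
     *   weight  50           -> advances by 20ms, so the task sorts later next time
     * Higher weight thus yields a proportionally larger share of the
     * cgroup-internal CPU time.
     */
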
@@ -811,6 +858,7 @@ struct sched_ext_ops flatcg_ops = {
.enqueue = (void *)fcg_enqueue,
.dispatch = (void *)fcg_dispatch,
.runnable = (void *)fcg_runnable,
+ .running = (void *)fcg_running,
.stopping = (void *)fcg_stopping,
.quiescent = (void *)fcg_quiescent,
.prep_enable = (void *)fcg_prep_enable,
@@ -30,6 +30,7 @@ const char help_fmt[] =
"\n"
" -s SLICE_US Override slice duration\n"
" -i INTERVAL Report interval\n"
+" -f Use FIFO scheduling instead of weighted vtime scheduling\n"
" -p Switch only tasks on SCHED_EXT policy intead of all\n"
" -h Display this help and exit\n";
@@ -149,6 +150,9 @@ int main(int argc, char **argv)
case 'd':
dump_cgrps = true;
break;
+ case 'f':
+ skel->rodata->fifo_sched = true;
+ break;
case 'p':
skel->rodata->switch_partial = true;
break;
@@ -2,11 +2,20 @@
/*
* A simple scheduler.
*
- * A simple global FIFO scheduler. It also demonstrates the following niceties.
+ * By default, it operates as a simple global weighted vtime scheduler and can
+ * be switched to FIFO scheduling. It also demonstrates the following niceties.
*
* - Statistics tracking how many tasks are queued to local and global dsq's.
* - Termination notification for userspace.
*
+ * While very simple, this scheduler should work reasonably well on CPUs with a
+ * uniform L3 cache topology. Although preemption is not implemented, the fact
+ * that the scheduling queue is shared across all CPUs means that whatever is at
+ * the front of the queue is likely to be executed fairly quickly given a
+ * sufficient number of CPUs. The FIFO scheduling mode may be beneficial to some
+ * workloads but comes with the usual problems of FIFO scheduling where
+ * saturating threads can easily drown out interactive ones.
+ *
* Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2022 Tejun Heo <tj@kernel.org>
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
@@ -15,8 +24,10 @@
char _license[] SEC("license") = "GPL";
+const volatile bool fifo_sched;
const volatile bool switch_partial;
+static u64 vtime_now;
struct user_exit_info uei;
struct {
@@ -33,8 +44,18 @@ static void stat_inc(u32 idx)
(*cnt_p)++;
}
+static inline bool vtime_before(u64 a, u64 b)
+{
+ return (s64)(a - b) < 0;
+}
+
void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags)
{
+ /*
+ * If scx_select_cpu_dfl() is setting %SCX_ENQ_LOCAL, it indicates that
+ * running @p on its CPU directly shouldn't affect fairness. Just queue
+ * it on the local FIFO.
+ */
if (enq_flags & SCX_ENQ_LOCAL) {
stat_inc(0); /* count local queueing */
scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
@@ -42,7 +63,46 @@ void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags)
}
stat_inc(1); /* count global queueing */
- scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+
+ if (fifo_sched) {
+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+ } else {
+ u64 vtime = p->scx.dsq_vtime;
+
+ /*
+ * Limit the amount of budget that an idling task can accumulate
+ * to one slice.
+ */
+ if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL))
+ vtime = vtime_now - SCX_SLICE_DFL;
+
+ scx_bpf_dispatch_vtime(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, vtime,
+ enq_flags);
+ }
+}
+
+void BPF_STRUCT_OPS(simple_running, struct task_struct *p)
+{
+ if (fifo_sched)
+ return;
+
+ /*
+ * Global vtime always progresses forward as tasks start executing. The
+ * test and update can be performed concurrently from multiple CPUs and
+ * are thus racy. Any error should be contained and temporary. Let's just
+ * live with it.
+ */
+ if (vtime_before(vtime_now, p->scx.dsq_vtime))
+ vtime_now = p->scx.dsq_vtime;
+}
+
+void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable)
+{
+ if (fifo_sched)
+ return;
+
+ /* scale the execution time by the inverse of the weight and charge */
+ p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight;
}
s32 BPF_STRUCT_OPS(simple_init)
@@ -60,6 +120,8 @@ void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei)
SEC(".struct_ops")
struct sched_ext_ops simple_ops = {
.enqueue = (void *)simple_enqueue,
+ .running = (void *)simple_running,
+ .stopping = (void *)simple_stopping,
.init = (void *)simple_init,
.exit = (void *)simple_exit,
.name = "simple",
@@ -21,6 +21,7 @@ const char help_fmt[] =
"\n"
"Usage: %s [-p]\n"
"\n"
+" -f Use FIFO scheduling instead of weighted vtime scheduling\n"
" -p Switch only tasks on SCHED_EXT policy intead of all\n"
" -h Display this help and exit\n";
@@ -65,8 +66,11 @@ int main(int argc, char **argv)
skel = scx_example_simple__open();
assert(skel);
- while ((opt = getopt(argc, argv, "ph")) != -1) {
+ while ((opt = getopt(argc, argv, "fph")) != -1) {
switch (opt) {
+ case 'f':
+ skel->rodata->fifo_sched = true;
+ break;
case 'p':
skel->rodata->switch_partial = true;
break;