@@ -59,6 +59,7 @@ enum scx_exit_type {
SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */
SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */
+ SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */
};
/*
@@ -314,6 +315,15 @@ struct sched_ext_ops {
*/
u64 flags;
+ /**
+ * timeout_ms - The maximum amount of time, in milliseconds, that a
+ * runnable task should be able to wait before being scheduled. The
+ * maximum timeout may not exceed the default timeout of 30 seconds.
+ *
+ * Defaults to the maximum allowed timeout value of 30 seconds.
+ */
+ u32 timeout_ms;
+
/**
* name - BPF scheduler's name
*
@@ -347,6 +357,7 @@ enum scx_ent_flags {
SCX_TASK_OPS_PREPPED = 1 << 8, /* prepared for BPF scheduler enable */
SCX_TASK_OPS_ENABLED = 1 << 9, /* task has BPF scheduler enabled */
+ SCX_TASK_WATCHDOG_RESET = 1 << 16, /* task watchdog counter should be reset */
SCX_TASK_DEQD_FOR_SLEEP = 1 << 17, /* last dequeue was for SLEEP */
SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */
@@ -380,12 +391,14 @@ enum scx_kf_mask {
struct sched_ext_entity {
struct scx_dispatch_q *dsq;
struct list_head dsq_node;
+ struct list_head watchdog_node;
u32 flags; /* protected by rq lock */
u32 weight;
s32 sticky_cpu;
s32 holding_cpu;
u32 kf_mask; /* see scx_kf_mask above */
atomic64_t ops_state;
+ unsigned long runnable_at;
/* BPF scheduler modifiable fields */
@@ -106,9 +106,11 @@ struct task_struct init_task
#ifdef CONFIG_SCHED_CLASS_EXT
.scx = {
.dsq_node = LIST_HEAD_INIT(init_task.scx.dsq_node),
+ .watchdog_node = LIST_HEAD_INIT(init_task.scx.watchdog_node),
.sticky_cpu = -1,
.holding_cpu = -1,
.ops_state = ATOMIC_INIT(0),
+ .runnable_at = INITIAL_JIFFIES,
.slice = SCX_SLICE_DFL,
},
#endif
@@ -4508,12 +4508,14 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
#ifdef CONFIG_SCHED_CLASS_EXT
p->scx.dsq = NULL;
INIT_LIST_HEAD(&p->scx.dsq_node);
+ INIT_LIST_HEAD(&p->scx.watchdog_node);
p->scx.flags = 0;
p->scx.weight = 0;
p->scx.sticky_cpu = -1;
p->scx.holding_cpu = -1;
p->scx.kf_mask = 0;
atomic64_set(&p->scx.ops_state, 0);
+ p->scx.runnable_at = INITIAL_JIFFIES;
p->scx.slice = SCX_SLICE_DFL;
#endif
@@ -5684,6 +5686,7 @@ void scheduler_tick(void)
if (sched_feat(LATENCY_WARN) && resched_latency)
resched_latency_warn(cpu, resched_latency);
+ scx_notify_sched_tick();
perf_event_task_tick();
#ifdef CONFIG_SMP
@@ -9,6 +9,7 @@
enum scx_internal_consts {
SCX_NR_ONLINE_OPS = SCX_OP_IDX(init),
SCX_DSP_DFL_MAX_BATCH = 32,
+ SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ,
};
enum scx_ops_enable_state {
@@ -87,6 +88,23 @@ static struct scx_exit_info scx_exit_info;
static atomic64_t scx_nr_rejected = ATOMIC64_INIT(0);
+/*
+ * The maximum amount of time in jiffies that a task may be runnable without
+ * being scheduled on a CPU. If this timeout is exceeded, it will trigger
+ * scx_ops_error().
+ */
+unsigned long scx_watchdog_timeout;
+
+/*
+ * The last time the delayed work was run. This delayed work relies on
+ * ksoftirqd being able to run to service timer interrupts, so it's possible
+ * that this work itself could get wedged. To account for this, we check that
+ * it's not stalled in the timer tick, and trigger an error if it is.
+ */
+unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
+
+static struct delayed_work scx_watchdog_work;
+
/* idle tracking */
#ifdef CONFIG_SMP
#ifdef CONFIG_CPUMASK_OFFSTACK
@@ -145,10 +163,6 @@ static DEFINE_PER_CPU(struct scx_dsp_ctx, scx_dsp_ctx);
void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice,
u64 enq_flags);
-__printf(2, 3) static void scx_ops_error_type(enum scx_exit_type type,
- const char *fmt, ...);
-#define scx_ops_error(fmt, args...) \
- scx_ops_error_type(SCX_EXIT_ERROR, fmt, ##args)
struct scx_task_iter {
struct sched_ext_entity cursor;
@@ -698,6 +712,27 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
dispatch_enqueue(&scx_dsq_global, p, enq_flags);
}
+static bool watchdog_task_watched(const struct task_struct *p)
+{
+ return !list_empty(&p->scx.watchdog_node);
+}
+
+static void watchdog_watch_task(struct rq *rq, struct task_struct *p)
+{
+ lockdep_assert_rq_held(rq);
+ if (p->scx.flags & SCX_TASK_WATCHDOG_RESET)
+ p->scx.runnable_at = jiffies;
+ p->scx.flags &= ~SCX_TASK_WATCHDOG_RESET;
+ list_add_tail(&p->scx.watchdog_node, &rq->scx.watchdog_list);
+}
+
+static void watchdog_unwatch_task(struct task_struct *p, bool reset_timeout)
+{
+ list_del_init(&p->scx.watchdog_node);
+ if (reset_timeout)
+ p->scx.flags |= SCX_TASK_WATCHDOG_RESET;
+}
+
static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags)
{
int sticky_cpu = p->scx.sticky_cpu;
@@ -716,9 +751,12 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p))
sticky_cpu = cpu_of(rq);
- if (p->scx.flags & SCX_TASK_QUEUED)
+ if (p->scx.flags & SCX_TASK_QUEUED) {
+ WARN_ON_ONCE(!watchdog_task_watched(p));
return;
+ }
+ watchdog_watch_task(rq, p);
p->scx.flags |= SCX_TASK_QUEUED;
rq->scx.nr_running++;
add_nr_running(rq, 1);
@@ -730,6 +768,8 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags)
{
u64 opss;
+ watchdog_unwatch_task(p, false);
+
/* acquire ensures that we see the preceding updates on QUEUED */
opss = atomic64_read_acquire(&p->scx.ops_state);
@@ -774,8 +814,10 @@ static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
{
struct scx_rq *scx_rq = &rq->scx;
- if (!(p->scx.flags & SCX_TASK_QUEUED))
+ if (!(p->scx.flags & SCX_TASK_QUEUED)) {
+ WARN_ON_ONCE(watchdog_task_watched(p));
return;
+ }
ops_dequeue(p, deq_flags);
@@ -1299,6 +1341,8 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
}
p->se.exec_start = rq_clock_task(rq);
+
+ watchdog_unwatch_task(p, true);
}
static void put_prev_task_scx(struct rq *rq, struct task_struct *p)
@@ -1342,11 +1386,14 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p)
*/
if (p->scx.flags & SCX_TASK_BAL_KEEP) {
p->scx.flags &= ~SCX_TASK_BAL_KEEP;
+ watchdog_watch_task(rq, p);
dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
return;
}
if (p->scx.flags & SCX_TASK_QUEUED) {
+ watchdog_watch_task(rq, p);
+
/*
* If @p has slice left and balance_scx() didn't tag it for
* keeping, @p is getting preempted by a higher priority
@@ -1604,6 +1651,49 @@ static void reset_idle_masks(void) {}
#endif /* CONFIG_SMP */
+static bool check_rq_for_timeouts(struct rq *rq)
+{
+ struct task_struct *p;
+ struct rq_flags rf;
+ bool timed_out = false;
+
+ rq_lock_irqsave(rq, &rf);
+ list_for_each_entry(p, &rq->scx.watchdog_list, scx.watchdog_node) {
+ unsigned long last_runnable = p->scx.runnable_at;
+
+ if (unlikely(time_after(jiffies,
+ last_runnable + scx_watchdog_timeout))) {
+ u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
+
+ scx_ops_error_type(SCX_EXIT_ERROR_STALL,
+ "%s[%d] failed to run for %u.%03us",
+ p->comm, p->pid,
+ dur_ms / 1000, dur_ms % 1000);
+ timed_out = true;
+ break;
+ }
+ }
+ rq_unlock_irqrestore(rq, &rf);
+
+ return timed_out;
+}
+
+static void scx_watchdog_workfn(struct work_struct *work)
+{
+ int cpu;
+
+ scx_watchdog_timestamp = jiffies;
+
+ for_each_online_cpu(cpu) {
+ if (unlikely(check_rq_for_timeouts(cpu_rq(cpu))))
+ break;
+
+ cond_resched();
+ }
+ queue_delayed_work(system_unbound_wq, to_delayed_work(work),
+ scx_watchdog_timeout / 2);
+}
+
static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
{
update_curr_scx(rq);
@@ -1635,7 +1725,7 @@ static int scx_ops_prepare_task(struct task_struct *p, struct task_group *tg)
}
}
- p->scx.flags |= SCX_TASK_OPS_PREPPED;
+ p->scx.flags |= (SCX_TASK_OPS_PREPPED | SCX_TASK_WATCHDOG_RESET);
return 0;
}
@@ -1967,6 +2057,8 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
break;
}
+ cancel_delayed_work_sync(&scx_watchdog_work);
+
switch (type) {
case SCX_EXIT_UNREG:
reason = "BPF scheduler unregistered";
@@ -1980,6 +2072,9 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
case SCX_EXIT_ERROR_BPF:
reason = "scx_bpf_error";
break;
+ case SCX_EXIT_ERROR_STALL:
+ reason = "runnable task stall";
+ break;
default:
reason = "<UNKNOWN>";
}
@@ -2164,8 +2259,8 @@ static void scx_ops_error_irq_workfn(struct irq_work *irq_work)
static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn);
-__printf(2, 3) static void scx_ops_error_type(enum scx_exit_type type,
- const char *fmt, ...)
+__printf(2, 3) void scx_ops_error_type(enum scx_exit_type type,
+ const char *fmt, ...)
{
struct scx_exit_info *ei = &scx_exit_info;
int none = SCX_EXIT_NONE;
@@ -2264,6 +2359,14 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
goto err_disable;
}
+ scx_watchdog_timeout = SCX_WATCHDOG_MAX_TIMEOUT;
+ if (ops->timeout_ms)
+ scx_watchdog_timeout = msecs_to_jiffies(ops->timeout_ms);
+
+ scx_watchdog_timestamp = jiffies;
+ queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
+ scx_watchdog_timeout / 2);
+
/*
* Lock out forks before opening the floodgate so that they don't wander
* into the operations prematurely.
@@ -2517,6 +2620,11 @@ static int bpf_scx_init_member(const struct btf_type *t,
if (ret == 0)
return -EINVAL;
return 1;
+ case offsetof(struct sched_ext_ops, timeout_ms):
+ if (*(u32 *)(udata + moff) > SCX_WATCHDOG_MAX_TIMEOUT)
+ return -E2BIG;
+ ops->timeout_ms = *(u32 *)(udata + moff);
+ return 1;
}
return 0;
@@ -2633,9 +2741,11 @@ void __init init_sched_ext_class(void)
struct rq *rq = cpu_rq(cpu);
init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
+ INIT_LIST_HEAD(&rq->scx.watchdog_list);
}
register_sysrq_key('S', &sysrq_sched_ext_reset_op);
+ INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn);
}
@@ -71,6 +71,8 @@ void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx);
extern const struct sched_class ext_sched_class;
extern const struct bpf_verifier_ops bpf_sched_ext_verifier_ops;
extern const struct file_operations sched_ext_fops;
+extern unsigned long scx_watchdog_timeout;
+extern unsigned long scx_watchdog_timestamp;
DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled);
#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled)
@@ -87,6 +89,28 @@ void scx_post_fork(struct task_struct *p);
void scx_cancel_fork(struct task_struct *p);
void init_sched_ext_class(void);
+__printf(2, 3) void scx_ops_error_type(enum scx_exit_type type,
+ const char *fmt, ...);
+#define scx_ops_error(fmt, args...) \
+ scx_ops_error_type(SCX_EXIT_ERROR, fmt, ##args)
+
+static inline void scx_notify_sched_tick(void)
+{
+ unsigned long last_check;
+
+ if (!scx_enabled())
+ return;
+
+ last_check = scx_watchdog_timestamp;
+ if (unlikely(time_after(jiffies, last_check + scx_watchdog_timeout))) {
+ u32 dur_ms = jiffies_to_msecs(jiffies - last_check);
+
+ scx_ops_error_type(SCX_EXIT_ERROR_STALL,
+ "watchdog failed to check in for %u.%03us",
+ dur_ms / 1000, dur_ms % 1000);
+ }
+}
+
static inline const struct sched_class *next_active_class(const struct sched_class *class)
{
class++;
@@ -119,6 +143,7 @@ static inline int scx_fork(struct task_struct *p) { return 0; }
static inline void scx_post_fork(struct task_struct *p) {}
static inline void scx_cancel_fork(struct task_struct *p) {}
static inline void init_sched_ext_class(void) {}
+static inline void scx_notify_sched_tick(void) {}
#define for_each_active_class for_each_class
#define for_balance_class_range for_class_range
@@ -688,6 +688,7 @@ struct cfs_rq {
#ifdef CONFIG_SCHED_CLASS_EXT
struct scx_rq {
struct scx_dispatch_q local_dsq;
+ struct list_head watchdog_list;
u64 ops_qseq;
u64 extra_enq_flags; /* see move_task_to_local_dsq() */
u32 nr_running;
@@ -25,6 +25,8 @@
char _license[] SEC("license") = "GPL";
const volatile u64 slice_ns = SCX_SLICE_DFL;
+const volatile u32 stall_user_nth;
+const volatile u32 stall_kernel_nth;
u32 test_error_cnt;
@@ -120,11 +122,20 @@ static int weight_to_idx(u32 weight)
void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
{
+ static u32 user_cnt, kernel_cnt;
struct task_ctx *tctx;
u32 pid = p->pid;
int idx = weight_to_idx(p->scx.weight);
void *ring;
+ if (p->flags & PF_KTHREAD) {
+ if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
+ return;
+ } else {
+ if (stall_user_nth && !(++user_cnt % stall_user_nth))
+ return;
+ }
+
if (test_error_cnt && !--test_error_cnt)
scx_bpf_error("test triggering error");
@@ -237,5 +248,6 @@ struct sched_ext_ops qmap_ops = {
.dispatch = (void *)qmap_dispatch,
.prep_enable = (void *)qmap_prep_enable,
.exit = (void *)qmap_exit,
+ .timeout_ms = 5000U,
.name = "qmap",
};
@@ -20,10 +20,12 @@ const char help_fmt[] =
"\n"
"See the top-level comment in .bpf.c for more details.\n"
"\n"
-"Usage: %s [-s SLICE_US] [-e COUNT]\n"
+"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT]\n"
"\n"
" -s SLICE_US Override slice duration\n"
" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n"
+" -t COUNT Stall every COUNT'th user thread\n"
+" -T COUNT Stall every COUNT'th kernel thread\n"
" -h Display this help and exit\n";
static volatile int exit_req;
@@ -47,7 +49,7 @@ int main(int argc, char **argv)
skel = scx_qmap__open();
assert(skel);
- while ((opt = getopt(argc, argv, "s:e:tTd:h")) != -1) {
+ while ((opt = getopt(argc, argv, "s:e:t:T:d:h")) != -1) {
switch (opt) {
case 's':
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@@ -55,6 +57,12 @@ int main(int argc, char **argv)
case 'e':
skel->bss->test_error_cnt = strtoul(optarg, NULL, 0);
break;
+ case 't':
+ skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0);
+ break;
+ case 'T':
+ skel->rodata->stall_kernel_nth = strtoul(optarg, NULL, 0);
+ break;
default:
fprintf(stderr, help_fmt, basename(argv[0]));
return opt != 'h';