[02/32] sched: Encapsulate task attribute change sequence into a helper macro

Message ID	20230317213333.2174969-3-tj@kernel.org (mailing list archive)
State	Not Applicable
Headers	show Return-Path: <bpf-owner@vger.kernel.org> Sender: Tejun Heo <htejun@gmail.com> From: Tejun Heo <tj@kernel.org> To: torvalds@linux-foundation.org, mingo@redhat.com, peterz@infradead.org, juri.lelli@redhat.com, vincent.guittot@linaro.org, dietmar.eggemann@arm.com, rostedt@goodmis.org, bsegall@google.com, mgorman@suse.de, bristot@redhat.com, vschneid@redhat.com, ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org, martin.lau@kernel.org, joshdon@google.com, brho@google.com, pjt@google.com, derkling@google.com, haoluo@google.com, dvernet@meta.com, dschatzberg@meta.com, dskarlat@cs.cmu.edu, riel@surriel.com Cc: linux-kernel@vger.kernel.org, bpf@vger.kernel.org, kernel-team@meta.com, Tejun Heo <tj@kernel.org> Subject: [PATCH 02/32] sched: Encapsulate task attribute change sequence into a helper macro Date: Fri, 17 Mar 2023 11:33:03 -1000 Message-Id: <20230317213333.2174969-3-tj@kernel.org> In-Reply-To: <20230317213333.2174969-1-tj@kernel.org> References: <20230317213333.2174969-1-tj@kernel.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Precedence: bulk
Series	[01/32] cgroup: Implement cgroup_show_cftypes() \| expand [01/32] cgroup: Implement cgroup_show_cftypes() [02/32] sched: Encapsulate task attribute change sequence into a helper macro [03/32] sched: Restructure sched_class order sanity checks in sched_init() [04/32] sched: Allow sched_cgroup_fork() to fail and introduce sched_cancel_fork() [05/32] sched: Add sched_class->reweight_task() [06/32] sched: Add sched_class->switching_to() and expose check_class_changing/changed() [07/32] sched: Factor out cgroup weight conversion functions [08/32] sched: Expose css_tg(), __setscheduler_prio() and SCHED_CHANGE_BLOCK() [09/32] sched: Enumerate CPU cgroup file types [10/32] sched: Add @reason to sched_class->rq_{on\|off}line() [11/32] sched: Add normal_policy() [12/32] sched_ext: Add boilerplate for extensible scheduler class [14/32] sched_ext: Add scx_example_simple and scx_example_qmap example schedulers [15/32] sched_ext: Add sysrq-S which disables the BPF scheduler [16/32] sched_ext: Implement runnable task stall watchdog [17/32] sched_ext: Allow BPF schedulers to disallow specific tasks from joining SCHED_EXT [18/32] sched_ext: Allow BPF schedulers to switch all eligible tasks into sched_ext [19/32] sched_ext: Implement scx_bpf_kick_cpu() and task preemption support [20/32] sched_ext: Make watchdog handle ops.dispatch() looping stall [21/32] sched_ext: Add task state tracking operations [22/32] sched_ext: Implement tickless support [23/32] sched_ext: Track tasks that are subjects of the in-flight SCX operation [24/32] sched_ext: Add cgroup support [25/32] sched_ext: Implement SCX_KICK_WAIT [26/32] sched_ext: Implement sched_ext_ops.cpu_acquire/release() [27/32] sched_ext: Implement sched_ext_ops.cpu_online/offline() [28/32] sched_ext: Implement core-sched support [29/32] sched_ext: Add vtime-ordered priority queue to dispatch_q's [30/32] sched_ext: Documentation: scheduler: Document extensible scheduler class [31/32] sched_ext: Add a basic, userland vruntime scheduler [32/32] sched_ext: Add a rust userspace hybrid example scheduler

diff --git a/kernel/sched/core.c b/kernel/sched/core.c index af017e038b48..fb080ca54d80 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2096,6 +2096,76 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dequeue_task(rq, p, flags); } +struct sched_change_guard { + struct task_struct *p; + struct rq *rq; + bool queued; + bool running; + bool done; +}; + +static struct sched_change_guard +sched_change_guard_init(struct rq *rq, struct task_struct *p, int flags) +{ + struct sched_change_guard cg = { + .rq = rq, + .p = p, + .queued = task_on_rq_queued(p), + .running = task_current(rq, p), + }; + + if (cg.queued) { + /* + * __kthread_bind() may call this on blocked tasks without + * holding rq->lock through __do_set_cpus_allowed(). Assert @rq + * locked iff @p is queued. + */ + lockdep_assert_rq_held(rq); + dequeue_task(rq, p, flags); + } + if (cg.running) + put_prev_task(rq, p); + + return cg; +} + +static void sched_change_guard_fini(struct sched_change_guard *cg, int flags) +{ + if (cg->queued) + enqueue_task(cg->rq, cg->p, flags | ENQUEUE_NOCLOCK); + if (cg->running) + set_next_task(cg->rq, cg->p); + cg->done = true; +} + +/** + * SCHED_CHANGE_BLOCK - Nested block for task attribute updates + * @__rq: Runqueue the target task belongs to + * @__p: Target task + * @__flags: DEQUEUE/ENQUEUE_* flags + * + * A task may need to be dequeued and put_prev_task'd for attribute updates and + * set_next_task'd and re-enqueued afterwards. This helper defines a nested + * block which automatically handles these preparation and cleanup operations. + * + * SCHED_CHANGE_BLOCK(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK) { + * update_attribute(p); + * ... + * } + * + * If @__flags is a variable, the variable may be updated in the block body and + * the updated value will be used when re-enqueueing @p. + * + * If %DEQUEUE_NOCLOCK is specified, the caller is responsible for calling + * update_rq_clock() beforehand. Otherwise, the rq clock is automatically + * updated iff the task needs to be dequeued and re-enqueued. Only the former + * case guarantees that the rq clock is up-to-date inside and after the block. + */ +#define SCHED_CHANGE_BLOCK(__rq, __p, __flags) \ + for (struct sched_change_guard __cg = \ + sched_change_guard_init(__rq, __p, __flags); \ + !__cg.done; sched_change_guard_fini(&__cg, __flags)) + static inline int __normal_prio(int policy, int rt_prio, int nice) { int prio; @@ -2554,7 +2624,6 @@ static void __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) { struct rq *rq = task_rq(p); - bool queued, running; /* * This here violates the locking rules for affinity, since we're only @@ -2573,26 +2642,9 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) else lockdep_assert_held(&p->pi_lock); - queued = task_on_rq_queued(p); - running = task_current(rq, p); - - if (queued) { - /* - * Because __kthread_bind() calls this on blocked tasks without - * holding rq->lock. - */ - lockdep_assert_rq_held(rq); - dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); + SCHED_CHANGE_BLOCK(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK) { + p->sched_class->set_cpus_allowed(p, ctx); } - if (running) - put_prev_task(rq, p); - - p->sched_class->set_cpus_allowed(p, ctx); - - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); } /* @@ -6989,7 +7041,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio) */ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) { - int prio, oldprio, queued, running, queue_flag = + int prio, oldprio, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; const struct sched_class *prev_class; struct rq_flags rf; @@ -7049,49 +7101,39 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) queue_flag &= ~DEQUEUE_MOVE; prev_class = p->sched_class; - queued = task_on_rq_queued(p); - running = task_current(rq, p); - if (queued) - dequeue_task(rq, p, queue_flag); - if (running) - put_prev_task(rq, p); - - /* - * Boosting condition are: - * 1. -rt task is running and holds mutex A - * --> -dl task blocks on mutex A - * - * 2. -dl task is running and holds mutex A - * --> -dl task blocks on mutex A and could preempt the - * running task - */ - if (dl_prio(prio)) { - if (!dl_prio(p->normal_prio) || - (pi_task && dl_prio(pi_task->prio) && - dl_entity_preempt(&pi_task->dl, &p->dl))) { - p->dl.pi_se = pi_task->dl.pi_se; - queue_flag |= ENQUEUE_REPLENISH; + SCHED_CHANGE_BLOCK(rq, p, queue_flag) { + /* + * Boosting condition are: + * 1. -rt task is running and holds mutex A + * --> -dl task blocks on mutex A + * + * 2. -dl task is running and holds mutex A + * --> -dl task blocks on mutex A and could preempt the + * running task + */ + if (dl_prio(prio)) { + if (!dl_prio(p->normal_prio) || + (pi_task && dl_prio(pi_task->prio) && + dl_entity_preempt(&pi_task->dl, &p->dl))) { + p->dl.pi_se = pi_task->dl.pi_se; + queue_flag |= ENQUEUE_REPLENISH; + } else { + p->dl.pi_se = &p->dl; + } + } else if (rt_prio(prio)) { + if (dl_prio(oldprio)) + p->dl.pi_se = &p->dl; + if (oldprio < prio) + queue_flag |= ENQUEUE_HEAD; } else { - p->dl.pi_se = &p->dl; + if (dl_prio(oldprio)) + p->dl.pi_se = &p->dl; + if (rt_prio(oldprio)) + p->rt.timeout = 0; } - } else if (rt_prio(prio)) { - if (dl_prio(oldprio)) - p->dl.pi_se = &p->dl; - if (oldprio < prio) - queue_flag |= ENQUEUE_HEAD; - } else { - if (dl_prio(oldprio)) - p->dl.pi_se = &p->dl; - if (rt_prio(oldprio)) - p->rt.timeout = 0; - } - - __setscheduler_prio(p, prio); - if (queued) - enqueue_task(rq, p, queue_flag); - if (running) - set_next_task(rq, p); + __setscheduler_prio(p, prio); + } check_class_changed(rq, p, prev_class, oldprio); out_unlock: @@ -7113,7 +7155,6 @@ static inline int rt_effective_prio(struct task_struct *p, int prio) void set_user_nice(struct task_struct *p, long nice) { - bool queued, running; int old_prio; struct rq_flags rf; struct rq *rq; @@ -7137,22 +7178,13 @@ void set_user_nice(struct task_struct *p, long nice) p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } - queued = task_on_rq_queued(p); - running = task_current(rq, p); - if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); - if (running) - put_prev_task(rq, p); - - p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p, true); - old_prio = p->prio; - p->prio = effective_prio(p); - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); + SCHED_CHANGE_BLOCK(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK) { + p->static_prio = NICE_TO_PRIO(nice); + set_load_weight(p, true); + old_prio = p->prio; + p->prio = effective_prio(p); + } /* * If the task increased its priority or is running and @@ -7536,7 +7568,7 @@ static int __sched_setscheduler(struct task_struct *p, bool user, bool pi) { int oldpolicy = -1, policy = attr->sched_policy; - int retval, oldprio, newprio, queued, running; + int retval, oldprio, newprio; const struct sched_class *prev_class; struct balance_callback *head; struct rq_flags rf; @@ -7701,33 +7733,22 @@ static int __sched_setscheduler(struct task_struct *p, queue_flags &= ~DEQUEUE_MOVE; } - queued = task_on_rq_queued(p); - running = task_current(rq, p); - if (queued) - dequeue_task(rq, p, queue_flags); - if (running) - put_prev_task(rq, p); - - prev_class = p->sched_class; + SCHED_CHANGE_BLOCK(rq, p, queue_flags) { + prev_class = p->sched_class; - if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { - __setscheduler_params(p, attr); - __setscheduler_prio(p, newprio); - } - __setscheduler_uclamp(p, attr); + if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { + __setscheduler_params(p, attr); + __setscheduler_prio(p, newprio); + } + __setscheduler_uclamp(p, attr); - if (queued) { /* * We enqueue to tail when the priority of a task is * increased (user space view). */ if (oldprio < p->prio) queue_flags |= ENQUEUE_HEAD; - - enqueue_task(rq, p, queue_flags); } - if (running) - set_next_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); @@ -9250,25 +9271,15 @@ int migrate_task_to(struct task_struct *p, int target_cpu) */ void sched_setnuma(struct task_struct *p, int nid) { - bool queued, running; struct rq_flags rf; struct rq *rq; rq = task_rq_lock(p, &rf); - queued = task_on_rq_queued(p); - running = task_current(rq, p); - if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); - if (running) - put_prev_task(rq, p); - - p->numa_preferred_nid = nid; + SCHED_CHANGE_BLOCK(rq, p, DEQUEUE_SAVE) { + p->numa_preferred_nid = nid; + } - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); task_rq_unlock(rq, p, &rf); } #endif /* CONFIG_NUMA_BALANCING */ @@ -10360,35 +10371,24 @@ static void sched_change_group(struct task_struct *tsk) */ void sched_move_task(struct task_struct *tsk) { - int queued, running, queue_flags = - DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; struct rq_flags rf; struct rq *rq; rq = task_rq_lock(tsk, &rf); update_rq_clock(rq); - running = task_current(rq, tsk); - queued = task_on_rq_queued(tsk); - - if (queued) - dequeue_task(rq, tsk, queue_flags); - if (running) - put_prev_task(rq, tsk); - - sched_change_group(tsk); + SCHED_CHANGE_BLOCK(rq, tsk, + DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) { + sched_change_group(tsk); + } - if (queued) - enqueue_task(rq, tsk, queue_flags); - if (running) { - set_next_task(rq, tsk); - /* - * After changing group, the running task may have joined a - * throttled one but it's still the running task. Trigger a - * resched to make sure that task can still run. - */ + /* + * After changing group, the running task may have joined a throttled + * one but it's still the running task. Trigger a resched to make sure + * that task can still run. + */ + if (task_current(rq, tsk)) resched_curr(rq); - } task_rq_unlock(rq, tsk, &rf); }

[02/32] sched: Encapsulate task attribute change sequence into a helper macro

Commit Message

Patch