[sched_ext/for-6.15,v3,4/5] sched_ext: Removed mask-based runtime restrictions on calling kfuncs in different contexts

Message ID AM6PR03MB5080BDE038C8E8E89996F30E99C22@AM6PR03MB5080.eurprd03.prod.outlook.com (mailing list archive)
State Not Applicable
Series bpf, sched_ext: Make kfunc filters support struct_ops context to reduce runtime overhead

Checks

Context                 Check     Description
netdev/tree_selection   success   Guessing tree name failed - patch did not apply, async

Commit Message

Juntong Deng Feb. 26, 2025, 7:28 p.m. UTC
Currently, kfunc filters already support filtering based on struct_ops
context information.

The BPF verifier can check context-sensitive kfuncs before the SCX
program is run, avoiding runtime overhead.

Therefore we no longer need mask-based runtime restrictions.

This patch removes the mask-based runtime restrictions.

Signed-off-by: Juntong Deng <juntong.deng@outlook.com>
---
 include/linux/sched/ext.h |  24 ----
 kernel/sched/ext.c        | 227 ++++++++------------------------------
 kernel/sched/ext_idle.c   |   5 +-
 3 files changed, 50 insertions(+), 206 deletions(-)
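
For context, the filter mechanism the commit message relies on hangs a filter callback off a BTF kfunc ID set, so the verifier can hide context-sensitive kfuncs from programs attached to the wrong struct_ops member. Below is a minimal sketch of the idea only, not code from this series: scx_prog_attached_member() is a hypothetical stand-in for however the series exposes the attachment context to the filter, and the non-zero-return-means-hidden semantics is assumed from the generic btf_kfunc_id_set filter interface.

/*
 * Illustrative sketch: a kfunc filter keyed on the struct_ops member the
 * program attaches to. A non-zero return hides the kfuncs in this set from
 * the program being verified.
 */
static int scx_kfunc_filter_dispatch(const struct bpf_prog *prog, u32 kfunc_id)
{
	/*
	 * scx_prog_attached_member() is a hypothetical helper standing in
	 * for however the series exposes the attached struct_ops member.
	 * Dispatch-path kfuncs stay visible only to ops.dispatch() programs.
	 */
	if (scx_prog_attached_member(prog) == offsetof(struct sched_ext_ops, dispatch))
		return 0;
	return -EACCES;
}

BTF_KFUNCS_START(scx_kfunc_ids_dispatch_example)
BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots)
BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel)
BTF_KFUNCS_END(scx_kfunc_ids_dispatch_example)

static const struct btf_kfunc_id_set scx_kfunc_set_dispatch_example = {
	.owner	= THIS_MODULE,
	.set	= &scx_kfunc_ids_dispatch_example,
	.filter	= scx_kfunc_filter_dispatch,
};

/*
 * Registered once at init, e.g.:
 *	register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
 *				  &scx_kfunc_set_dispatch_example);
 */

With the check done at verification time, a kfunc called from the wrong operation is rejected when the program is loaded, which is what allows the per-call scx_kf_allowed() checks removed below.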

Comments

Andrea Righi Feb. 27, 2025, 4:24 p.m. UTC | #1
On Wed, Feb 26, 2025 at 07:28:19PM +0000, Juntong Deng wrote:
> Currently, kfunc filters already support filtering based on struct_ops
> context information.
> 
> The BPF verifier can check context-sensitive kfuncs before the SCX
> program is run, avoiding runtime overhead.
> 
> Therefore we no longer need mask-based runtime restrictions.
> 
> This patch removes the mask-based runtime restrictions.

You may have missed scx_prio_less(), that is still using SCX_KF_REST:

kernel/sched/ext.c: In function ‘scx_prio_less’:
kernel/sched/ext.c:1171:27: error: ‘struct sched_ext_ops’ has no member named ‘SCX_KF_REST’
 1171 |         __typeof__(scx_ops.op(task0, task1, ##args)) __ret;                     \
      |                           ^

Since the macros no longer take a mask argument, the stale SCX_KF_REST ends up
expanded as the op name, hence the error above. I think you just need to add:

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 4a4713c3af67b..51c13b8c27743 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3302,7 +3302,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
 	 * verifier.
 	 */
 	if (SCX_HAS_OP(core_sched_before) && !scx_rq_bypassing(task_rq(a)))
-		return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before,
+		return SCX_CALL_OP_2TASKS_RET(core_sched_before,
 					      (struct task_struct *)a,
 					      (struct task_struct *)b);
 	else

-Andrea

Juntong Deng Feb. 27, 2025, 4:49 p.m. UTC | #2
On 2025/2/27 16:24, Andrea Righi wrote:
> On Wed, Feb 26, 2025 at 07:28:19PM +0000, Juntong Deng wrote:
>> Currently, kfunc filters already support filtering based on struct_ops
>> context information.
>>
>> The BPF verifier can check context-sensitive kfuncs before the SCX
>> program is run, avoiding runtime overhead.
>>
>> Therefore we no longer need mask-based runtime restrictions.
>>
>> This patch removes the mask-based runtime restrictions.
> 
> You may have missed scx_prio_less(), that is still using SCX_KF_REST:
> 
> kernel/sched/ext.c: In function ‘scx_prio_less’:
> kernel/sched/ext.c:1171:27: error: ‘struct sched_ext_ops’ has no member named ‘SCX_KF_REST’
>   1171 |         __typeof__(scx_ops.op(task0, task1, ##args)) __ret;                     \
>        |                           ^
> 
> I think you just need to add:
> 
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 4a4713c3af67b..51c13b8c27743 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -3302,7 +3302,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
>   	 * verifier.
>   	 */
>   	if (SCX_HAS_OP(core_sched_before) && !scx_rq_bypassing(task_rq(a)))
> -		return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before,
> +		return SCX_CALL_OP_2TASKS_RET(core_sched_before,
>   					      (struct task_struct *)a,
>   					      (struct task_struct *)b);
>   	else
> 
> -Andrea
> 

Thanks for pointing this out.

I made the mistake of not enabling CONFIG_SCHED_CORE, so I did not notice
this issue.

If you find any other issues, please let me know.

I will fix these issues in the next version.

Patch

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index f7545430a548..9980d6b55c84 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -96,29 +96,6 @@  enum scx_ent_dsq_flags {
 	SCX_TASK_DSQ_ON_PRIQ	= 1 << 0, /* task is queued on the priority queue of a dsq */
 };
 
-/*
- * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from
- * everywhere and the following bits track which kfunc sets are currently
- * allowed for %current. This simple per-task tracking works because SCX ops
- * nest in a limited way. BPF will likely implement a way to allow and disallow
- * kfuncs depending on the calling context which will replace this manual
- * mechanism. See scx_kf_allow().
- */
-enum scx_kf_mask {
-	SCX_KF_UNLOCKED		= 0,	  /* sleepable and not rq locked */
-	/* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */
-	SCX_KF_CPU_RELEASE	= 1 << 0, /* ops.cpu_release() */
-	/* ops.dequeue (in REST) may be nested inside DISPATCH */
-	SCX_KF_DISPATCH		= 1 << 1, /* ops.dispatch() */
-	SCX_KF_ENQUEUE		= 1 << 2, /* ops.enqueue() and ops.select_cpu() */
-	SCX_KF_SELECT_CPU	= 1 << 3, /* ops.select_cpu() */
-	SCX_KF_REST		= 1 << 4, /* other rq-locked operations */
-
-	__SCX_KF_RQ_LOCKED	= SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH |
-				  SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
-	__SCX_KF_TERMINAL	= SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
-};
-
 enum scx_dsq_lnode_flags {
 	SCX_DSQ_LNODE_ITER_CURSOR = 1 << 0,
 
@@ -147,7 +124,6 @@  struct sched_ext_entity {
 	s32			sticky_cpu;
 	s32			holding_cpu;
 	s32			selected_cpu;
-	u32			kf_mask;	/* see scx_kf_mask above */
 	struct task_struct	*kf_tasks[2];	/* see SCX_CALL_OP_TASK() */
 	atomic_long_t		ops_state;
 
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index c337f6206ae5..7dc5f11be66b 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1115,19 +1115,6 @@  static long jiffies_delta_msecs(unsigned long at, unsigned long now)
 		return -(long)jiffies_to_msecs(now - at);
 }
 
-/* if the highest set bit is N, return a mask with bits [N+1, 31] set */
-static u32 higher_bits(u32 flags)
-{
-	return ~((1 << fls(flags)) - 1);
-}
-
-/* return the mask with only the highest bit set */
-static u32 highest_bit(u32 flags)
-{
-	int bit = fls(flags);
-	return ((u64)1 << bit) >> 1;
-}
-
 static bool u32_before(u32 a, u32 b)
 {
 	return (s32)(a - b) < 0;
@@ -1143,51 +1130,12 @@  static struct scx_dispatch_q *find_user_dsq(u64 dsq_id)
 	return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params);
 }
 
-/*
- * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX
- * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate
- * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check
- * whether it's running from an allowed context.
- *
- * @mask is constant, always inline to cull the mask calculations.
- */
-static __always_inline void scx_kf_allow(u32 mask)
-{
-	/* nesting is allowed only in increasing scx_kf_mask order */
-	WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask,
-		  "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n",
-		  current->scx.kf_mask, mask);
-	current->scx.kf_mask |= mask;
-	barrier();
-}
-
-static void scx_kf_disallow(u32 mask)
-{
-	barrier();
-	current->scx.kf_mask &= ~mask;
-}
-
-#define SCX_CALL_OP(mask, op, args...)						\
-do {										\
-	if (mask) {								\
-		scx_kf_allow(mask);						\
-		scx_ops.op(args);						\
-		scx_kf_disallow(mask);						\
-	} else {								\
-		scx_ops.op(args);						\
-	}									\
-} while (0)
+#define SCX_CALL_OP(op, args...)	scx_ops.op(args)
 
-#define SCX_CALL_OP_RET(mask, op, args...)					\
+#define SCX_CALL_OP_RET(op, args...)						\
 ({										\
 	__typeof__(scx_ops.op(args)) __ret;					\
-	if (mask) {								\
-		scx_kf_allow(mask);						\
-		__ret = scx_ops.op(args);					\
-		scx_kf_disallow(mask);						\
-	} else {								\
-		__ret = scx_ops.op(args);					\
-	}									\
+	__ret = scx_ops.op(args);						\
 	__ret;									\
 })
 
@@ -1202,74 +1150,36 @@  do {										\
  * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on
  * the specific task.
  */
-#define SCX_CALL_OP_TASK(mask, op, task, args...)				\
+#define SCX_CALL_OP_TASK(op, task, args...)					\
 do {										\
-	BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL);				\
 	current->scx.kf_tasks[0] = task;					\
-	SCX_CALL_OP(mask, op, task, ##args);					\
+	SCX_CALL_OP(op, task, ##args);						\
 	current->scx.kf_tasks[0] = NULL;					\
 } while (0)
 
-#define SCX_CALL_OP_TASK_RET(mask, op, task, args...)				\
+#define SCX_CALL_OP_TASK_RET(op, task, args...)					\
 ({										\
 	__typeof__(scx_ops.op(task, ##args)) __ret;				\
-	BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL);				\
 	current->scx.kf_tasks[0] = task;					\
-	__ret = SCX_CALL_OP_RET(mask, op, task, ##args);			\
+	__ret = SCX_CALL_OP_RET(op, task, ##args);				\
 	current->scx.kf_tasks[0] = NULL;					\
 	__ret;									\
 })
 
-#define SCX_CALL_OP_2TASKS_RET(mask, op, task0, task1, args...)			\
+#define SCX_CALL_OP_2TASKS_RET(op, task0, task1, args...)			\
 ({										\
 	__typeof__(scx_ops.op(task0, task1, ##args)) __ret;			\
-	BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL);				\
 	current->scx.kf_tasks[0] = task0;					\
 	current->scx.kf_tasks[1] = task1;					\
-	__ret = SCX_CALL_OP_RET(mask, op, task0, task1, ##args);		\
+	__ret = SCX_CALL_OP_RET(op, task0, task1, ##args);			\
 	current->scx.kf_tasks[0] = NULL;					\
 	current->scx.kf_tasks[1] = NULL;					\
 	__ret;									\
 })
 
-/* @mask is constant, always inline to cull unnecessary branches */
-static __always_inline bool scx_kf_allowed(u32 mask)
-{
-	if (unlikely(!(current->scx.kf_mask & mask))) {
-		scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x",
-			      mask, current->scx.kf_mask);
-		return false;
-	}
-
-	/*
-	 * Enforce nesting boundaries. e.g. A kfunc which can be called from
-	 * DISPATCH must not be called if we're running DEQUEUE which is nested
-	 * inside ops.dispatch(). We don't need to check boundaries for any
-	 * blocking kfuncs as the verifier ensures they're only called from
-	 * sleepable progs.
-	 */
-	if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE &&
-		     (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) {
-		scx_ops_error("cpu_release kfunc called from a nested operation");
-		return false;
-	}
-
-	if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH &&
-		     (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) {
-		scx_ops_error("dispatch kfunc called from a nested operation");
-		return false;
-	}
-
-	return true;
-}
-
 /* see SCX_CALL_OP_TASK() */
-static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask,
-							struct task_struct *p)
+static __always_inline bool scx_kf_allowed_on_arg_tasks(struct task_struct *p)
 {
-	if (!scx_kf_allowed(mask))
-		return false;
-
 	if (unlikely((p != current->scx.kf_tasks[0] &&
 		      p != current->scx.kf_tasks[1]))) {
 		scx_ops_error("called on a task not being operated on");
@@ -1279,11 +1189,6 @@  static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask,
 	return true;
 }
 
-static bool scx_kf_allowed_if_unlocked(void)
-{
-	return !current->scx.kf_mask;
-}
-
 /**
  * nldsq_next_task - Iterate to the next task in a non-local DSQ
  * @dsq: user dsq being iterated
@@ -2219,7 +2124,7 @@  static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 	WARN_ON_ONCE(*ddsp_taskp);
 	*ddsp_taskp = p;
 
-	SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags);
+	SCX_CALL_OP_TASK(enqueue, p, enq_flags);
 
 	*ddsp_taskp = NULL;
 	if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
@@ -2316,7 +2221,7 @@  static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
 	add_nr_running(rq, 1);
 
 	if (SCX_HAS_OP(runnable) && !task_on_rq_migrating(p))
-		SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags);
+		SCX_CALL_OP_TASK(runnable, p, enq_flags);
 
 	if (enq_flags & SCX_ENQ_WAKEUP)
 		touch_core_sched(rq, p);
@@ -2351,7 +2256,7 @@  static void ops_dequeue(struct task_struct *p, u64 deq_flags)
 		BUG();
 	case SCX_OPSS_QUEUED:
 		if (SCX_HAS_OP(dequeue))
-			SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, p, deq_flags);
+			SCX_CALL_OP_TASK(dequeue, p, deq_flags);
 
 		if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
 					    SCX_OPSS_NONE))
@@ -2400,11 +2305,11 @@  static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
 	 */
 	if (SCX_HAS_OP(stopping) && task_current(rq, p)) {
 		update_curr_scx(rq);
-		SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false);
+		SCX_CALL_OP_TASK(stopping, p, false);
 	}
 
 	if (SCX_HAS_OP(quiescent) && !task_on_rq_migrating(p))
-		SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags);
+		SCX_CALL_OP_TASK(quiescent, p, deq_flags);
 
 	if (deq_flags & SCX_DEQ_SLEEP)
 		p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP;
@@ -2424,7 +2329,7 @@  static void yield_task_scx(struct rq *rq)
 	struct task_struct *p = rq->curr;
 
 	if (SCX_HAS_OP(yield))
-		SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, p, NULL);
+		SCX_CALL_OP_2TASKS_RET(yield, p, NULL);
 	else
 		p->scx.slice = 0;
 }
@@ -2434,7 +2339,7 @@  static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
 	struct task_struct *from = rq->curr;
 
 	if (SCX_HAS_OP(yield))
-		return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, from, to);
+		return SCX_CALL_OP_2TASKS_RET(yield, from, to);
 	else
 		return false;
 }
@@ -2992,7 +2897,7 @@  static int balance_one(struct rq *rq, struct task_struct *prev)
 		 * emitted in switch_class().
 		 */
 		if (SCX_HAS_OP(cpu_acquire))
-			SCX_CALL_OP(SCX_KF_REST, cpu_acquire, cpu_of(rq), NULL);
+			SCX_CALL_OP(cpu_acquire, cpu_of(rq), NULL);
 		rq->scx.cpu_released = false;
 	}
 
@@ -3037,8 +2942,7 @@  static int balance_one(struct rq *rq, struct task_struct *prev)
 	do {
 		dspc->nr_tasks = 0;
 
-		SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq),
-			    prev_on_scx ? prev : NULL);
+		SCX_CALL_OP(dispatch, cpu_of(rq), prev_on_scx ? prev : NULL);
 
 		flush_dispatch_buf(rq);
 
@@ -3159,7 +3063,7 @@  static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
 
 	/* see dequeue_task_scx() on why we skip when !QUEUED */
 	if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED))
-		SCX_CALL_OP_TASK(SCX_KF_REST, running, p);
+		SCX_CALL_OP_TASK(running, p);
 
 	clr_task_runnable(p, true);
 
@@ -3240,8 +3144,7 @@  static void switch_class(struct rq *rq, struct task_struct *next)
 				.task = next,
 			};
 
-			SCX_CALL_OP(SCX_KF_CPU_RELEASE,
-				    cpu_release, cpu_of(rq), &args);
+			SCX_CALL_OP(cpu_release, cpu_of(rq), &args);
 		}
 		rq->scx.cpu_released = true;
 	}
@@ -3254,7 +3157,7 @@  static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
 
 	/* see dequeue_task_scx() on why we skip when !QUEUED */
 	if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED))
-		SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true);
+		SCX_CALL_OP_TASK(stopping, p, true);
 
 	if (p->scx.flags & SCX_TASK_QUEUED) {
 		set_task_runnable(rq, p);
@@ -3428,8 +3331,7 @@  static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
 		WARN_ON_ONCE(*ddsp_taskp);
 		*ddsp_taskp = p;
 
-		cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
-					   select_cpu, p, prev_cpu, wake_flags);
+		cpu = SCX_CALL_OP_TASK_RET(select_cpu, p, prev_cpu, wake_flags);
 		p->scx.selected_cpu = cpu;
 		*ddsp_taskp = NULL;
 		if (ops_cpu_valid(cpu, "from ops.select_cpu()"))
@@ -3473,8 +3375,7 @@  static void set_cpus_allowed_scx(struct task_struct *p,
 	 * designation pointless. Cast it away when calling the operation.
 	 */
 	if (SCX_HAS_OP(set_cpumask))
-		SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p,
-				 (struct cpumask *)p->cpus_ptr);
+		SCX_CALL_OP_TASK(set_cpumask, p, (struct cpumask *)p->cpus_ptr);
 }
 
 static void handle_hotplug(struct rq *rq, bool online)
@@ -3487,9 +3388,9 @@  static void handle_hotplug(struct rq *rq, bool online)
 		scx_idle_update_selcpu_topology(&scx_ops);
 
 	if (online && SCX_HAS_OP(cpu_online))
-		SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu);
+		SCX_CALL_OP(cpu_online, cpu);
 	else if (!online && SCX_HAS_OP(cpu_offline))
-		SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_offline, cpu);
+		SCX_CALL_OP(cpu_offline, cpu);
 	else
 		scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
 			     "cpu %d going %s, exiting scheduler", cpu,
@@ -3593,7 +3494,7 @@  static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
 		curr->scx.slice = 0;
 		touch_core_sched(rq, curr);
 	} else if (SCX_HAS_OP(tick)) {
-		SCX_CALL_OP(SCX_KF_REST, tick, curr);
+		SCX_CALL_OP(tick, curr);
 	}
 
 	if (!curr->scx.slice)
@@ -3670,7 +3571,7 @@  static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool
 			.fork = fork,
 		};
 
-		ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init_task, p, &args);
+		ret = SCX_CALL_OP_RET(init_task, p, &args);
 		if (unlikely(ret)) {
 			ret = ops_sanitize_err("init_task", ret);
 			return ret;
@@ -3727,11 +3628,11 @@  static void scx_ops_enable_task(struct task_struct *p)
 	p->scx.weight = sched_weight_to_cgroup(weight);
 
 	if (SCX_HAS_OP(enable))
-		SCX_CALL_OP_TASK(SCX_KF_REST, enable, p);
+		SCX_CALL_OP_TASK(enable, p);
 	scx_set_task_state(p, SCX_TASK_ENABLED);
 
 	if (SCX_HAS_OP(set_weight))
-		SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight);
+		SCX_CALL_OP_TASK(set_weight, p, p->scx.weight);
 }
 
 static void scx_ops_disable_task(struct task_struct *p)
@@ -3740,7 +3641,7 @@  static void scx_ops_disable_task(struct task_struct *p)
 	WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);
 
 	if (SCX_HAS_OP(disable))
-		SCX_CALL_OP(SCX_KF_REST, disable, p);
+		SCX_CALL_OP(disable, p);
 	scx_set_task_state(p, SCX_TASK_READY);
 }
 
@@ -3769,7 +3670,7 @@  static void scx_ops_exit_task(struct task_struct *p)
 	}
 
 	if (SCX_HAS_OP(exit_task))
-		SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args);
+		SCX_CALL_OP(exit_task, p, &args);
 	scx_set_task_state(p, SCX_TASK_NONE);
 }
 
@@ -3878,7 +3779,7 @@  static void reweight_task_scx(struct rq *rq, struct task_struct *p,
 
 	p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
 	if (SCX_HAS_OP(set_weight))
-		SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight);
+		SCX_CALL_OP_TASK(set_weight, p, p->scx.weight);
 }
 
 static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio)
@@ -3894,8 +3795,7 @@  static void switching_to_scx(struct rq *rq, struct task_struct *p)
 	 * different scheduler class. Keep the BPF scheduler up-to-date.
 	 */
 	if (SCX_HAS_OP(set_cpumask))
-		SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p,
-				 (struct cpumask *)p->cpus_ptr);
+		SCX_CALL_OP_TASK(set_cpumask, p, (struct cpumask *)p->cpus_ptr);
 }
 
 static void switched_from_scx(struct rq *rq, struct task_struct *p)
@@ -3987,8 +3887,7 @@  int scx_tg_online(struct task_group *tg)
 			struct scx_cgroup_init_args args =
 				{ .weight = tg->scx_weight };
 
-			ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init,
-					      tg->css.cgroup, &args);
+			ret = SCX_CALL_OP_RET(cgroup_init, tg->css.cgroup, &args);
 			if (ret)
 				ret = ops_sanitize_err("cgroup_init", ret);
 		}
@@ -4009,7 +3908,7 @@  void scx_tg_offline(struct task_group *tg)
 	percpu_down_read(&scx_cgroup_rwsem);
 
 	if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED))
-		SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, tg->css.cgroup);
+		SCX_CALL_OP(cgroup_exit, tg->css.cgroup);
 	tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
 
 	percpu_up_read(&scx_cgroup_rwsem);
@@ -4042,8 +3941,7 @@  int scx_cgroup_can_attach(struct cgroup_taskset *tset)
 			continue;
 
 		if (SCX_HAS_OP(cgroup_prep_move)) {
-			ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move,
-					      p, from, css->cgroup);
+			ret = SCX_CALL_OP_RET(cgroup_prep_move, p, from, css->cgroup);
 			if (ret)
 				goto err;
 		}
@@ -4056,8 +3954,7 @@  int scx_cgroup_can_attach(struct cgroup_taskset *tset)
 err:
 	cgroup_taskset_for_each(p, css, tset) {
 		if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
-			SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p,
-				    p->scx.cgrp_moving_from, css->cgroup);
+			SCX_CALL_OP(cgroup_cancel_move, p, p->scx.cgrp_moving_from, css->cgroup);
 		p->scx.cgrp_moving_from = NULL;
 	}
 
@@ -4075,8 +3972,7 @@  void scx_cgroup_move_task(struct task_struct *p)
 	 * cgrp_moving_from set.
 	 */
 	if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
-		SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p,
-			p->scx.cgrp_moving_from, tg_cgrp(task_group(p)));
+		SCX_CALL_OP_TASK(cgroup_move, p, p->scx.cgrp_moving_from, tg_cgrp(task_group(p)));
 	p->scx.cgrp_moving_from = NULL;
 }
 
@@ -4095,8 +3991,7 @@  void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
 
 	cgroup_taskset_for_each(p, css, tset) {
 		if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from)
-			SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p,
-				    p->scx.cgrp_moving_from, css->cgroup);
+			SCX_CALL_OP(cgroup_cancel_move, p, p->scx.cgrp_moving_from, css->cgroup);
 		p->scx.cgrp_moving_from = NULL;
 	}
 out_unlock:
@@ -4109,8 +4004,7 @@  void scx_group_set_weight(struct task_group *tg, unsigned long weight)
 
 	if (scx_cgroup_enabled && tg->scx_weight != weight) {
 		if (SCX_HAS_OP(cgroup_set_weight))
-			SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight,
-				    tg_cgrp(tg), weight);
+			SCX_CALL_OP(cgroup_set_weight, tg_cgrp(tg), weight);
 		tg->scx_weight = weight;
 	}
 
@@ -4300,7 +4194,7 @@  static void scx_cgroup_exit(void)
 			continue;
 		rcu_read_unlock();
 
-		SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup);
+		SCX_CALL_OP(cgroup_exit, css->cgroup);
 
 		rcu_read_lock();
 		css_put(css);
@@ -4343,8 +4237,7 @@  static int scx_cgroup_init(void)
 			continue;
 		rcu_read_unlock();
 
-		ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init,
-				      css->cgroup, &args);
+		ret = SCX_CALL_OP_RET(cgroup_init, css->cgroup, &args);
 		if (ret) {
 			css_put(css);
 			scx_ops_error("ops.cgroup_init() failed (%d)", ret);
@@ -4840,7 +4733,7 @@  static void scx_ops_disable_workfn(struct kthread_work *work)
 	}
 
 	if (scx_ops.exit)
-		SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei);
+		SCX_CALL_OP(exit, ei);
 
 	cancel_delayed_work_sync(&scx_watchdog_work);
 
@@ -5047,7 +4940,7 @@  static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
 
 	if (SCX_HAS_OP(dump_task)) {
 		ops_dump_init(s, "    ");
-		SCX_CALL_OP(SCX_KF_REST, dump_task, dctx, p);
+		SCX_CALL_OP(dump_task, dctx, p);
 		ops_dump_exit();
 	}
 
@@ -5094,7 +4987,7 @@  static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
 
 	if (SCX_HAS_OP(dump)) {
 		ops_dump_init(&s, "");
-		SCX_CALL_OP(SCX_KF_UNLOCKED, dump, &dctx);
+		SCX_CALL_OP(dump, &dctx);
 		ops_dump_exit();
 	}
 
@@ -5151,7 +5044,7 @@  static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
 		used = seq_buf_used(&ns);
 		if (SCX_HAS_OP(dump_cpu)) {
 			ops_dump_init(&ns, "  ");
-			SCX_CALL_OP(SCX_KF_REST, dump_cpu, &dctx, cpu, idle);
+			SCX_CALL_OP(dump_cpu, &dctx, cpu, idle);
 			ops_dump_exit();
 		}
 
@@ -5405,7 +5298,7 @@  static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	cpus_read_lock();
 
 	if (scx_ops.init) {
-		ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init);
+		ret = SCX_CALL_OP_RET(init);
 		if (ret) {
 			ret = ops_sanitize_err("init", ret);
 			cpus_read_unlock();
@@ -6146,9 +6039,6 @@  void __init init_sched_ext_class(void)
  */
 static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags)
 {
-	if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
-		return false;
-
 	lockdep_assert_irqs_disabled();
 
 	if (unlikely(!p)) {
@@ -6310,9 +6200,6 @@  static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
 	bool in_balance;
 	unsigned long flags;
 
-	if (!scx_kf_allowed_if_unlocked() && !scx_kf_allowed(SCX_KF_DISPATCH))
-		return false;
-
 	/*
 	 * Can be called from either ops.dispatch() locking this_rq() or any
 	 * context where no rq lock is held. If latter, lock @p's task_rq which
@@ -6395,9 +6282,6 @@  __bpf_kfunc_start_defs();
  */
 __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void)
 {
-	if (!scx_kf_allowed(SCX_KF_DISPATCH))
-		return 0;
-
 	return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor);
 }
 
@@ -6411,9 +6295,6 @@  __bpf_kfunc void scx_bpf_dispatch_cancel(void)
 {
 	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
 
-	if (!scx_kf_allowed(SCX_KF_DISPATCH))
-		return;
-
 	if (dspc->cursor > 0)
 		dspc->cursor--;
 	else
@@ -6439,9 +6320,6 @@  __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
 	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
 	struct scx_dispatch_q *dsq;
 
-	if (!scx_kf_allowed(SCX_KF_DISPATCH))
-		return false;
-
 	flush_dispatch_buf(dspc->rq);
 
 	dsq = find_user_dsq(dsq_id);
@@ -6632,9 +6510,6 @@  __bpf_kfunc u32 scx_bpf_reenqueue_local(void)
 	struct rq *rq;
 	struct task_struct *p, *n;
 
-	if (!scx_kf_allowed(SCX_KF_CPU_RELEASE))
-		return 0;
-
 	rq = cpu_rq(smp_processor_id());
 	lockdep_assert_rq_held(rq);
 
@@ -7239,7 +7114,7 @@  __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p)
 	struct task_group *tg = p->sched_task_group;
 	struct cgroup *cgrp = &cgrp_dfl_root.cgrp;
 
-	if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p))
+	if (!scx_kf_allowed_on_arg_tasks(p))
 		goto out;
 
 	cgrp = tg_cgrp(tg);
@@ -7479,10 +7354,6 @@  static int __init scx_init(void)
 	 *
 	 * Some kfuncs are context-sensitive and can only be called from
 	 * specific SCX ops. They are grouped into BTF sets accordingly.
-	 * Unfortunately, BPF currently doesn't have a way of enforcing such
-	 * restrictions. Eventually, the verifier should be able to enforce
-	 * them. For now, register them the same and make each kfunc explicitly
-	 * check using scx_kf_allowed().
 	 */
 	if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
 					     &scx_kfunc_set_ops_context_sensitive)) ||
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index efb6077810d8..e241935021eb 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -658,7 +658,7 @@  void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
 	 * managed by put_prev_task_idle()/set_next_task_idle().
 	 */
 	if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
-		SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
+		SCX_CALL_OP(update_idle, cpu_of(rq), idle);
 
 	/*
 	 * Update the idle masks:
@@ -803,9 +803,6 @@  __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
 	if (!check_builtin_idle_enabled())
 		goto prev_cpu;
 
-	if (!scx_kf_allowed(SCX_KF_SELECT_CPU))
-		goto prev_cpu;
-
 #ifdef CONFIG_SMP
 	return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle);
 #endif
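
For illustration only (not part of the patch): the practical effect of dropping the kf_mask machinery is that calling a context-sensitive kfunc from the wrong operation is now refused by the verifier when the struct_ops program is loaded, instead of failing through scx_kf_allowed() at runtime. A minimal BPF-scheduler sketch of the two contexts involved, assuming the helper macros from tools/sched_ext/include/scx/common.bpf.h (BPF_STRUCT_OPS, BPF_STRUCT_OPS_SLEEPABLE, SCX_OPS_DEFINE); the example_* names and the SHARED_DSQ id are hypothetical:

/*
 * Illustrative sketch, not part of this series. Mirrors the structure of
 * tools/sched_ext/scx_simple.bpf.c.
 */
#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

#define SHARED_DSQ 0

s32 BPF_STRUCT_OPS_SLEEPABLE(example_init)
{
	/* Create the shared DSQ that enqueue/dispatch below operate on. */
	return scx_bpf_create_dsq(SHARED_DSQ, -1);
}

void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
{
	/* Insertion kfuncs are callable from enqueue context. */
	scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);

	/*
	 * A dispatch-only kfunc such as scx_bpf_dsq_move_to_local() used
	 * here was previously caught at runtime by
	 * scx_kf_allowed(SCX_KF_DISPATCH); with the context-aware kfunc
	 * filters the verifier rejects it at load time instead.
	 */
}

void BPF_STRUCT_OPS(example_dispatch, s32 cpu, struct task_struct *prev)
{
	/* Dispatch-context kfunc: valid when attached as ops.dispatch(). */
	scx_bpf_dsq_move_to_local(SHARED_DSQ);
}

SCX_OPS_DEFINE(example_ops,
	       .init		= (void *)example_init,
	       .enqueue		= (void *)example_enqueue,
	       .dispatch	= (void *)example_dispatch,
	       .name		= "example");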