diff mbox series

[v2,RFC] sched: only perform capability check on privileged operation

Message ID 20220502152414.110922-1-cgzones@googlemail.com (mailing list archive)
State Handled Elsewhere
Delegated to: Paul Moore
Headers show
Series [v2,RFC] sched: only perform capability check on privileged operation | expand

Commit Message

Christian Göttsche May 2, 2022, 3:24 p.m. UTC
sched_setattr(2) issues via kernel/sched/core.c:__sched_setscheduler()
a CAP_SYS_NICE audit event unconditionally, even when the requested
operation does not require that capability / is unprivileged, i.e. for
reducing niceness.
This is relevant in connection with SELinux, where a capability check
results in a policy decision and by default a denial message on
insufficient permission is issued.
It can lead to three undesired cases:
  1. A denial message is generated, even in case the operation was an
     unprivileged one and thus the syscall succeeded, creating noise.
  2. To avoid the noise from 1. the policy writer adds a rule to ignore
     those denial messages, hiding future syscalls, where the task
     performs an actual privileged operation, leading to hidden limited
     functionality of that task.
  3. To avoid the noise from 1. the policy writer adds a rule to allow
     the task the capability CAP_SYS_NICE, while it does not need it,
     violating the principle of least privilege.

Conduct privilged/unprivileged categorization first and perform a
capable test (and at most once) only if needed.

Signed-off-by: Christian Göttsche <cgzones@googlemail.com>

---
v2:
  add is_nice_reduction() to avoid duplicate capable(CAP_SYS_NICE)
  checks via can_nice()
---
 kernel/sched/core.c | 135 +++++++++++++++++++++++++++-----------------
 1 file changed, 84 insertions(+), 51 deletions(-)

Comments

Peter Zijlstra May 4, 2022, 2:23 p.m. UTC | #1
On Mon, May 02, 2022 at 05:24:14PM +0200, Christian Göttsche wrote:
> sched_setattr(2) issues via kernel/sched/core.c:__sched_setscheduler()
> a CAP_SYS_NICE audit event unconditionally, even when the requested
> operation does not require that capability / is unprivileged, i.e. for
> reducing niceness.
> This is relevant in connection with SELinux, where a capability check
> results in a policy decision and by default a denial message on
> insufficient permission is issued.
> It can lead to three undesired cases:
>   1. A denial message is generated, even in case the operation was an
>      unprivileged one and thus the syscall succeeded, creating noise.
>   2. To avoid the noise from 1. the policy writer adds a rule to ignore
>      those denial messages, hiding future syscalls, where the task
>      performs an actual privileged operation, leading to hidden limited
>      functionality of that task.
>   3. To avoid the noise from 1. the policy writer adds a rule to allow
>      the task the capability CAP_SYS_NICE, while it does not need it,
>      violating the principle of least privilege.
> 
> Conduct privilged/unprivileged categorization first and perform a
> capable test (and at most once) only if needed.
> 
> Signed-off-by: Christian Göttsche <cgzones@googlemail.com>

Does something like so on top work?

---
 kernel/sched/core.c | 41 +++++++++++++++++------------------------
 1 file changed, 17 insertions(+), 24 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ba5a9a1ce1e5..d3b5a2757c5f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6931,17 +6931,27 @@ void set_user_nice(struct task_struct *p, long nice)
 EXPORT_SYMBOL(set_user_nice);
 
 /*
- * can_nice - check if a task can reduce its nice value
+ * is_nice_reduction - check if nice value is an actual reduction
+ *
  * @p: task
  * @nice: nice value
  */
-int can_nice(const struct task_struct *p, const int nice)
+static bool is_nice_reduction(const struct task_struct *p, const int nice)
 {
 	/* Convert nice value [19,-20] to rlimit style value [1,40]: */
 	int nice_rlim = nice_to_rlimit(nice);
 
-	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
-		capable(CAP_SYS_NICE));
+	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE));
+}
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const struct task_struct *p, const int nice)
+{
+	return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE);
 }
 
 #ifdef __ARCH_WANT_SYS_NICE
@@ -7220,22 +7230,6 @@ static bool check_same_owner(struct task_struct *p)
 	return match;
 }
 
-/*
- * is_nice_reduction - check if nice value is an actual reduction
- *
- * Similar to can_nice() but does not perform a capability check.
- *
- * @p: task
- * @nice: nice value
- */
-static bool is_nice_reduction(const struct task_struct *p, const int nice)
-{
-	/* Convert nice value [19,-20] to rlimit style value [1,40]: */
-	int nice_rlim = nice_to_rlimit(nice);
-
-	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE));
-}
-
 /*
  * Allow unprivileged RT tasks to decrease priority.
  * Only issue a capable test if needed and only once to avoid an audit
@@ -7247,13 +7241,12 @@ static int user_check_sched_setscheduler(struct task_struct *p,
 {
 	if (fair_policy(policy)) {
 		if (attr->sched_nice < task_nice(p) &&
-			!is_nice_reduction(p, attr->sched_nice))
+		    !is_nice_reduction(p, attr->sched_nice))
 			goto req_priv;
 	}
 
 	if (rt_policy(policy)) {
-		unsigned long rlim_rtprio =
-		task_rlimit(p, RLIMIT_RTPRIO);
+		unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
 
 		/* Can't set/change the rt policy: */
 		if (policy != p->policy && !rlim_rtprio)
@@ -7261,7 +7254,7 @@ static int user_check_sched_setscheduler(struct task_struct *p,
 
 		/* Can't increase priority: */
 		if (attr->sched_priority > p->rt_priority &&
-			attr->sched_priority > rlim_rtprio)
+		    attr->sched_priority > rlim_rtprio)
 			goto req_priv;
 	}
diff mbox series

Patch

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d575b4914925..b9c1e67af46f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7211,6 +7211,86 @@  static bool check_same_owner(struct task_struct *p)
 	return match;
 }
 
+/*
+ * is_nice_reduction - check if nice value is an actual reduction
+ *
+ * Similar to can_nice() but does not perform a capability check.
+ *
+ * @p: task
+ * @nice: nice value
+ */
+static bool is_nice_reduction(const struct task_struct *p, const int nice)
+{
+	/* Convert nice value [19,-20] to rlimit style value [1,40]: */
+	int nice_rlim = nice_to_rlimit(nice);
+
+	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE));
+}
+
+/*
+ * Allow unprivileged RT tasks to decrease priority.
+ * Only issue a capable test if needed and only once to avoid an audit
+ * event on permitted non-privileged operations:
+ */
+static int user_check_sched_setscheduler(struct task_struct *p,
+					 const struct sched_attr *attr,
+					 int policy, int reset_on_fork)
+{
+	if (fair_policy(policy)) {
+		if (attr->sched_nice < task_nice(p) &&
+			!is_nice_reduction(p, attr->sched_nice))
+			goto req_priv;
+	}
+
+	if (rt_policy(policy)) {
+		unsigned long rlim_rtprio =
+		task_rlimit(p, RLIMIT_RTPRIO);
+
+		/* Can't set/change the rt policy: */
+		if (policy != p->policy && !rlim_rtprio)
+			goto req_priv;
+
+		/* Can't increase priority: */
+		if (attr->sched_priority > p->rt_priority &&
+			attr->sched_priority > rlim_rtprio)
+			goto req_priv;
+	}
+
+	/*
+	 * Can't set/change SCHED_DEADLINE policy at all for now
+	 * (safest behavior); in the future we would like to allow
+	 * unprivileged DL tasks to increase their relative deadline
+	 * or reduce their runtime (both ways reducing utilization)
+	 */
+	if (dl_policy(policy))
+		goto req_priv;
+
+	/*
+	 * Treat SCHED_IDLE as nice 20. Only allow a switch to
+	 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
+	 */
+	if (task_has_idle_policy(p) && !idle_policy(policy)) {
+		if (!is_nice_reduction(p, task_nice(p)))
+			goto req_priv;
+	}
+
+	/* Can't change other user's priorities: */
+	if (!check_same_owner(p))
+		goto req_priv;
+
+	/* Normal users shall not reset the sched_reset_on_fork flag: */
+	if (p->sched_reset_on_fork && !reset_on_fork)
+		goto req_priv;
+
+	return 0;
+
+req_priv:
+	if (!capable(CAP_SYS_NICE))
+		return -EPERM;
+
+	return 0;
+}
+
 static int __sched_setscheduler(struct task_struct *p,
 				const struct sched_attr *attr,
 				bool user, bool pi)
@@ -7252,58 +7332,11 @@  static int __sched_setscheduler(struct task_struct *p,
 	    (rt_policy(policy) != (attr->sched_priority != 0)))
 		return -EINVAL;
 
-	/*
-	 * Allow unprivileged RT tasks to decrease priority:
-	 */
-	if (user && !capable(CAP_SYS_NICE)) {
-		if (fair_policy(policy)) {
-			if (attr->sched_nice < task_nice(p) &&
-			    !can_nice(p, attr->sched_nice))
-				return -EPERM;
-		}
-
-		if (rt_policy(policy)) {
-			unsigned long rlim_rtprio =
-					task_rlimit(p, RLIMIT_RTPRIO);
-
-			/* Can't set/change the rt policy: */
-			if (policy != p->policy && !rlim_rtprio)
-				return -EPERM;
-
-			/* Can't increase priority: */
-			if (attr->sched_priority > p->rt_priority &&
-			    attr->sched_priority > rlim_rtprio)
-				return -EPERM;
-		}
-
-		 /*
-		  * Can't set/change SCHED_DEADLINE policy at all for now
-		  * (safest behavior); in the future we would like to allow
-		  * unprivileged DL tasks to increase their relative deadline
-		  * or reduce their runtime (both ways reducing utilization)
-		  */
-		if (dl_policy(policy))
-			return -EPERM;
-
-		/*
-		 * Treat SCHED_IDLE as nice 20. Only allow a switch to
-		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
-		 */
-		if (task_has_idle_policy(p) && !idle_policy(policy)) {
-			if (!can_nice(p, task_nice(p)))
-				return -EPERM;
-		}
-
-		/* Can't change other user's priorities: */
-		if (!check_same_owner(p))
-			return -EPERM;
-
-		/* Normal users shall not reset the sched_reset_on_fork flag: */
-		if (p->sched_reset_on_fork && !reset_on_fork)
-			return -EPERM;
-	}
-
 	if (user) {
+		retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork);
+		if (retval)
+			return retval;
+
 		if (attr->sched_flags & SCHED_FLAG_SUGOV)
 			return -EINVAL;