new file mode 100644
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SCHED_CGROUP_H
+#define _LINUX_SCHED_CGROUP_H
+
+#include <linux/cgroup-defs.h>
+#include <linux/cpumask.h>
+
+#ifdef CONFIG_CFS_BANDWIDTH
+
+/*
+ * Returns the max whole number of CPUs that @css's CFS bandwidth settings
+ * allow, or nr_cpu_ids when @css is in the root task group or has an
+ * unlimited quota.
+ */
+int max_cfs_bandwidth_cpus(struct cgroup_subsys_state *css);
+
+#else /* CONFIG_CFS_BANDWIDTH */
+
+/*
+ * Stub for kernels built without CONFIG_CFS_BANDWIDTH: @css is not examined
+ * and nr_cpu_ids is always reported.
+ */
+static inline int max_cfs_bandwidth_cpus(struct cgroup_subsys_state *css)
+{
+	const int unlimited = nr_cpu_ids;
+
+	return unlimited;
+}
+
+#endif /* CONFIG_CFS_BANDWIDTH */
+
+#endif /* _LINUX_SCHED_CGROUP_H */
@@ -24,6 +24,7 @@
* 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
*/
+#include <linux/cgroup.h>
#include <linux/completion.h>
#include <linux/export.h>
#include <linux/cpumask.h>
@@ -34,6 +35,7 @@
#include <linux/padata.h>
#include <linux/mutex.h>
#include <linux/sched.h>
+#include <linux/sched/cgroup.h>
#include <linux/slab.h>
#include <linux/sysfs.h>
#include <linux/rcupdate.h>
@@ -572,6 +574,7 @@ int padata_do_multithreaded_job(struct padata_mt_job *job,
{
/* In case threads finish at different times. */
static const unsigned long load_balance_factor = 4;
+ struct cgroup_subsys_state *cpu_css;
struct padata_work my_work, *pw;
struct padata_mt_job_state ps;
LIST_HEAD(works);
@@ -585,6 +588,18 @@ int padata_do_multithreaded_job(struct padata_mt_job *job,
nworks = min(nworks, job->max_threads);
nworks = min(nworks, current->nr_cpus_allowed);
+#ifdef CONFIG_CGROUP_SCHED
+ /*
+ * Cap threads at the max number of CPUs current's CFS bandwidth
+ * settings allow. Keep it simple, don't try to keep this value up to
+ * date. The ifdef guards cpu_cgrp_id.
+ */
+ rcu_read_lock();
+ cpu_css = task_css(current, cpu_cgrp_id);
+ nworks = min(nworks, max_cfs_bandwidth_cpus(cpu_css));
+ rcu_read_unlock();
+#endif
+
if (nworks == 1) {
/* Single thread, no coordination needed, cut to the chase. */
return job->thread_fn(job->start, job->start + job->size,
@@ -10021,6 +10021,25 @@ static long tg_get_cfs_burst(struct task_group *tg)
return burst_us;
}
+/*
+ * Returns the max whole number of CPUs that @css's bandwidth settings allow.
+ *
+ * Only the quota and period of @css's own task group are consulted.  The root
+ * task group and groups with an unlimited quota yield nr_cpu_ids.  Otherwise
+ * the result is quota / period rounded down and clamped to [1, nr_cpu_ids],
+ * so it is never 0 and callers can use it directly as a thread count.
+ */
+int max_cfs_bandwidth_cpus(struct cgroup_subsys_state *css)
+{
+	struct task_group *tg = css_tg(css);
+	u64 quota_us, period_us, cpus;
+
+	if (tg == &root_task_group)
+		return nr_cpu_ids;
+
+	quota_us = tg_get_cfs_quota(tg);
+
+	if (quota_us == RUNTIME_INF)
+		return nr_cpu_ids;
+
+	period_us = tg_get_cfs_period(tg);
+
+	/* Plain 64-bit '/' does not link on 32-bit architectures. */
+	cpus = div64_u64(quota_us, period_us);
+
+	/*
+	 * A quota smaller than one period rounds down to 0 CPUs, and a huge
+	 * quota could overflow the int return value; bound both ends.
+	 */
+	return clamp_t(u64, cpus, 1, nr_cpu_ids);
+}
+
+
static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
Helpers are currently not bound by the main thread's CFS bandwidth
limits because they're kernel threads that run on the root runqueues, so
a multithreaded job could cause a task group to consume more quota than
it's configured for.

As a starting point for helpers honoring these limits, restrict a job to
only as many helpers as there are CPUs allowed by these limits.  Helpers
are generally CPU-bound, so starting more helpers than this would likely
exceed the group's entire quota.  Max CFS bandwidth CPUs are calculated
conservatively with integer division (quota / period).

This restriction ignores other tasks in the group that might also be
consuming quota, so it doesn't strictly prevent a group from exceeding
its limits.  However, this may be the right tradeoff between simplicity
and absolutely correct resource control, given that VFIO page pinning
typically happens during guest initialization when there's not much
other CPU activity in the group.  There's also a prototype for an
absolutely correct approach later in the series should that be
preferred.

Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
---
 include/linux/sched/cgroup.h | 21 +++++++++++++++++++++
 kernel/padata.c              | 15 +++++++++++++++
 kernel/sched/core.c          | 19 +++++++++++++++++++
 3 files changed, 55 insertions(+)
 create mode 100644 include/linux/sched/cgroup.h