@@ -239,6 +239,11 @@ cgroup v2 currently supports the following mount options.
will not be tracked by the memory controller (even if cgroup
v2 is remounted later on).
+ pids_miglimit
+ Apply pids.max limit also when migrating tasks between cgroups. Only
+ new destination limits are taken into account, i.e. if a subtree has
+ pids.current > pids.max, migration within that subtree is allowed.
+
Organizing Processes and Threads
--------------------------------
@@ -2204,7 +2209,8 @@ Organisational operations are not blocked by cgroup policies, so it is
possible to have pids.current > pids.max. This can be done by either
setting the limit to be smaller than pids.current, or attaching enough
processes to the cgroup such that pids.current is larger than
-pids.max. However, it is not possible to violate a cgroup PID policy
+pids.max (unless the pids_miglimit mount option is given).
+However, it is not possible to violate a cgroup PID policy
through fork() or clone(). These will return -EAGAIN if the creation
of a new process would cause a cgroup policy to be violated.
@@ -119,7 +119,12 @@ enum {
/*
* Enable hugetlb accounting for the memory controller.
*/
- CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19),
+ CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19),
+
+ /*
+ * Enforce pids limit upon task migration
+ */
+ CGRP_ROOT_PIDS_MIGRATION_LIMIT = (1 << 20),
};
/* cftype->flags */
@@ -1922,6 +1922,7 @@ enum cgroup2_param {
Opt_memory_localevents,
Opt_memory_recursiveprot,
Opt_memory_hugetlb_accounting,
+ Opt_pids_miglimit,
nr__cgroup2_params
};
@@ -1931,6 +1932,7 @@ static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
fsparam_flag("memory_localevents", Opt_memory_localevents),
fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting),
+ fsparam_flag("pids_miglimit", Opt_pids_miglimit),
{}
};
@@ -1960,6 +1962,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
case Opt_memory_hugetlb_accounting:
ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
return 0;
+ case Opt_pids_miglimit:
+ ctx->flags |= CGRP_ROOT_PIDS_MIGRATION_LIMIT;
+ return 0;
}
return -EINVAL;
}
@@ -1989,6 +1994,12 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+
+ if (root_flags & CGRP_ROOT_PIDS_MIGRATION_LIMIT)
+ cgrp_dfl_root.flags |= CGRP_ROOT_PIDS_MIGRATION_LIMIT;
+ else
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_PIDS_MIGRATION_LIMIT;
+
}
}
@@ -2004,6 +2015,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
seq_puts(seq, ",memory_recursiveprot");
if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
seq_puts(seq, ",memory_hugetlb_accounting");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_PIDS_MIGRATION_LIMIT)
+ seq_puts(seq, ",pids_miglimit");
return 0;
}
@@ -7061,7 +7074,8 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
"favordynmods\n"
"memory_localevents\n"
"memory_recursiveprot\n"
- "memory_hugetlb_accounting\n");
+ "memory_hugetlb_accounting\n"
+ "pids_miglimit\n");
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
@@ -217,6 +217,7 @@ static int pids_can_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *dst_css;
+ int err, ret = 0;
cgroup_taskset_for_each(task, dst_css, tset) {
struct pids_cgroup *pids = css_pids(dst_css);
@@ -231,10 +232,13 @@ static int pids_can_attach(struct cgroup_taskset *tset)
old_css = task_css(task, pids_cgrp_id);
old_pids = css_pids(old_css);
- (void) pids_tranfer_charge(old_pids, pids, 1);
+ err = pids_tranfer_charge(old_pids, pids, 1);
+
+ if (!ret && (cgrp_dfl_root.flags & CGRP_ROOT_PIDS_MIGRATION_LIMIT))
+ ret = err;
}
- return 0;
+ return ret;
}
static void pids_cancel_attach(struct cgroup_taskset *tset)
While pids controller is designed with only forks in mind, it leads to situations where the limit is apparently ineffective. A manager daemon is in /src and it spawns tasks into /dst. The administrator sets up a limit dst/pids.max while src/pids.max is unlimited. The manager daemon can spawn more than dst/pids.max tasks because they get into their target cgroup via migration (or CLONE_INTO_CGROUP). For this (migration) to work both src and dst must be in the same resource domain so the manager daemon does not honor the limit which is under its control anyway and no excessive resource consumption happens. dst/pids.current > dst/pids.max may come as a surprise when the spawning mechanism is opaque to the administrator of dst/pids.max. Change the behavior of pids controller to take into account limits of target cgroup upon migration (but only below the common ancestor of src and dst, pids.current of common ancestor and above is not affected by migration, so deliberately ignore pre-existing pids.current > pids.max). This change of behavior is hidden behind cgroup2 mount option and the default is unchanged, pids.max won't affect migrations. Signed-off-by: Michal Koutný <mkoutny@suse.com> --- Documentation/admin-guide/cgroup-v2.rst | 8 +++++++- include/linux/cgroup-defs.h | 7 ++++++- kernel/cgroup/cgroup.c | 16 +++++++++++++++- kernel/cgroup/pids.c | 8 ++++++-- 4 files changed, 34 insertions(+), 5 deletions(-)