@@ -210,6 +210,35 @@ cgroup v2 currently supports the following mount options.
relying on the original semantics (e.g. specifying bogusly
high 'bypass' protection values at higher tree levels).
+ memory_hugetlb_accounting
+ Count HugeTLB memory usage towards the cgroup's overall
+ memory usage for the memory controller (for the purpose of
+ statistics reporting and memory protetion). This is a new
+ behavior that could regress existing setups, so it must be
+ explicitly opted in with this mount option.
+
+ A few caveats to keep in mind:
+
+ * There is no HugeTLB pool management involved in the memory
+ controller. The pre-allocated pool does not belong to anyone.
+ Specifically, when a new HugeTLB folio is allocated to
+ the pool, it is not accounted for from the perspective of the
+ memory controller. It is only charged to a cgroup when it is
+ actually used (for e.g at page fault time). Host memory
+ overcommit management has to consider this when configuring
+ hard limits. In general, HugeTLB pool management should be
+ done via other mechanisms (such as the HugeTLB controller).
+ * Failure to charge a HugeTLB folio to the memory controller
+ results in SIGBUS. This could happen even if the HugeTLB pool
+ still has pages available (but the cgroup limit is hit and
+ reclaim attempt fails).
+ * Charging HugeTLB memory towards the memory controller affects
+ memory protection and reclaim dynamics. Any userspace tuning
+ (of low, min limits for e.g) needs to take this into account.
+ * HugeTLB pages utilized while this option is not selected
+ will not be tracked by the memory controller (even if cgroup
+ v2 is remounted later on).
+
Organizing Processes and Threads
--------------------------------
@@ -115,6 +115,11 @@ enum {
* Enable recursive subtree protection
*/
CGRP_ROOT_MEMORY_RECURSIVE_PROT = (1 << 18),
+
+ /*
+ * Enable hugetlb accounting for the memory controller.
+ */
+ CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19),
};
/* cftype->flags */
@@ -679,6 +679,9 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
return __mem_cgroup_charge(folio, mm, gfp);
}
+int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
+ long nr_pages);
+
int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
gfp_t gfp, swp_entry_t entry);
void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
@@ -1264,6 +1267,12 @@ static inline int mem_cgroup_charge(struct folio *folio,
return 0;
}
+static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg,
+ gfp_t gfp, long nr_pages)
+{
+ return 0;
+}
+
static inline int mem_cgroup_swapin_charge_folio(struct folio *folio,
struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
{
@@ -1902,6 +1902,7 @@ enum cgroup2_param {
Opt_favordynmods,
Opt_memory_localevents,
Opt_memory_recursiveprot,
+ Opt_memory_hugetlb_accounting,
nr__cgroup2_params
};
@@ -1910,6 +1911,7 @@ static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
fsparam_flag("favordynmods", Opt_favordynmods),
fsparam_flag("memory_localevents", Opt_memory_localevents),
fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
+ fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting),
{}
};
@@ -1936,6 +1938,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
case Opt_memory_recursiveprot:
ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
return 0;
+ case Opt_memory_hugetlb_accounting:
+ ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+ return 0;
}
return -EINVAL;
}
@@ -1960,6 +1965,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
+
+ if (root_flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
+ cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+ else
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
}
}
@@ -1973,6 +1983,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
seq_puts(seq, ",memory_localevents");
if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
seq_puts(seq, ",memory_recursiveprot");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
+ seq_puts(seq, ",memory_hugetlb_accounting");
return 0;
}
@@ -7050,7 +7062,8 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
"nsdelegate\n"
"favordynmods\n"
"memory_localevents\n"
- "memory_recursiveprot\n");
+ "memory_recursiveprot\n"
+ "memory_hugetlb_accounting\n");
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
@@ -1902,6 +1902,7 @@ void free_huge_folio(struct folio *folio)
pages_per_huge_page(h), folio);
hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
pages_per_huge_page(h), folio);
+ mem_cgroup_uncharge(folio);
if (restore_reserve)
h->resv_huge_pages++;
@@ -3009,11 +3010,20 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
struct hugepage_subpool *spool = subpool_vma(vma);
struct hstate *h = hstate_vma(vma);
struct folio *folio;
- long map_chg, map_commit;
+ long map_chg, map_commit, nr_pages = pages_per_huge_page(h);
long gbl_chg;
- int ret, idx;
+ int memcg_charge_ret, ret, idx;
struct hugetlb_cgroup *h_cg = NULL;
+ struct mem_cgroup *memcg;
bool deferred_reserve;
+ gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL;
+
+ memcg = get_mem_cgroup_from_current();
+ memcg_charge_ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages);
+ if (memcg_charge_ret == -ENOMEM) {
+ mem_cgroup_put(memcg);
+ return ERR_PTR(-ENOMEM);
+ }
idx = hstate_index(h);
/*
@@ -3022,8 +3032,12 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
* code of zero indicates a reservation exists (no change).
*/
map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
- if (map_chg < 0)
+ if (map_chg < 0) {
+ if (!memcg_charge_ret)
+ mem_cgroup_cancel_charge(memcg, nr_pages);
+ mem_cgroup_put(memcg);
return ERR_PTR(-ENOMEM);
+ }
/*
* Processes that did not create the mapping will have no
@@ -3034,10 +3048,8 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
*/
if (map_chg || avoid_reserve) {
gbl_chg = hugepage_subpool_get_pages(spool, 1);
- if (gbl_chg < 0) {
- vma_end_reservation(h, vma, addr);
- return ERR_PTR(-ENOSPC);
- }
+ if (gbl_chg < 0)
+ goto out_end_reservation;
/*
* Even though there was no reservation in the region/reserve
@@ -3119,6 +3131,11 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
pages_per_huge_page(h), folio);
}
+
+ if (!memcg_charge_ret)
+ mem_cgroup_commit_charge(folio, memcg);
+ mem_cgroup_put(memcg);
+
return folio;
out_uncharge_cgroup:
@@ -3130,7 +3147,11 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
out_subpool_put:
if (map_chg || avoid_reserve)
hugepage_subpool_put_pages(spool, 1);
+out_end_reservation:
vma_end_reservation(h, vma, addr);
+ if (!memcg_charge_ret)
+ mem_cgroup_cancel_charge(memcg, nr_pages);
+ mem_cgroup_put(memcg);
return ERR_PTR(-ENOSPC);
}
@@ -7085,6 +7085,41 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
return ret;
}
+/**
+ * mem_cgroup_hugetlb_try_charge - try to charge the memcg for a hugetlb folio
+ * @memcg: memcg to charge.
+ * @gfp: reclaim mode.
+ * @nr_pages: number of pages to charge.
+ *
+ * This function is called when allocating a huge page folio to determine if
+ * the memcg has the capacity for it. It does not commit the charge yet,
+ * as the hugetlb folio itself has not been obtained from the hugetlb pool.
+ *
+ * Once we have obtained the hugetlb folio, we can call
+ * mem_cgroup_commit_charge() to commit the charge. If we fail to obtain the
+ * folio, we should instead call mem_cgroup_cancel_charge() to undo the effect
+ * of try_charge().
+ *
+ * Returns 0 on success. Otherwise, an error code is returned.
+ */
+int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
+ long nr_pages)
+{
+ /*
+ * If hugetlb memcg charging is not enabled, do not fail hugetlb allocation,
+ * but do not attempt to commit charge later (or cancel on error) either.
+ */
+ if (mem_cgroup_disabled() || !memcg ||
+ !cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
+ !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING))
+ return -EOPNOTSUPP;
+
+ if (try_charge(memcg, gfp, nr_pages))
+ return -ENOMEM;
+
+ return 0;
+}
+
/**
* mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin.
* @folio: folio to charge.
@@ -7354,7 +7389,12 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
return;
memcg = folio_memcg(old);
- VM_WARN_ON_ONCE_FOLIO(!memcg, old);
+ /*
+ * Note that it is normal to see !memcg for a hugetlb folio.
+ * For e.g, itt could have been allocated when memory_hugetlb_accounting
+ * was not selected.
+ */
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !memcg, old);
if (!memcg)
return;
@@ -633,8 +633,7 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
folio_copy_owner(newfolio, folio);
- if (!folio_test_hugetlb(folio))
- mem_cgroup_migrate(folio, newfolio);
+ mem_cgroup_migrate(folio, newfolio);
}
EXPORT_SYMBOL(folio_migrate_flags);