
hugetlb: force allocating surplus hugepages on mempolicy allowed nodes

Message ID 20240621190050.mhxwb65zn37doegp@redhat.com
State New
Series hugetlb: force allocating surplus hugepages on mempolicy allowed nodes

Commit Message

Aristeu Rozanski June 21, 2024, 7 p.m. UTC
allowed_mems_nr() only accounts for the number of free hugepages on the
nodes the current process is allowed to use. In case one or more of the
requested surplus hugepages are allocated on a different node, the whole
allocation will fail because allowed_mems_nr() returns a lower value.

So allocate surplus hugepages on one of the nodes the current process is
allowed to use.

An easy way to reproduce this issue is on a system with two or more NUMA
nodes:

		# echo 0 >/proc/sys/vm/nr_hugepages
		# echo 1 >/proc/sys/vm/nr_overcommit_hugepages
		# numactl -m0 ./tools/testing/selftests/mm/map_hugetlb 2

will eventually fail once the hugepage ends up allocated on a different node.
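
The userspace-visible effect is the mmap() of the hugepage failing with
ENOMEM. A minimal standalone sketch of what the selftest does
(hypothetical reproducer, assuming the default 2MB hugepage size):

	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/mman.h>

	int main(void)
	{
		size_t len = 2UL * 1024 * 1024;	/* one default-sized hugepage */
		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

		/*
		 * The reservation is taken at mmap() time; when the surplus
		 * page was allocated on a node outside the mempolicy, the
		 * allowed_mems_nr() check fails and mmap() returns ENOMEM.
		 */
		if (p == MAP_FAILED) {
			perror("mmap");
			return EXIT_FAILURE;
		}
		munmap(p, len);
		return EXIT_SUCCESS;
	}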

Cc: Muchun Song <muchun.song@linux.dev>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Aristeu Rozanski <aris@redhat.com>

---
 mm/hugetlb.c |   46 +++++++++++++++++++++++++++-------------------
 1 file changed, 27 insertions(+), 19 deletions(-)

Comments

Andrew Morton June 22, 2024, 12:56 a.m. UTC | #1
On Fri, 21 Jun 2024 15:00:50 -0400 Aristeu Rozanski <aris@redhat.com> wrote:

> allowed_mems_nr() only accounts for the number of free hugepages on the
> nodes the current process is allowed to use. In case one or more of the
> requested surplus hugepages are allocated on a different node, the whole
> allocation will fail because allowed_mems_nr() returns a lower value.
> 
> So allocate surplus hugepages on one of the nodes the current process is
> allowed to use.
> 
> An easy way to reproduce this issue is on a system with two or more NUMA
> nodes:
> 
> 		# echo 0 >/proc/sys/vm/nr_hugepages
> 		# echo 1 >/proc/sys/vm/nr_overcommit_hugepages
> 		# numactl -m0 ./tools/testing/selftests/mm/map_hugetlb 2
> 
> will eventually fail once the hugepage ends up allocated on a different node.
> 

Should we backport this into -stable kernels?
Andrew Morton June 25, 2024, 10:54 p.m. UTC | #2
On Fri, 21 Jun 2024 17:56:09 -0700 Andrew Morton <akpm@linux-foundation.org> wrote:

> On Fri, 21 Jun 2024 15:00:50 -0400 Aristeu Rozanski <aris@redhat.com> wrote:
> 
> > [...]
> 
> Should we backport this into -stable kernels?

Please?

Aristeu, a description of the userspace-visible effects of the bug will
really help things along here.

Patch

--- upstream.orig/mm/hugetlb.c	2024-06-20 13:42:25.699568114 -0400
+++ upstream/mm/hugetlb.c	2024-06-21 14:52:49.970798299 -0400
@@ -2618,6 +2618,23 @@ struct folio *alloc_hugetlb_folio_nodemask
 	return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask);
 }
 
+static nodemask_t *policy_mbind_nodemask(gfp_t gfp)
+{
+#ifdef CONFIG_NUMA
+	struct mempolicy *mpol = get_task_policy(current);
+
+	/*
+	 * Only enforce MPOL_BIND policy which overlaps with cpuset policy
+	 * (from policy_nodemask) specifically for hugetlb case
+	 */
+	if (mpol->mode == MPOL_BIND &&
+		(apply_policy_zone(mpol, gfp_zone(gfp)) &&
+		 cpuset_nodemask_valid_mems_allowed(&mpol->nodes)))
+		return &mpol->nodes;
+#endif
+	return NULL;
+}
+
 /*
  * Increase the hugetlb pool such that it can accommodate a reservation
  * of size 'delta'.
@@ -2631,6 +2648,8 @@ static int gather_surplus_pages(struct hstate *h, long delta)
 	long i;
 	long needed, allocated;
 	bool alloc_ok = true;
+	int node;
+	nodemask_t *mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h));
 
 	lockdep_assert_held(&hugetlb_lock);
 	needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
@@ -2645,8 +2664,14 @@ allocated = 0;
 retry:
 	spin_unlock_irq(&hugetlb_lock);
 	for (i = 0; i < needed; i++) {
-		folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
-				NUMA_NO_NODE, NULL);
+		for_each_node_mask(node, cpuset_current_mems_allowed) {
+			if (!mbind_nodemask || node_isset(node, *mbind_nodemask)) {
+				folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
+						node, NULL);
+				if (folio)
+					break;
+			}
+		}
 		if (!folio) {
 			alloc_ok = false;
 			break;
@@ -4876,23 +4901,6 @@ default_hstate_max_huge_pages = 0;
 }
 __setup("default_hugepagesz=", default_hugepagesz_setup);
 
-static nodemask_t *policy_mbind_nodemask(gfp_t gfp)
-{
-#ifdef CONFIG_NUMA
-	struct mempolicy *mpol = get_task_policy(current);
-
-	/*
-	 * Only enforce MPOL_BIND policy which overlaps with cpuset policy
-	 * (from policy_nodemask) specifically for hugetlb case
-	 */
-	if (mpol->mode == MPOL_BIND &&
-		(apply_policy_zone(mpol, gfp_zone(gfp)) &&
-		 cpuset_nodemask_valid_mems_allowed(&mpol->nodes)))
-		return &mpol->nodes;
-#endif
-	return NULL;
-}
-
 static unsigned int allowed_mems_nr(struct hstate *h)
 {
 	int node;