
[RFC,02/14] mm/khugepaged: add struct collapse_control

Message ID 20220308213417.1407042-3-zokeefe@google.com (mailing list archive)
State New
Series mm: userspace hugepage collapse

Commit Message

Zach O'Keefe March 8, 2022, 9:34 p.m. UTC
Modularize huge page collapse by introducing struct collapse_control.
This structure serves to describe the properties of the requested
collapse, as well as to provide a local scratch pad used during the
collapse itself.

Later in the series, when we introduce the madvise collapse context, we
will want to be able to ignore khugepaged_max_ptes_[none|swap|shared]
in that context, and so enforce_pte_scan_limits is included here as a
property of the requested collapse.

Signed-off-by: Zach O'Keefe <zokeefe@google.com>
---
 mm/khugepaged.c | 120 ++++++++++++++++++++++++++++++------------------
 1 file changed, 76 insertions(+), 44 deletions(-)
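
For illustration, a minimal, self-contained sketch (userspace C, not kernel
code) of how the two collapse contexts are expected to initialize struct
collapse_control: the khugepaged thread in this patch enforces the sysfs
limits, while the madvise collapse context mentioned in the commit message,
which only arrives later in the series, is assumed here to pass false so the
limits are ignored. MAX_NUMNODES and NUMA_NO_NODE are stubbed for the sketch.

/*
 * Sketch only (userspace mock, not kernel code): illustrates the intended
 * initialization of struct collapse_control by its two eventual users.
 * The madvise path is an assumption about a later patch in this series.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_NUMNODES	4	/* stubbed; the kernel value is config-dependent */
#define NUMA_NO_NODE	(-1)

struct collapse_control {
	/* Respect khugepaged_max_ptes_[none|swap|shared] */
	bool enforce_pte_scan_limits;

	/* Num pages scanned per node */
	int node_load[MAX_NUMNODES];

	/* Last target selected in khugepaged_find_target_node() for this scan */
	int last_target_node;
};

static void collapse_control_init(struct collapse_control *cc,
				  bool enforce_pte_scan_limits)
{
	cc->enforce_pte_scan_limits = enforce_pte_scan_limits;
	cc->last_target_node = NUMA_NO_NODE;
	/* node_load is not cleared here; each scan memsets it before use. */
}

int main(void)
{
	struct collapse_control khugepaged_cc, madvise_cc;

	/* Background khugepaged scan (this patch): honor the sysfs limits. */
	collapse_control_init(&khugepaged_cc, true);

	/*
	 * Hypothetical madvise collapse context (later in the series):
	 * userspace explicitly requested the collapse, so the limits are
	 * assumed to be ignored.
	 */
	collapse_control_init(&madvise_cc, false);

	printf("khugepaged enforces limits: %d\n",
	       khugepaged_cc.enforce_pte_scan_limits);
	printf("madvise collapse enforces limits: %d\n",
	       madvise_cc.enforce_pte_scan_limits);
	return 0;
}

Note that collapse_control_init() deliberately leaves node_load alone; the
scan functions in the patch below memset it at the start of each scan.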

Comments

Yang Shi March 9, 2022, 10:53 p.m. UTC | #1
On Tue, Mar 8, 2022 at 1:34 PM Zach O'Keefe <zokeefe@google.com> wrote:
>
> Modularize huge page collapse by introducing struct collapse_control.
> This structure serves to describe the properties of the requested
> collapse, as well as to provide a local scratch pad used during the
> collapse itself.
>
> Later in the series, when we introduce the madvise collapse context, we
> will want to be able to ignore khugepaged_max_ptes_[none|swap|shared]
> in that context, and so enforce_pte_scan_limits is included here as a
> property of the requested collapse.
>
> Signed-off-by: Zach O'Keefe <zokeefe@google.com>
> ---
>  mm/khugepaged.c | 120 ++++++++++++++++++++++++++++++------------------
>  1 file changed, 76 insertions(+), 44 deletions(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index a4e5eaf3eb01..36fc0099c445 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -85,6 +85,24 @@ static struct kmem_cache *mm_slot_cache __read_mostly;
>
>  #define MAX_PTE_MAPPED_THP 8
>
> +struct collapse_control {
> +       /* Respect khugepaged_max_ptes_[none|swap|shared] */
> +       bool enforce_pte_scan_limits;

I'm fine with having the collapse_control struct, but it seems
enforce_pte_scan_limits is not actually used until a later patch. So,
as with patch #1, it'd be better to introduce new functions or
variables in the same patch as their users.

> +
> +       /* Num pages scanned per node */
> +       int node_load[MAX_NUMNODES];
> +
> +       /* Last target selected in khugepaged_find_target_node() for this scan */
> +       int last_target_node;
> +};
> +
> +static void collapse_control_init(struct collapse_control *cc,
> +                                 bool enforce_pte_scan_limits)
> +{
> +       cc->enforce_pte_scan_limits = enforce_pte_scan_limits;
> +       cc->last_target_node = NUMA_NO_NODE;
> +}
> +
>  /**
>   * struct mm_slot - hash lookup from mm to mm_slot
>   * @hash: hash collision list
> @@ -601,6 +619,7 @@ static bool is_refcount_suitable(struct page *page)
>  static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>                                         unsigned long address,
>                                         pte_t *pte,
> +                                       bool enforce_pte_scan_limits,
>                                         struct list_head *compound_pagelist)
>  {
>         struct page *page = NULL;
> @@ -614,7 +633,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>                 if (pte_none(pteval) || (pte_present(pteval) &&
>                                 is_zero_pfn(pte_pfn(pteval)))) {
>                         if (!userfaultfd_armed(vma) &&
> -                           ++none_or_zero <= khugepaged_max_ptes_none) {
> +                           (++none_or_zero <= khugepaged_max_ptes_none ||
> +                            !enforce_pte_scan_limits)) {
>                                 continue;
>                         } else {
>                                 result = SCAN_EXCEED_NONE_PTE;
> @@ -634,8 +654,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>
>                 VM_BUG_ON_PAGE(!PageAnon(page), page);
>
> -               if (page_mapcount(page) > 1 &&
> -                               ++shared > khugepaged_max_ptes_shared) {
> +               if (page_mapcount(page) > 1 && enforce_pte_scan_limits &&
> +                   ++shared > khugepaged_max_ptes_shared) {
>                         result = SCAN_EXCEED_SHARED_PTE;
>                         count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
>                         goto out;
> @@ -785,9 +805,7 @@ static void khugepaged_alloc_sleep(void)
>         remove_wait_queue(&khugepaged_wait, &wait);
>  }
>
> -static int khugepaged_node_load[MAX_NUMNODES];
> -
> -static bool khugepaged_scan_abort(int nid)
> +static bool khugepaged_scan_abort(int nid, struct collapse_control *cc)
>  {
>         int i;
>
> @@ -799,11 +817,11 @@ static bool khugepaged_scan_abort(int nid)
>                 return false;
>
>         /* If there is a count for this node already, it must be acceptable */
> -       if (khugepaged_node_load[nid])
> +       if (cc->node_load[nid])
>                 return false;
>
>         for (i = 0; i < MAX_NUMNODES; i++) {
> -               if (!khugepaged_node_load[i])
> +               if (!cc->node_load[i])
>                         continue;
>                 if (node_distance(nid, i) > node_reclaim_distance)
>                         return true;
> @@ -818,28 +836,28 @@ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
>  }
>
>  #ifdef CONFIG_NUMA
> -static int khugepaged_find_target_node(void)
> +static int khugepaged_find_target_node(struct collapse_control *cc)
>  {
> -       static int last_khugepaged_target_node = NUMA_NO_NODE;
>         int nid, target_node = 0, max_value = 0;
>
>         /* find first node with max normal pages hit */
>         for (nid = 0; nid < MAX_NUMNODES; nid++)
> -               if (khugepaged_node_load[nid] > max_value) {
> -                       max_value = khugepaged_node_load[nid];
> +               if (cc->node_load[nid] > max_value) {
> +                       max_value = cc->node_load[nid];
>                         target_node = nid;
>                 }
>
>         /* do some balance if several nodes have the same hit record */
> -       if (target_node <= last_khugepaged_target_node)
> -               for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
> -                               nid++)
> -                       if (max_value == khugepaged_node_load[nid]) {
> +       if (target_node <= cc->last_target_node)
> +               for (nid = cc->last_target_node + 1; nid < MAX_NUMNODES;
> +                    nid++) {
> +                       if (max_value == cc->node_load[nid]) {
>                                 target_node = nid;
>                                 break;
>                         }
> +               }
>
> -       last_khugepaged_target_node = target_node;
> +       cc->last_target_node = target_node;
>         return target_node;
>  }
>
> @@ -877,7 +895,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
>         return *hpage;
>  }
>  #else
> -static int khugepaged_find_target_node(void)
> +static int khugepaged_find_target_node(struct collapse_control *cc)
>  {
>         return 0;
>  }
> @@ -1043,7 +1061,8 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
>  static void collapse_huge_page(struct mm_struct *mm,
>                                    unsigned long address,
>                                    struct page **hpage,
> -                                  int node, int referenced, int unmapped)
> +                                  int node, int referenced, int unmapped,
> +                                  int enforce_pte_scan_limits)
>  {
>         LIST_HEAD(compound_pagelist);
>         pmd_t *pmd, _pmd;
> @@ -1141,7 +1160,7 @@ static void collapse_huge_page(struct mm_struct *mm,
>
>         spin_lock(pte_ptl);
>         isolated = __collapse_huge_page_isolate(vma, address, pte,
> -                       &compound_pagelist);
> +                       enforce_pte_scan_limits, &compound_pagelist);
>         spin_unlock(pte_ptl);
>
>         if (unlikely(!isolated)) {
> @@ -1206,7 +1225,8 @@ static void collapse_huge_page(struct mm_struct *mm,
>  static int khugepaged_scan_pmd(struct mm_struct *mm,
>                                struct vm_area_struct *vma,
>                                unsigned long address,
> -                              struct page **hpage)
> +                              struct page **hpage,
> +                              struct collapse_control *cc)
>  {
>         pmd_t *pmd;
>         pte_t *pte, *_pte;
> @@ -1226,13 +1246,14 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>                 goto out;
>         }
>
> -       memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
> +       memset(cc->node_load, 0, sizeof(cc->node_load));
>         pte = pte_offset_map_lock(mm, pmd, address, &ptl);
>         for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
>              _pte++, _address += PAGE_SIZE) {
>                 pte_t pteval = *_pte;
>                 if (is_swap_pte(pteval)) {
> -                       if (++unmapped <= khugepaged_max_ptes_swap) {
> +                       if (++unmapped <= khugepaged_max_ptes_swap ||
> +                           !cc->enforce_pte_scan_limits) {
>                                 /*
>                                  * Always be strict with uffd-wp
>                                  * enabled swap entries.  Please see
> @@ -1251,7 +1272,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>                 }
>                 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
>                         if (!userfaultfd_armed(vma) &&
> -                           ++none_or_zero <= khugepaged_max_ptes_none) {
> +                           (++none_or_zero <= khugepaged_max_ptes_none ||
> +                            !cc->enforce_pte_scan_limits)) {
>                                 continue;
>                         } else {
>                                 result = SCAN_EXCEED_NONE_PTE;
> @@ -1282,7 +1304,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>                 }
>
>                 if (page_mapcount(page) > 1 &&
> -                               ++shared > khugepaged_max_ptes_shared) {
> +                               ++shared > khugepaged_max_ptes_shared &&
> +                               cc->enforce_pte_scan_limits) {
>                         result = SCAN_EXCEED_SHARED_PTE;
>                         count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
>                         goto out_unmap;
> @@ -1292,16 +1315,16 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>
>                 /*
>                  * Record which node the original page is from and save this
> -                * information to khugepaged_node_load[].
> +                * information to cc->node_load[].
>                  * Khugepaged will allocate hugepage from the node has the max
>                  * hit record.
>                  */
>                 node = page_to_nid(page);
> -               if (khugepaged_scan_abort(node)) {
> +               if (khugepaged_scan_abort(node, cc)) {
>                         result = SCAN_SCAN_ABORT;
>                         goto out_unmap;
>                 }
> -               khugepaged_node_load[node]++;
> +               cc->node_load[node]++;
>                 if (!PageLRU(page)) {
>                         result = SCAN_PAGE_LRU;
>                         goto out_unmap;
> @@ -1352,10 +1375,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>  out_unmap:
>         pte_unmap_unlock(pte, ptl);
>         if (ret) {
> -               node = khugepaged_find_target_node();
> +               node = khugepaged_find_target_node(cc);
>                 /* collapse_huge_page will return with the mmap_lock released */
>                 collapse_huge_page(mm, address, hpage, node,
> -                               referenced, unmapped);
> +                               referenced, unmapped,
> +                               cc->enforce_pte_scan_limits);
>         }
>  out:
>         trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
> @@ -1992,7 +2016,8 @@ static void collapse_file(struct mm_struct *mm,
>  }
>
>  static void khugepaged_scan_file(struct mm_struct *mm,
> -               struct file *file, pgoff_t start, struct page **hpage)
> +               struct file *file, pgoff_t start, struct page **hpage,
> +               struct collapse_control *cc)
>  {
>         struct page *page = NULL;
>         struct address_space *mapping = file->f_mapping;
> @@ -2003,14 +2028,15 @@ static void khugepaged_scan_file(struct mm_struct *mm,
>
>         present = 0;
>         swap = 0;
> -       memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
> +       memset(cc->node_load, 0, sizeof(cc->node_load));
>         rcu_read_lock();
>         xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
>                 if (xas_retry(&xas, page))
>                         continue;
>
>                 if (xa_is_value(page)) {
> -                       if (++swap > khugepaged_max_ptes_swap) {
> +                       if (cc->enforce_pte_scan_limits &&
> +                           ++swap > khugepaged_max_ptes_swap) {
>                                 result = SCAN_EXCEED_SWAP_PTE;
>                                 count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
>                                 break;
> @@ -2028,11 +2054,11 @@ static void khugepaged_scan_file(struct mm_struct *mm,
>                 }
>
>                 node = page_to_nid(page);
> -               if (khugepaged_scan_abort(node)) {
> +               if (khugepaged_scan_abort(node, cc)) {
>                         result = SCAN_SCAN_ABORT;
>                         break;
>                 }
> -               khugepaged_node_load[node]++;
> +               cc->node_load[node]++;
>
>                 if (!PageLRU(page)) {
>                         result = SCAN_PAGE_LRU;
> @@ -2061,11 +2087,12 @@ static void khugepaged_scan_file(struct mm_struct *mm,
>         rcu_read_unlock();
>
>         if (result == SCAN_SUCCEED) {
> -               if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
> +               if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none &&
> +                   cc->enforce_pte_scan_limits) {
>                         result = SCAN_EXCEED_NONE_PTE;
>                         count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
>                 } else {
> -                       node = khugepaged_find_target_node();
> +                       node = khugepaged_find_target_node(cc);
>                         collapse_file(mm, file, start, hpage, node);
>                 }
>         }
> @@ -2074,7 +2101,8 @@ static void khugepaged_scan_file(struct mm_struct *mm,
>  }
>  #else
>  static void khugepaged_scan_file(struct mm_struct *mm,
> -               struct file *file, pgoff_t start, struct page **hpage)
> +               struct file *file, pgoff_t start, struct page **hpage,
> +               struct collapse_control *cc)
>  {
>         BUILD_BUG();
>  }
> @@ -2085,7 +2113,8 @@ static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
>  #endif
>
>  static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
> -                                           struct page **hpage)
> +                                           struct page **hpage,
> +                                           struct collapse_control *cc)
>         __releases(&khugepaged_mm_lock)
>         __acquires(&khugepaged_mm_lock)
>  {
> @@ -2161,12 +2190,12 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
>
>                                 mmap_read_unlock(mm);
>                                 ret = 1;
> -                               khugepaged_scan_file(mm, file, pgoff, hpage);
> +                               khugepaged_scan_file(mm, file, pgoff, hpage, cc);
>                                 fput(file);
>                         } else {
>                                 ret = khugepaged_scan_pmd(mm, vma,
>                                                 khugepaged_scan.address,
> -                                               hpage);
> +                                               hpage, cc);
>                         }
>                         /* move to next address */
>                         khugepaged_scan.address += HPAGE_PMD_SIZE;
> @@ -2222,7 +2251,7 @@ static int khugepaged_wait_event(void)
>                 kthread_should_stop();
>  }
>
> -static void khugepaged_do_scan(void)
> +static void khugepaged_do_scan(struct collapse_control *cc)
>  {
>         struct page *hpage = NULL;
>         unsigned int progress = 0, pass_through_head = 0;
> @@ -2246,7 +2275,7 @@ static void khugepaged_do_scan(void)
>                 if (khugepaged_has_work() &&
>                     pass_through_head < 2)
>                         progress += khugepaged_scan_mm_slot(pages - progress,
> -                                                           &hpage);
> +                                                           &hpage, cc);
>                 else
>                         progress = pages;
>                 spin_unlock(&khugepaged_mm_lock);
> @@ -2285,12 +2314,15 @@ static void khugepaged_wait_work(void)
>  static int khugepaged(void *none)
>  {
>         struct mm_slot *mm_slot;
> +       struct collapse_control cc;
> +
> +       collapse_control_init(&cc, /* enforce_pte_scan_limits= */ 1);
>
>         set_freezable();
>         set_user_nice(current, MAX_NICE);
>
>         while (!kthread_should_stop()) {
> -               khugepaged_do_scan();
> +               khugepaged_do_scan(&cc);
>                 khugepaged_wait_work();
>         }
>
> --
> 2.35.1.616.g0bdcbb4464-goog
>

Patch

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a4e5eaf3eb01..36fc0099c445 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -85,6 +85,24 @@  static struct kmem_cache *mm_slot_cache __read_mostly;
 
 #define MAX_PTE_MAPPED_THP 8
 
+struct collapse_control {
+	/* Respect khugepaged_max_ptes_[none|swap|shared] */
+	bool enforce_pte_scan_limits;
+
+	/* Num pages scanned per node */
+	int node_load[MAX_NUMNODES];
+
+	/* Last target selected in khugepaged_find_target_node() for this scan */
+	int last_target_node;
+};
+
+static void collapse_control_init(struct collapse_control *cc,
+				  bool enforce_pte_scan_limits)
+{
+	cc->enforce_pte_scan_limits = enforce_pte_scan_limits;
+	cc->last_target_node = NUMA_NO_NODE;
+}
+
 /**
  * struct mm_slot - hash lookup from mm to mm_slot
  * @hash: hash collision list
@@ -601,6 +619,7 @@  static bool is_refcount_suitable(struct page *page)
 static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 					unsigned long address,
 					pte_t *pte,
+					bool enforce_pte_scan_limits,
 					struct list_head *compound_pagelist)
 {
 	struct page *page = NULL;
@@ -614,7 +633,8 @@  static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		if (pte_none(pteval) || (pte_present(pteval) &&
 				is_zero_pfn(pte_pfn(pteval)))) {
 			if (!userfaultfd_armed(vma) &&
-			    ++none_or_zero <= khugepaged_max_ptes_none) {
+			    (++none_or_zero <= khugepaged_max_ptes_none ||
+			     !enforce_pte_scan_limits)) {
 				continue;
 			} else {
 				result = SCAN_EXCEED_NONE_PTE;
@@ -634,8 +654,8 @@  static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 
 		VM_BUG_ON_PAGE(!PageAnon(page), page);
 
-		if (page_mapcount(page) > 1 &&
-				++shared > khugepaged_max_ptes_shared) {
+		if (page_mapcount(page) > 1 && enforce_pte_scan_limits &&
+		    ++shared > khugepaged_max_ptes_shared) {
 			result = SCAN_EXCEED_SHARED_PTE;
 			count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
 			goto out;
@@ -785,9 +805,7 @@  static void khugepaged_alloc_sleep(void)
 	remove_wait_queue(&khugepaged_wait, &wait);
 }
 
-static int khugepaged_node_load[MAX_NUMNODES];
-
-static bool khugepaged_scan_abort(int nid)
+static bool khugepaged_scan_abort(int nid, struct collapse_control *cc)
 {
 	int i;
 
@@ -799,11 +817,11 @@  static bool khugepaged_scan_abort(int nid)
 		return false;
 
 	/* If there is a count for this node already, it must be acceptable */
-	if (khugepaged_node_load[nid])
+	if (cc->node_load[nid])
 		return false;
 
 	for (i = 0; i < MAX_NUMNODES; i++) {
-		if (!khugepaged_node_load[i])
+		if (!cc->node_load[i])
 			continue;
 		if (node_distance(nid, i) > node_reclaim_distance)
 			return true;
@@ -818,28 +836,28 @@  static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
 }
 
 #ifdef CONFIG_NUMA
-static int khugepaged_find_target_node(void)
+static int khugepaged_find_target_node(struct collapse_control *cc)
 {
-	static int last_khugepaged_target_node = NUMA_NO_NODE;
 	int nid, target_node = 0, max_value = 0;
 
 	/* find first node with max normal pages hit */
 	for (nid = 0; nid < MAX_NUMNODES; nid++)
-		if (khugepaged_node_load[nid] > max_value) {
-			max_value = khugepaged_node_load[nid];
+		if (cc->node_load[nid] > max_value) {
+			max_value = cc->node_load[nid];
 			target_node = nid;
 		}
 
 	/* do some balance if several nodes have the same hit record */
-	if (target_node <= last_khugepaged_target_node)
-		for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
-				nid++)
-			if (max_value == khugepaged_node_load[nid]) {
+	if (target_node <= cc->last_target_node)
+		for (nid = cc->last_target_node + 1; nid < MAX_NUMNODES;
+		     nid++) {
+			if (max_value == cc->node_load[nid]) {
 				target_node = nid;
 				break;
 			}
+		}
 
-	last_khugepaged_target_node = target_node;
+	cc->last_target_node = target_node;
 	return target_node;
 }
 
@@ -877,7 +895,7 @@  khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
 	return *hpage;
 }
 #else
-static int khugepaged_find_target_node(void)
+static int khugepaged_find_target_node(struct collapse_control *cc)
 {
 	return 0;
 }
@@ -1043,7 +1061,8 @@  static bool __collapse_huge_page_swapin(struct mm_struct *mm,
 static void collapse_huge_page(struct mm_struct *mm,
 				   unsigned long address,
 				   struct page **hpage,
-				   int node, int referenced, int unmapped)
+				   int node, int referenced, int unmapped,
+				   int enforce_pte_scan_limits)
 {
 	LIST_HEAD(compound_pagelist);
 	pmd_t *pmd, _pmd;
@@ -1141,7 +1160,7 @@  static void collapse_huge_page(struct mm_struct *mm,
 
 	spin_lock(pte_ptl);
 	isolated = __collapse_huge_page_isolate(vma, address, pte,
-			&compound_pagelist);
+			enforce_pte_scan_limits, &compound_pagelist);
 	spin_unlock(pte_ptl);
 
 	if (unlikely(!isolated)) {
@@ -1206,7 +1225,8 @@  static void collapse_huge_page(struct mm_struct *mm,
 static int khugepaged_scan_pmd(struct mm_struct *mm,
 			       struct vm_area_struct *vma,
 			       unsigned long address,
-			       struct page **hpage)
+			       struct page **hpage,
+			       struct collapse_control *cc)
 {
 	pmd_t *pmd;
 	pte_t *pte, *_pte;
@@ -1226,13 +1246,14 @@  static int khugepaged_scan_pmd(struct mm_struct *mm,
 		goto out;
 	}
 
-	memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
+	memset(cc->node_load, 0, sizeof(cc->node_load));
 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 	for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
 	     _pte++, _address += PAGE_SIZE) {
 		pte_t pteval = *_pte;
 		if (is_swap_pte(pteval)) {
-			if (++unmapped <= khugepaged_max_ptes_swap) {
+			if (++unmapped <= khugepaged_max_ptes_swap ||
+			    !cc->enforce_pte_scan_limits) {
 				/*
 				 * Always be strict with uffd-wp
 				 * enabled swap entries.  Please see
@@ -1251,7 +1272,8 @@  static int khugepaged_scan_pmd(struct mm_struct *mm,
 		}
 		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
 			if (!userfaultfd_armed(vma) &&
-			    ++none_or_zero <= khugepaged_max_ptes_none) {
+			    (++none_or_zero <= khugepaged_max_ptes_none ||
+			     !cc->enforce_pte_scan_limits)) {
 				continue;
 			} else {
 				result = SCAN_EXCEED_NONE_PTE;
@@ -1282,7 +1304,8 @@  static int khugepaged_scan_pmd(struct mm_struct *mm,
 		}
 
 		if (page_mapcount(page) > 1 &&
-				++shared > khugepaged_max_ptes_shared) {
+				++shared > khugepaged_max_ptes_shared &&
+				cc->enforce_pte_scan_limits) {
 			result = SCAN_EXCEED_SHARED_PTE;
 			count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
 			goto out_unmap;
@@ -1292,16 +1315,16 @@  static int khugepaged_scan_pmd(struct mm_struct *mm,
 
 		/*
 		 * Record which node the original page is from and save this
-		 * information to khugepaged_node_load[].
+		 * information to cc->node_load[].
 		 * Khugepaged will allocate hugepage from the node has the max
 		 * hit record.
 		 */
 		node = page_to_nid(page);
-		if (khugepaged_scan_abort(node)) {
+		if (khugepaged_scan_abort(node, cc)) {
 			result = SCAN_SCAN_ABORT;
 			goto out_unmap;
 		}
-		khugepaged_node_load[node]++;
+		cc->node_load[node]++;
 		if (!PageLRU(page)) {
 			result = SCAN_PAGE_LRU;
 			goto out_unmap;
@@ -1352,10 +1375,11 @@  static int khugepaged_scan_pmd(struct mm_struct *mm,
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
 	if (ret) {
-		node = khugepaged_find_target_node();
+		node = khugepaged_find_target_node(cc);
 		/* collapse_huge_page will return with the mmap_lock released */
 		collapse_huge_page(mm, address, hpage, node,
-				referenced, unmapped);
+				referenced, unmapped,
+				cc->enforce_pte_scan_limits);
 	}
 out:
 	trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
@@ -1992,7 +2016,8 @@  static void collapse_file(struct mm_struct *mm,
 }
 
 static void khugepaged_scan_file(struct mm_struct *mm,
-		struct file *file, pgoff_t start, struct page **hpage)
+		struct file *file, pgoff_t start, struct page **hpage,
+		struct collapse_control *cc)
 {
 	struct page *page = NULL;
 	struct address_space *mapping = file->f_mapping;
@@ -2003,14 +2028,15 @@  static void khugepaged_scan_file(struct mm_struct *mm,
 
 	present = 0;
 	swap = 0;
-	memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
+	memset(cc->node_load, 0, sizeof(cc->node_load));
 	rcu_read_lock();
 	xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
 		if (xas_retry(&xas, page))
 			continue;
 
 		if (xa_is_value(page)) {
-			if (++swap > khugepaged_max_ptes_swap) {
+			if (cc->enforce_pte_scan_limits &&
+			    ++swap > khugepaged_max_ptes_swap) {
 				result = SCAN_EXCEED_SWAP_PTE;
 				count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
 				break;
@@ -2028,11 +2054,11 @@  static void khugepaged_scan_file(struct mm_struct *mm,
 		}
 
 		node = page_to_nid(page);
-		if (khugepaged_scan_abort(node)) {
+		if (khugepaged_scan_abort(node, cc)) {
 			result = SCAN_SCAN_ABORT;
 			break;
 		}
-		khugepaged_node_load[node]++;
+		cc->node_load[node]++;
 
 		if (!PageLRU(page)) {
 			result = SCAN_PAGE_LRU;
@@ -2061,11 +2087,12 @@  static void khugepaged_scan_file(struct mm_struct *mm,
 	rcu_read_unlock();
 
 	if (result == SCAN_SUCCEED) {
-		if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
+		if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none &&
+		    cc->enforce_pte_scan_limits) {
 			result = SCAN_EXCEED_NONE_PTE;
 			count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
 		} else {
-			node = khugepaged_find_target_node();
+			node = khugepaged_find_target_node(cc);
 			collapse_file(mm, file, start, hpage, node);
 		}
 	}
@@ -2074,7 +2101,8 @@  static void khugepaged_scan_file(struct mm_struct *mm,
 }
 #else
 static void khugepaged_scan_file(struct mm_struct *mm,
-		struct file *file, pgoff_t start, struct page **hpage)
+		struct file *file, pgoff_t start, struct page **hpage,
+		struct collapse_control *cc)
 {
 	BUILD_BUG();
 }
@@ -2085,7 +2113,8 @@  static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
 #endif
 
 static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
-					    struct page **hpage)
+					    struct page **hpage,
+					    struct collapse_control *cc)
 	__releases(&khugepaged_mm_lock)
 	__acquires(&khugepaged_mm_lock)
 {
@@ -2161,12 +2190,12 @@  static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
 
 				mmap_read_unlock(mm);
 				ret = 1;
-				khugepaged_scan_file(mm, file, pgoff, hpage);
+				khugepaged_scan_file(mm, file, pgoff, hpage, cc);
 				fput(file);
 			} else {
 				ret = khugepaged_scan_pmd(mm, vma,
 						khugepaged_scan.address,
-						hpage);
+						hpage, cc);
 			}
 			/* move to next address */
 			khugepaged_scan.address += HPAGE_PMD_SIZE;
@@ -2222,7 +2251,7 @@  static int khugepaged_wait_event(void)
 		kthread_should_stop();
 }
 
-static void khugepaged_do_scan(void)
+static void khugepaged_do_scan(struct collapse_control *cc)
 {
 	struct page *hpage = NULL;
 	unsigned int progress = 0, pass_through_head = 0;
@@ -2246,7 +2275,7 @@  static void khugepaged_do_scan(void)
 		if (khugepaged_has_work() &&
 		    pass_through_head < 2)
 			progress += khugepaged_scan_mm_slot(pages - progress,
-							    &hpage);
+							    &hpage, cc);
 		else
 			progress = pages;
 		spin_unlock(&khugepaged_mm_lock);
@@ -2285,12 +2314,15 @@  static void khugepaged_wait_work(void)
 static int khugepaged(void *none)
 {
 	struct mm_slot *mm_slot;
+	struct collapse_control cc;
+
+	collapse_control_init(&cc, /* enforce_pte_scan_limits= */ 1);
 
 	set_freezable();
 	set_user_nice(current, MAX_NICE);
 
 	while (!kthread_should_stop()) {
-		khugepaged_do_scan();
+		khugepaged_do_scan(&cc);
 		khugepaged_wait_work();
 	}