
[v6,07/15] mm/khugepaged: add flag to ignore khugepaged heuristics

Message ID 20220604004004.954674-8-zokeefe@google.com (mailing list archive)
State New
Series mm: userspace hugepage collapse

Commit Message

Zach O'Keefe June 4, 2022, 12:39 a.m. UTC
Add an enforce_page_heuristics flag to struct collapse_control that
allows a collapse context to ignore the heuristics originally designed
to guide khugepaged:

1) sysfs-controlled knobs khugepaged_max_ptes_[none|swap|shared]
2) requirement that some pages in region being collapsed be young or
   referenced

This flag is set in the khugepaged collapse context to preserve
existing khugepaged behavior.

This flag will be left unset when the madvise collapse context is
introduced: there, the user presumably has reason to believe the
collapse will be beneficial, and khugepaged's heuristics shouldn't tell
the user they are wrong.
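
As a rough illustration (not part of this patch), a caller that wants to
bypass the heuristics would simply leave the flag clear when setting up
its collapse_control. The field names below are the ones used by this
series; the context itself is hypothetical:

	/*
	 * Sketch of a hypothetical non-khugepaged context (e.g. a future
	 * madvise-driven collapse): the caller vouches for the region, so
	 * the khugepaged heuristics are not enforced.
	 */
	struct collapse_control cc = {
		.enforce_page_heuristics = false,
		.last_target_node = NUMA_NO_NODE,
		/* .gfp set later */
	};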

Signed-off-by: Zach O'Keefe <zokeefe@google.com>
---
 mm/khugepaged.c | 55 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 37 insertions(+), 18 deletions(-)

Comments

Yang Shi June 6, 2022, 10:51 p.m. UTC | #1
On Fri, Jun 3, 2022 at 5:40 PM Zach O'Keefe <zokeefe@google.com> wrote:
>
> Add an enforce_page_heuristics flag to struct collapse_control that
> allows a collapse context to ignore the heuristics originally designed
> to guide khugepaged:
>
> 1) sysfs-controlled knobs khugepaged_max_ptes_[none|swap|shared]
> 2) requirement that some pages in region being collapsed be young or
>    referenced
>
> This flag is set in the khugepaged collapse context to preserve
> existing khugepaged behavior.
>
> This flag will be left unset when the madvise collapse context is
> introduced: there, the user presumably has reason to believe the
> collapse will be beneficial, and khugepaged's heuristics shouldn't tell
> the user they are wrong.
>
> Signed-off-by: Zach O'Keefe <zokeefe@google.com>

Reviewed-by: Yang Shi <shy828301@gmail.com>


Patch

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 03e0da0008f1..c3589b3e238d 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -87,6 +87,13 @@  static struct kmem_cache *mm_slot_cache __read_mostly;
 #define MAX_PTE_MAPPED_THP 8
 
 struct collapse_control {
+	/*
+	 * Heuristics:
+	 * - khugepaged_max_ptes_[none|swap|shared]
+	 * - require memory to be young / referenced
+	 */
+	bool enforce_page_heuristics;
+
 	/* Num pages scanned per node */
 	int node_load[MAX_NUMNODES];
 
@@ -604,6 +611,7 @@  static bool is_refcount_suitable(struct page *page)
 static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 					unsigned long address,
 					pte_t *pte,
+					struct collapse_control *cc,
 					struct list_head *compound_pagelist)
 {
 	struct page *page = NULL;
@@ -617,7 +625,8 @@  static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		if (pte_none(pteval) || (pte_present(pteval) &&
 				is_zero_pfn(pte_pfn(pteval)))) {
 			if (!userfaultfd_armed(vma) &&
-			    ++none_or_zero <= khugepaged_max_ptes_none) {
+			    (++none_or_zero <= khugepaged_max_ptes_none ||
+			     !cc->enforce_page_heuristics)) {
 				continue;
 			} else {
 				result = SCAN_EXCEED_NONE_PTE;
@@ -637,8 +646,8 @@  static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 
 		VM_BUG_ON_PAGE(!PageAnon(page), page);
 
-		if (page_mapcount(page) > 1 &&
-				++shared > khugepaged_max_ptes_shared) {
+		if (cc->enforce_page_heuristics && page_mapcount(page) > 1 &&
+		    ++shared > khugepaged_max_ptes_shared) {
 			result = SCAN_EXCEED_SHARED_PTE;
 			count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
 			goto out;
@@ -705,9 +714,10 @@  static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 			list_add_tail(&page->lru, compound_pagelist);
 next:
 		/* There should be enough young pte to collapse the page */
-		if (pte_young(pteval) ||
-		    page_is_young(page) || PageReferenced(page) ||
-		    mmu_notifier_test_young(vma->vm_mm, address))
+		if (cc->enforce_page_heuristics &&
+		    (pte_young(pteval) || page_is_young(page) ||
+		     PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
+								     address)))
 			referenced++;
 
 		if (pte_write(pteval))
@@ -716,7 +726,7 @@  static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 
 	if (unlikely(!writable)) {
 		result = SCAN_PAGE_RO;
-	} else if (unlikely(!referenced)) {
+	} else if (unlikely(cc->enforce_page_heuristics && !referenced)) {
 		result = SCAN_LACK_REFERENCED_PAGE;
 	} else {
 		result = SCAN_SUCCEED;
@@ -1096,7 +1106,7 @@  static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	mmu_notifier_invalidate_range_end(&range);
 
 	spin_lock(pte_ptl);
-	result =  __collapse_huge_page_isolate(vma, address, pte,
+	result =  __collapse_huge_page_isolate(vma, address, pte, cc,
 					       &compound_pagelist);
 	spin_unlock(pte_ptl);
 
@@ -1185,7 +1195,8 @@  static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
 	     _pte++, _address += PAGE_SIZE) {
 		pte_t pteval = *_pte;
 		if (is_swap_pte(pteval)) {
-			if (++unmapped <= khugepaged_max_ptes_swap) {
+			if (++unmapped <= khugepaged_max_ptes_swap ||
+			    !cc->enforce_page_heuristics) {
 				/*
 				 * Always be strict with uffd-wp
 				 * enabled swap entries.  Please see
@@ -1204,7 +1215,8 @@  static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
 		}
 		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
 			if (!userfaultfd_armed(vma) &&
-			    ++none_or_zero <= khugepaged_max_ptes_none) {
+			    (++none_or_zero <= khugepaged_max_ptes_none ||
+			     !cc->enforce_page_heuristics)) {
 				continue;
 			} else {
 				result = SCAN_EXCEED_NONE_PTE;
@@ -1234,8 +1246,9 @@  static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
 			goto out_unmap;
 		}
 
-		if (page_mapcount(page) > 1 &&
-				++shared > khugepaged_max_ptes_shared) {
+		if (cc->enforce_page_heuristics &&
+		    page_mapcount(page) > 1 &&
+		    ++shared > khugepaged_max_ptes_shared) {
 			result = SCAN_EXCEED_SHARED_PTE;
 			count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
 			goto out_unmap;
@@ -1289,14 +1302,17 @@  static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
 			result = SCAN_PAGE_COUNT;
 			goto out_unmap;
 		}
-		if (pte_young(pteval) ||
-		    page_is_young(page) || PageReferenced(page) ||
-		    mmu_notifier_test_young(vma->vm_mm, address))
+		if (cc->enforce_page_heuristics &&
+		    (pte_young(pteval) || page_is_young(page) ||
+		     PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
+								     address)))
 			referenced++;
 	}
 	if (!writable) {
 		result = SCAN_PAGE_RO;
-	} else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) {
+	} else if (cc->enforce_page_heuristics &&
+		   (!referenced ||
+		    (unmapped && referenced < HPAGE_PMD_NR / 2))) {
 		result = SCAN_LACK_REFERENCED_PAGE;
 	} else {
 		result = SCAN_SUCCEED;
@@ -1966,7 +1982,8 @@  static int khugepaged_scan_file(struct mm_struct *mm, struct file *file,
 			continue;
 
 		if (xa_is_value(page)) {
-			if (++swap > khugepaged_max_ptes_swap) {
+			if (cc->enforce_page_heuristics &&
+			    ++swap > khugepaged_max_ptes_swap) {
 				result = SCAN_EXCEED_SWAP_PTE;
 				count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
 				break;
@@ -2017,7 +2034,8 @@  static int khugepaged_scan_file(struct mm_struct *mm, struct file *file,
 	rcu_read_unlock();
 
 	if (result == SCAN_SUCCEED) {
-		if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
+		if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none &&
+		    cc->enforce_page_heuristics) {
 			result = SCAN_EXCEED_NONE_PTE;
 			count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
 		} else {
@@ -2258,6 +2276,7 @@  static int khugepaged(void *none)
 {
 	struct mm_slot *mm_slot;
 	struct collapse_control cc = {
+		.enforce_page_heuristics = true,
 		.last_target_node = NUMA_NO_NODE,
 		/* .gfp set later  */
 	};
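
The scan-side checks above all reduce to the same pattern: a khugepaged
limit only applies when enforce_page_heuristics is set. Purely for
illustration (the userfaultfd_armed() special case aside, and not a
helper this series actually factors out), the none/zero-pte test is
equivalent to:

	/*
	 * Illustration only: mirrors the limit check added to
	 * __collapse_huge_page_isolate() and khugepaged_scan_pmd().
	 * Contexts that clear the flag never trip this limit.
	 */
	static inline bool none_pte_limit_exceeded(struct collapse_control *cc,
						   int none_or_zero)
	{
		return cc->enforce_page_heuristics &&
		       none_or_zero > khugepaged_max_ptes_none;
	}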