
[v3,07/12] mm/khugepaged: add flag to ignore khugepaged_max_ptes_*

Message ID 20220426144412.742113-8-zokeefe@google.com
State New
Series mm: userspace hugepage collapse

Commit Message

Zach O'Keefe April 26, 2022, 2:44 p.m. UTC
Add an enforce_pte_scan_limits flag to struct collapse_control that
allows a collapse context to ignore the sysfs-controlled knobs
khugepaged_max_ptes_[none|swap|shared].  Set the flag in the khugepaged
collapse context to preserve existing khugepaged behavior, and clear it
in the madvise collapse context, since the user presumably has reason
to believe the collapse will be beneficial.

Signed-off-by: Zach O'Keefe <zokeefe@google.com>
---
 mm/khugepaged.c | 32 ++++++++++++++++++++++----------
 1 file changed, 22 insertions(+), 10 deletions(-)
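
For readers skimming the diff below, here is a minimal standalone sketch
of the gating pattern this patch introduces.  The *_example names and the
standalone types are made up for illustration; the real code lives in
mm/khugepaged.c as shown in the diff at the end of this page.

#include <stdbool.h>

/* Simplified stand-in for the new field in struct collapse_control. */
struct collapse_control_example {
	/* Respect khugepaged_max_ptes_[none|swap|shared] when true. */
	bool enforce_pte_scan_limits;
};

/*
 * Mirrors the check pattern used in __collapse_huge_page_isolate() and
 * scan_pmd(): a per-hugepage counter only fails the scan when the
 * context asked for the sysfs limits to be enforced.
 */
static bool example_within_limit(const struct collapse_control_example *cc,
				 int counter, int sysfs_limit)
{
	return counter <= sysfs_limit || !cc->enforce_pte_scan_limits;
}

khugepaged initializes the flag to true and madvise_collapse() initializes
it to false, as the last two hunks of the diff show.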

Comments

Peter Xu April 27, 2022, 9:12 p.m. UTC | #1
On Tue, Apr 26, 2022 at 07:44:07AM -0700, Zach O'Keefe wrote:
> @@ -2365,6 +2375,7 @@ static int khugepaged(void *none)
>  {
>  	struct mm_slot *mm_slot;
>  	struct collapse_control cc = {
> +		.enforce_pte_scan_limits = true,
>  		.last_target_node = NUMA_NO_NODE,
>  		.gfp = &alloc_hugepage_khugepaged_gfpmask,
>  		.alloc_hpage = &khugepaged_alloc_page,
> @@ -2512,6 +2523,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
>  		     unsigned long start, unsigned long end)
>  {
>  	struct collapse_control cc = {
> +		.enforce_pte_scan_limits = false,
>  		.last_target_node = NUMA_NO_NODE,
>  		.hpage = NULL,
>  		.gfp = &alloc_hugepage_madvise_gfpmask,

This changes the semantics of the new madvise().  IMHO it'll be ideal if
this patch is moved before the introduction of MADV_COLLAPSE, so the new
madvise() will have a consistent behavior.
Zach O'Keefe April 29, 2022, 2:26 p.m. UTC | #2
On Wed, Apr 27, 2022 at 2:13 PM Peter Xu <peterx@redhat.com> wrote:
>
> On Tue, Apr 26, 2022 at 07:44:07AM -0700, Zach O'Keefe wrote:
> > @@ -2365,6 +2375,7 @@ static int khugepaged(void *none)
> >  {
> >       struct mm_slot *mm_slot;
> >       struct collapse_control cc = {
> > +             .enforce_pte_scan_limits = true,
> >               .last_target_node = NUMA_NO_NODE,
> >               .gfp = &alloc_hugepage_khugepaged_gfpmask,
> >               .alloc_hpage = &khugepaged_alloc_page,
> > @@ -2512,6 +2523,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
> >                    unsigned long start, unsigned long end)
> >  {
> >       struct collapse_control cc = {
> > +             .enforce_pte_scan_limits = false,
> >               .last_target_node = NUMA_NO_NODE,
> >               .hpage = NULL,
> >               .gfp = &alloc_hugepage_madvise_gfpmask,
>
> This changes the semantics of the new madvise().  IMHO it'll be ideal if
> this patch is moved before the introduction of MADV_COLLAPSE, so the new
> madvise() will have a consistent behavior.
>

That makes sense to me, as this was how it was done in the RFC. I'll
assume this applies equally well to "mm/khugepaged: add flag to ignore
page young/referenced requirement" and move both before introducing
the new madvise(2) mode.

Thanks again Peter,
Zach

> --
> Peter Xu
>

Patch

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a6881f5b3c67..57725482290d 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -87,6 +87,9 @@  static struct kmem_cache *mm_slot_cache __read_mostly;
 #define MAX_PTE_MAPPED_THP 8
 
 struct collapse_control {
+	/* Respect khugepaged_max_ptes_[none|swap|shared] */
+	bool enforce_pte_scan_limits;
+
 	/* Num pages scanned per node */
 	int node_load[MAX_NUMNODES];
 
@@ -632,6 +635,7 @@  static bool is_refcount_suitable(struct page *page)
 static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 					unsigned long address,
 					pte_t *pte,
+					struct collapse_control *cc,
 					struct list_head *compound_pagelist)
 {
 	struct page *page = NULL;
@@ -645,7 +649,8 @@  static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		if (pte_none(pteval) || (pte_present(pteval) &&
 				is_zero_pfn(pte_pfn(pteval)))) {
 			if (!userfaultfd_armed(vma) &&
-			    ++none_or_zero <= khugepaged_max_ptes_none) {
+			    (++none_or_zero <= khugepaged_max_ptes_none ||
+			     !cc->enforce_pte_scan_limits)) {
 				continue;
 			} else {
 				result = SCAN_EXCEED_NONE_PTE;
@@ -665,8 +670,8 @@  static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 
 		VM_BUG_ON_PAGE(!PageAnon(page), page);
 
-		if (page_mapcount(page) > 1 &&
-				++shared > khugepaged_max_ptes_shared) {
+		if (cc->enforce_pte_scan_limits && page_mapcount(page) > 1 &&
+		    ++shared > khugepaged_max_ptes_shared) {
 			result = SCAN_EXCEED_SHARED_PTE;
 			count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
 			goto out;
@@ -1208,7 +1213,7 @@  static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	mmu_notifier_invalidate_range_end(&range);
 
 	spin_lock(pte_ptl);
-	cr->result =  __collapse_huge_page_isolate(vma, address, pte,
+	cr->result =  __collapse_huge_page_isolate(vma, address, pte, cc,
 						   &compound_pagelist);
 	spin_unlock(pte_ptl);
 
@@ -1297,7 +1302,8 @@  static void scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
 	     _pte++, _address += PAGE_SIZE) {
 		pte_t pteval = *_pte;
 		if (is_swap_pte(pteval)) {
-			if (++unmapped <= khugepaged_max_ptes_swap) {
+			if (++unmapped <= khugepaged_max_ptes_swap ||
+			    !cc->enforce_pte_scan_limits) {
 				/*
 				 * Always be strict with uffd-wp
 				 * enabled swap entries.  Please see
@@ -1316,7 +1322,8 @@  static void scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
 		}
 		if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
 			if (!userfaultfd_armed(vma) &&
-			    ++none_or_zero <= khugepaged_max_ptes_none) {
+			    (++none_or_zero <= khugepaged_max_ptes_none ||
+			     !cc->enforce_pte_scan_limits)) {
 				continue;
 			} else {
 				cr->result = SCAN_EXCEED_NONE_PTE;
@@ -1346,8 +1353,9 @@  static void scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
 			goto out_unmap;
 		}
 
-		if (page_mapcount(page) > 1 &&
-				++shared > khugepaged_max_ptes_shared) {
+		if (cc->enforce_pte_scan_limits &&
+		    page_mapcount(page) > 1 &&
+		    ++shared > khugepaged_max_ptes_shared) {
 			cr->result = SCAN_EXCEED_SHARED_PTE;
 			count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
 			goto out_unmap;
@@ -2087,7 +2095,8 @@  static void khugepaged_scan_file(struct mm_struct *mm,
 			continue;
 
 		if (xa_is_value(page)) {
-			if (++swap > khugepaged_max_ptes_swap) {
+			if (cc->enforce_pte_scan_limits &&
+			    ++swap > khugepaged_max_ptes_swap) {
 				cr->result = SCAN_EXCEED_SWAP_PTE;
 				count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
 				break;
@@ -2138,7 +2147,8 @@  static void khugepaged_scan_file(struct mm_struct *mm,
 	rcu_read_unlock();
 
 	if (cr->result == SCAN_SUCCEED) {
-		if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
+		if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none &&
+		    cc->enforce_pte_scan_limits) {
 			cr->result = SCAN_EXCEED_NONE_PTE;
 			count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
 		} else {
@@ -2365,6 +2375,7 @@  static int khugepaged(void *none)
 {
 	struct mm_slot *mm_slot;
 	struct collapse_control cc = {
+		.enforce_pte_scan_limits = true,
 		.last_target_node = NUMA_NO_NODE,
 		.gfp = &alloc_hugepage_khugepaged_gfpmask,
 		.alloc_hpage = &khugepaged_alloc_page,
@@ -2512,6 +2523,7 @@  int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		     unsigned long start, unsigned long end)
 {
 	struct collapse_control cc = {
+		.enforce_pte_scan_limits = false,
 		.last_target_node = NUMA_NO_NODE,
 		.hpage = NULL,
 		.gfp = &alloc_hugepage_madvise_gfpmask,
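
As a hedged usage sketch (not part of this patch), the madvise collapse
context that runs with enforce_pte_scan_limits unset would be reached from
userspace roughly as follows.  MADV_COLLAPSE itself is introduced earlier
in this series; the fallback constant below is only a placeholder in case
the installed uapi headers do not define it.

#include <stddef.h>
#include <stdio.h>
#include <sys/mman.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25	/* placeholder; use the kernel's uapi value */
#endif

/* Ask the kernel to synchronously collapse [addr, addr + len). */
static int collapse_range(void *addr, size_t len)
{
	if (madvise(addr, len, MADV_COLLAPSE) != 0) {
		perror("madvise(MADV_COLLAPSE)");
		return -1;
	}
	return 0;
}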