
[v3,03/12] mm/khugepaged: make hugepage allocation context-specific

Message ID 20220426144412.742113-4-zokeefe@google.com (mailing list archive)
State New
Series mm: userspace hugepage collapse

Commit Message

Zach O'Keefe April 26, 2022, 2:44 p.m. UTC
Add hugepage allocation context to struct collapse_context, allowing
different collapse contexts to allocate hugepages differently.  For
example, khugepaged decides to allocate differently in NUMA and UMA
configurations, and other collapse contexts shouldn't be coupled to this
decision.  Likewise for the gfp flags used for said allocation.

Additionally, move [pre]allocated hugepage pointer into
struct collapse_context.

Signed-off-by: Zach O'Keefe <zokeefe@google.com>
Reported-by: kernel test robot <lkp@intel.com>
---
 mm/khugepaged.c | 102 ++++++++++++++++++++++++------------------------
 1 file changed, 52 insertions(+), 50 deletions(-)
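
To illustrate the intent of the new hooks, a later, non-khugepaged collapse
context could supply its own gfp mask and allocation routine through
struct collapse_control.  A minimal sketch, where the madvise-side names and
the gfp choice are hypothetical and not part of this series:

	/* Hypothetical gfp mask for a synchronous, madvise-driven collapse. */
	static gfp_t alloc_hugepage_madvise_gfpmask(void)
	{
		return GFP_TRANSHUGE;
	}

	/* Hypothetical allocator: always allocate a fresh hugepage. */
	static struct page *madvise_alloc_hpage(struct collapse_control *cc,
						gfp_t gfp, int node)
	{
		cc->hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
		if (unlikely(!cc->hpage)) {
			cc->hpage = ERR_PTR(-ENOMEM);
			return NULL;
		}
		prep_transhuge_page(cc->hpage);
		return cc->hpage;
	}

	struct collapse_control cc = {
		.last_target_node = NUMA_NO_NODE,
		.gfp = &alloc_hugepage_madvise_gfpmask,
		.alloc_hpage = &madvise_alloc_hpage,
	};

khugepaged itself keeps wiring up alloc_hugepage_khugepaged_gfpmask() and
khugepaged_alloc_page(), as shown at the end of the patch.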

Comments

Peter Xu April 27, 2022, 8:04 p.m. UTC | #1
On Tue, Apr 26, 2022 at 07:44:03AM -0700, Zach O'Keefe wrote:
> Add hugepage allocation context to struct collapse_context, allowing
> different collapse contexts to allocate hugepages differently.  For
> example, khugepaged decides to allocate differently in NUMA and UMA
> configurations, and other collapse contexts shouldn't be coupled to this
> decision.  Likewise for the gfp flags used for said allocation.
> 
> Additionally, move [pre]allocated hugepage pointer into
> struct collapse_context.
> 
> Signed-off-by: Zach O'Keefe <zokeefe@google.com>
> Reported-by: kernel test robot <lkp@intel.com>

(Please remember to remove this one too, and the rest of them.. :)

> ---
>  mm/khugepaged.c | 102 ++++++++++++++++++++++++------------------------
>  1 file changed, 52 insertions(+), 50 deletions(-)
> 
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 9d42fa330812..c4962191d6e1 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -92,6 +92,11 @@ struct collapse_control {
>  
>  	/* Last target selected in khugepaged_find_target_node() for this scan */
>  	int last_target_node;
> +
> +	struct page *hpage;
> +	gfp_t (*gfp)(void);
> +	struct page* (*alloc_hpage)(struct collapse_control *cc, gfp_t gfp,
> +				    int node);

Nit: s/page* /page */ looks a bit more consistent..

>  };
>  
>  /**
> @@ -877,21 +882,21 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
>  	return true;
>  }
>  
> -static struct page *
> -khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
> +static struct page *khugepaged_alloc_page(struct collapse_control *cc,
> +					  gfp_t gfp, int node)

Nit: I'd suggest we put collapse_control* either at the start or at the
end, and keep doing it when possible.

IIRC you used to always put it at the end, but now it's at the start.  Not
a big deal, but it'd be nice to keep the code self-consistent.

>  {
> -	VM_BUG_ON_PAGE(*hpage, *hpage);
> +	VM_BUG_ON_PAGE(cc->hpage, cc->hpage);
>  
> -	*hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
> -	if (unlikely(!*hpage)) {
> +	cc->hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
> +	if (unlikely(!cc->hpage)) {
>  		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
> -		*hpage = ERR_PTR(-ENOMEM);
> +		cc->hpage = ERR_PTR(-ENOMEM);
>  		return NULL;
>  	}
>  
> -	prep_transhuge_page(*hpage);
> +	prep_transhuge_page(cc->hpage);
>  	count_vm_event(THP_COLLAPSE_ALLOC);
> -	return *hpage;
> +	return cc->hpage;
>  }
>  #else
>  static int khugepaged_find_target_node(struct collapse_control *cc)
> @@ -953,12 +958,12 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
>  	return true;
>  }
>  
> -static struct page *
> -khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
> +static struct page *khugepaged_alloc_page(struct collapse_control *cc,
> +					  gfp_t gfp, int node)
>  {
> -	VM_BUG_ON(!*hpage);
> +	VM_BUG_ON(!cc->hpage);
>  
> -	return  *hpage;
> +	return cc->hpage;
>  }
>  #endif
>  
> @@ -1080,10 +1085,9 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
>  	return true;
>  }
>  
> -static void collapse_huge_page(struct mm_struct *mm,
> -				   unsigned long address,
> -				   struct page **hpage,
> -				   int node, int referenced, int unmapped)
> +static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
> +			       struct collapse_control *cc, int referenced,
> +			       int unmapped)
>  {
>  	LIST_HEAD(compound_pagelist);
>  	pmd_t *pmd, _pmd;
> @@ -1096,11 +1100,12 @@ static void collapse_huge_page(struct mm_struct *mm,
>  	struct mmu_notifier_range range;
>  	gfp_t gfp;
>  	const struct cpumask *cpumask;
> +	int node;
>  
>  	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
>  
>  	/* Only allocate from the target node */
> -	gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
> +	gfp = cc->gfp() | __GFP_THISNODE;
>  
>  	/*
>  	 * Before allocating the hugepage, release the mmap_lock read lock.
> @@ -1110,13 +1115,14 @@ static void collapse_huge_page(struct mm_struct *mm,
>  	 */
>  	mmap_read_unlock(mm);
>  
> +	node = khugepaged_find_target_node(cc);
>  	/* sched to specified node before huage page memory copy */
>  	if (task_node(current) != node) {
>  		cpumask = cpumask_of_node(node);
>  		if (!cpumask_empty(cpumask))
>  			set_cpus_allowed_ptr(current, cpumask);
>  	}
> -	new_page = khugepaged_alloc_page(hpage, gfp, node);
> +	new_page = cc->alloc_hpage(cc, gfp, node);

AFAICT you removed all references to khugepaged_alloc_page() in this patch,
so you'd better drop the function for both NUMA and UMA.

That said, I think it's actually better to keep them, as they do useful
things.  For example, AFAICT this ->alloc_hpage() change can already leak the
hpage allocated for UMA, which is why I think keeping them makes sense; then
IIUC the BUG_ON would trigger with UMA already.

I saw that you've moved khugepaged_find_target_node() into this function,
which definitely looks good, but if we keep khugepaged_alloc_page() then
IMHO we could even move khugepaged_find_target_node() into it and drop the
"node" parameter from khugepaged_alloc_page().

I'd even consider moving cc->gfp() entirely into it if possible, since the
gfp always seems to come with __GFP_THISNODE anyway.

What do you think?

>  	if (!new_page) {
>  		result = SCAN_ALLOC_HUGE_PAGE_FAIL;
>  		goto out_nolock;
> @@ -1238,15 +1244,15 @@ static void collapse_huge_page(struct mm_struct *mm,
>  	update_mmu_cache_pmd(vma, address, pmd);
>  	spin_unlock(pmd_ptl);
>  
> -	*hpage = NULL;
> +	cc->hpage = NULL;
>  
>  	khugepaged_pages_collapsed++;
>  	result = SCAN_SUCCEED;
>  out_up_write:
>  	mmap_write_unlock(mm);
>  out_nolock:
> -	if (!IS_ERR_OR_NULL(*hpage))
> -		mem_cgroup_uncharge(page_folio(*hpage));
> +	if (!IS_ERR_OR_NULL(cc->hpage))
> +		mem_cgroup_uncharge(page_folio(cc->hpage));
>  	trace_mm_collapse_huge_page(mm, isolated, result);
>  	return;
>  }
> @@ -1254,7 +1260,6 @@ static void collapse_huge_page(struct mm_struct *mm,
>  static int khugepaged_scan_pmd(struct mm_struct *mm,
>  			       struct vm_area_struct *vma,
>  			       unsigned long address,
> -			       struct page **hpage,
>  			       struct collapse_control *cc)
>  {
>  	pmd_t *pmd;
> @@ -1399,10 +1404,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
>  out_unmap:
>  	pte_unmap_unlock(pte, ptl);
>  	if (ret) {
> -		node = khugepaged_find_target_node(cc);
>  		/* collapse_huge_page will return with the mmap_lock released */
> -		collapse_huge_page(mm, address, hpage, node,
> -				referenced, unmapped);
> +		collapse_huge_page(mm, address, cc, referenced, unmapped);
>  	}
>  out:
>  	trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
> @@ -1667,8 +1670,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>   * @mm: process address space where collapse happens
>   * @file: file that collapse on
>   * @start: collapse start address
> - * @hpage: new allocated huge page for collapse
> - * @node: appointed node the new huge page allocate from
> + * @cc: collapse context and scratchpad
>   *
>   * Basic scheme is simple, details are more complex:
>   *  - allocate and lock a new huge page;
> @@ -1686,8 +1688,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>   *    + unlock and free huge page;
>   */
>  static void collapse_file(struct mm_struct *mm,
> -		struct file *file, pgoff_t start,
> -		struct page **hpage, int node)
> +			  struct file *file, pgoff_t start,
> +			  struct collapse_control *cc)
>  {
>  	struct address_space *mapping = file->f_mapping;
>  	gfp_t gfp;
> @@ -1697,15 +1699,16 @@ static void collapse_file(struct mm_struct *mm,
>  	XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
>  	int nr_none = 0, result = SCAN_SUCCEED;
>  	bool is_shmem = shmem_file(file);
> -	int nr;
> +	int nr, node;
>  
>  	VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
>  	VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
>  
>  	/* Only allocate from the target node */
> -	gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
> +	gfp = cc->gfp() | __GFP_THISNODE;
> +	node = khugepaged_find_target_node(cc);
>  
> -	new_page = khugepaged_alloc_page(hpage, gfp, node);
> +	new_page = cc->alloc_hpage(cc, gfp, node);
>  	if (!new_page) {
>  		result = SCAN_ALLOC_HUGE_PAGE_FAIL;
>  		goto out;
> @@ -1998,7 +2001,7 @@ static void collapse_file(struct mm_struct *mm,
>  		 * Remove pte page tables, so we can re-fault the page as huge.
>  		 */
>  		retract_page_tables(mapping, start);
> -		*hpage = NULL;
> +		cc->hpage = NULL;
>  
>  		khugepaged_pages_collapsed++;
>  	} else {
> @@ -2045,14 +2048,14 @@ static void collapse_file(struct mm_struct *mm,
>  	unlock_page(new_page);
>  out:
>  	VM_BUG_ON(!list_empty(&pagelist));
> -	if (!IS_ERR_OR_NULL(*hpage))
> -		mem_cgroup_uncharge(page_folio(*hpage));
> +	if (!IS_ERR_OR_NULL(cc->hpage))
> +		mem_cgroup_uncharge(page_folio(cc->hpage));
>  	/* TODO: tracepoints */
>  }
>  
>  static void khugepaged_scan_file(struct mm_struct *mm,
> -		struct file *file, pgoff_t start, struct page **hpage,
> -		struct collapse_control *cc)
> +				 struct file *file, pgoff_t start,
> +				 struct collapse_control *cc)
>  {
>  	struct page *page = NULL;
>  	struct address_space *mapping = file->f_mapping;
> @@ -2125,8 +2128,7 @@ static void khugepaged_scan_file(struct mm_struct *mm,
>  			result = SCAN_EXCEED_NONE_PTE;
>  			count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
>  		} else {
> -			node = khugepaged_find_target_node(cc);
> -			collapse_file(mm, file, start, hpage, node);
> +			collapse_file(mm, file, start, cc);
>  		}
>  	}
>  
> @@ -2134,8 +2136,8 @@ static void khugepaged_scan_file(struct mm_struct *mm,
>  }
>  #else
>  static void khugepaged_scan_file(struct mm_struct *mm,
> -		struct file *file, pgoff_t start, struct page **hpage,
> -		struct collapse_control *cc)
> +				 struct file *file, pgoff_t start,
> +				 struct collapse_control *cc)
>  {
>  	BUILD_BUG();
>  }
> @@ -2146,7 +2148,6 @@ static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
>  #endif
>  
>  static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
> -					    struct page **hpage,
>  					    struct collapse_control *cc)
>  	__releases(&khugepaged_mm_lock)
>  	__acquires(&khugepaged_mm_lock)
> @@ -2223,12 +2224,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
>  
>  				mmap_read_unlock(mm);
>  				ret = 1;
> -				khugepaged_scan_file(mm, file, pgoff, hpage, cc);
> +				khugepaged_scan_file(mm, file, pgoff, cc);
>  				fput(file);
>  			} else {
>  				ret = khugepaged_scan_pmd(mm, vma,
> -						khugepaged_scan.address,
> -						hpage, cc);
> +						khugepaged_scan.address, cc);
>  			}
>  			/* move to next address */
>  			khugepaged_scan.address += HPAGE_PMD_SIZE;
> @@ -2286,15 +2286,15 @@ static int khugepaged_wait_event(void)
>  
>  static void khugepaged_do_scan(struct collapse_control *cc)
>  {
> -	struct page *hpage = NULL;
>  	unsigned int progress = 0, pass_through_head = 0;
>  	unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
>  	bool wait = true;
>  
> +	cc->hpage = NULL;
>  	lru_add_drain_all();
>  
>  	while (progress < pages) {
> -		if (!khugepaged_prealloc_page(&hpage, &wait))
> +		if (!khugepaged_prealloc_page(&cc->hpage, &wait))
>  			break;
>  
>  		cond_resched();
> @@ -2308,14 +2308,14 @@ static void khugepaged_do_scan(struct collapse_control *cc)
>  		if (khugepaged_has_work() &&
>  		    pass_through_head < 2)
>  			progress += khugepaged_scan_mm_slot(pages - progress,
> -							    &hpage, cc);
> +							    cc);
>  		else
>  			progress = pages;
>  		spin_unlock(&khugepaged_mm_lock);
>  	}
>  
> -	if (!IS_ERR_OR_NULL(hpage))
> -		put_page(hpage);
> +	if (!IS_ERR_OR_NULL(cc->hpage))
> +		put_page(cc->hpage);
>  }
>  
>  static bool khugepaged_should_wakeup(void)
> @@ -2349,6 +2349,8 @@ static int khugepaged(void *none)
>  	struct mm_slot *mm_slot;
>  	struct collapse_control cc = {
>  		.last_target_node = NUMA_NO_NODE,
> +		.gfp = &alloc_hugepage_khugepaged_gfpmask,
> +		.alloc_hpage = &khugepaged_alloc_page,

I'm also wondering whether we could avoid the gfp() hook, because logically
that can be inlined into the ->alloc_hpage() hook, then we don't need gfp
parameter anymore for ->alloc_hpage(), which seems to be a win-win?
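
Taken together, those two suggestions would make the NUMA-side allocator look
roughly like the sketch below (illustration only, written against the code in
this patch; as the replies note, the sched-to-node change and the later
mem_cgroup_charge() use of gfp complicate both moves):

	static struct page *khugepaged_alloc_page(struct collapse_control *cc)
	{
		int node = khugepaged_find_target_node(cc);
		/* Only allocate from the target node */
		gfp_t gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;

		VM_BUG_ON_PAGE(cc->hpage, cc->hpage);

		cc->hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
		if (unlikely(!cc->hpage)) {
			count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
			cc->hpage = ERR_PTR(-ENOMEM);
			return NULL;
		}

		prep_transhuge_page(cc->hpage);
		count_vm_event(THP_COLLAPSE_ALLOC);
		return cc->hpage;
	}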
Zach O'Keefe April 28, 2022, 12:51 a.m. UTC | #2
On Wed, Apr 27, 2022 at 1:05 PM Peter Xu <peterx@redhat.com> wrote:
>
> On Tue, Apr 26, 2022 at 07:44:03AM -0700, Zach O'Keefe wrote:
> > Add hugepage allocation context to struct collapse_context, allowing
> > different collapse contexts to allocate hugepages differently.  For
> > example, khugepaged decides to allocate differently in NUMA and UMA
> > configurations, and other collapse contexts shouldn't be coupled to this
> > decision.  Likewise for the gfp flags used for said allocation.
> >
> > Additionally, move [pre]allocated hugepage pointer into
> > struct collapse_context.
> >
> > Signed-off-by: Zach O'Keefe <zokeefe@google.com>
> > Reported-by: kernel test robot <lkp@intel.com>
>
> (Please remember to remove this one too, and the rest ones.. :)
>

Done :) Thanks

> > ---
> >  mm/khugepaged.c | 102 ++++++++++++++++++++++++------------------------
> >  1 file changed, 52 insertions(+), 50 deletions(-)
> >
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 9d42fa330812..c4962191d6e1 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -92,6 +92,11 @@ struct collapse_control {
> >
> >       /* Last target selected in khugepaged_find_target_node() for this scan */
> >       int last_target_node;
> > +
> > +     struct page *hpage;
> > +     gfp_t (*gfp)(void);
> > +     struct page* (*alloc_hpage)(struct collapse_control *cc, gfp_t gfp,
> > +                                 int node);
>
> Nit: s/page* /page */ looks a bit more consistent..
>

Done - thanks for catching.

> >  };
> >
> >  /**
> > @@ -877,21 +882,21 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
> >       return true;
> >  }
> >
> > -static struct page *
> > -khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
> > +static struct page *khugepaged_alloc_page(struct collapse_control *cc,
> > +                                       gfp_t gfp, int node)
>
> Nit: I'd suggest we put collapse_control* either at the start or at the
> end, and keep doing it when possible.
>
> IIRC you used to put it always at the end, but now it's at the start.  Not
> a big deal but it'll be nice to keep the code self-aligned.
>

Makes sense to me to be consistent. I'll move it to the end here and
look over the other patches to make an effort to do the same. General
strategy was to keep "output" arguments at the very end.

> >  {
> > -     VM_BUG_ON_PAGE(*hpage, *hpage);
> > +     VM_BUG_ON_PAGE(cc->hpage, cc->hpage);
> >
> > -     *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
> > -     if (unlikely(!*hpage)) {
> > +     cc->hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
> > +     if (unlikely(!cc->hpage)) {
> >               count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
> > -             *hpage = ERR_PTR(-ENOMEM);
> > +             cc->hpage = ERR_PTR(-ENOMEM);
> >               return NULL;
> >       }
> >
> > -     prep_transhuge_page(*hpage);
> > +     prep_transhuge_page(cc->hpage);
> >       count_vm_event(THP_COLLAPSE_ALLOC);
> > -     return *hpage;
> > +     return cc->hpage;
> >  }
> >  #else
> >  static int khugepaged_find_target_node(struct collapse_control *cc)
> > @@ -953,12 +958,12 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
> >       return true;
> >  }
> >
> > -static struct page *
> > -khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
> > +static struct page *khugepaged_alloc_page(struct collapse_control *cc,
> > +                                       gfp_t gfp, int node)
> >  {
> > -     VM_BUG_ON(!*hpage);
> > +     VM_BUG_ON(!cc->hpage);
> >
> > -     return  *hpage;
> > +     return cc->hpage;
> >  }
> >  #endif
> >
> > @@ -1080,10 +1085,9 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
> >       return true;
> >  }
> >
> > -static void collapse_huge_page(struct mm_struct *mm,
> > -                                unsigned long address,
> > -                                struct page **hpage,
> > -                                int node, int referenced, int unmapped)
> > +static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
> > +                            struct collapse_control *cc, int referenced,
> > +                            int unmapped)
> >  {
> >       LIST_HEAD(compound_pagelist);
> >       pmd_t *pmd, _pmd;
> > @@ -1096,11 +1100,12 @@ static void collapse_huge_page(struct mm_struct *mm,
> >       struct mmu_notifier_range range;
> >       gfp_t gfp;
> >       const struct cpumask *cpumask;
> > +     int node;
> >
> >       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> >
> >       /* Only allocate from the target node */
> > -     gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
> > +     gfp = cc->gfp() | __GFP_THISNODE;
> >
> >       /*
> >        * Before allocating the hugepage, release the mmap_lock read lock.
> > @@ -1110,13 +1115,14 @@ static void collapse_huge_page(struct mm_struct *mm,
> >        */
> >       mmap_read_unlock(mm);
> >
> > +     node = khugepaged_find_target_node(cc);
> >       /* sched to specified node before huage page memory copy */
> >       if (task_node(current) != node) {
> >               cpumask = cpumask_of_node(node);
> >               if (!cpumask_empty(cpumask))
> >                       set_cpus_allowed_ptr(current, cpumask);
> >       }
> > -     new_page = khugepaged_alloc_page(hpage, gfp, node);
> > +     new_page = cc->alloc_hpage(cc, gfp, node);
>
> AFAICT you removed all references of khugepaged_alloc_page() in this patch,
> then you'd better drop the function for both NUMA and UMA.
>

Sorry, I'm not sure I follow. In the khugepaged context, the logic WRT
khugepaged_alloc_page() is unchanged - it's just called indirectly
through ->alloc_hpage().

> Said that, I think it's actually better to keep them, as they do things
> useful.  For example,AFAICT this ->alloc_hpage() change can leak the hpage
> alloacted for UMA already so that's why I think keeping them makes sense,
> then iiuc the BUG_ON would trigger with UMA already.
>
> I saw that you've moved khugepaged_find_target_node() into this function
> which looks definitely good, but if we could keep khugepaged_alloc_page()
> and then IMHO we could even move khugepaged_find_target_node() into that
> and drop the "node" parameter in khugepaged_alloc_page().
>

I had actually done this, until commit 4272063db38c ("mm/khugepaged:
sched to numa node when collapse huge page"), which forced me to keep
"node" visible in this function.

> I'd even consider moving cc->gfp() all into it if possible, since the gfp
> seems to be always with __GFP_THISNODE anyways.
>

I would have liked to do this, but the gfp flags are referenced again
when calling mem_cgroup_charge(), so I couldn't quite get rid of them
from here.

> What do you think?
>
> >       if (!new_page) {
> >               result = SCAN_ALLOC_HUGE_PAGE_FAIL;
> >               goto out_nolock;
> > @@ -1238,15 +1244,15 @@ static void collapse_huge_page(struct mm_struct *mm,
> >       update_mmu_cache_pmd(vma, address, pmd);
> >       spin_unlock(pmd_ptl);
> >
> > -     *hpage = NULL;
> > +     cc->hpage = NULL;
> >
> >       khugepaged_pages_collapsed++;
> >       result = SCAN_SUCCEED;
> >  out_up_write:
> >       mmap_write_unlock(mm);
> >  out_nolock:
> > -     if (!IS_ERR_OR_NULL(*hpage))
> > -             mem_cgroup_uncharge(page_folio(*hpage));
> > +     if (!IS_ERR_OR_NULL(cc->hpage))
> > +             mem_cgroup_uncharge(page_folio(cc->hpage));
> >       trace_mm_collapse_huge_page(mm, isolated, result);
> >       return;
> >  }
> > @@ -1254,7 +1260,6 @@ static void collapse_huge_page(struct mm_struct *mm,
> >  static int khugepaged_scan_pmd(struct mm_struct *mm,
> >                              struct vm_area_struct *vma,
> >                              unsigned long address,
> > -                            struct page **hpage,
> >                              struct collapse_control *cc)
> >  {
> >       pmd_t *pmd;
> > @@ -1399,10 +1404,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
> >  out_unmap:
> >       pte_unmap_unlock(pte, ptl);
> >       if (ret) {
> > -             node = khugepaged_find_target_node(cc);
> >               /* collapse_huge_page will return with the mmap_lock released */
> > -             collapse_huge_page(mm, address, hpage, node,
> > -                             referenced, unmapped);
> > +             collapse_huge_page(mm, address, cc, referenced, unmapped);
> >       }
> >  out:
> >       trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
> > @@ -1667,8 +1670,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
> >   * @mm: process address space where collapse happens
> >   * @file: file that collapse on
> >   * @start: collapse start address
> > - * @hpage: new allocated huge page for collapse
> > - * @node: appointed node the new huge page allocate from
> > + * @cc: collapse context and scratchpad
> >   *
> >   * Basic scheme is simple, details are more complex:
> >   *  - allocate and lock a new huge page;
> > @@ -1686,8 +1688,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
> >   *    + unlock and free huge page;
> >   */
> >  static void collapse_file(struct mm_struct *mm,
> > -             struct file *file, pgoff_t start,
> > -             struct page **hpage, int node)
> > +                       struct file *file, pgoff_t start,
> > +                       struct collapse_control *cc)
> >  {
> >       struct address_space *mapping = file->f_mapping;
> >       gfp_t gfp;
> > @@ -1697,15 +1699,16 @@ static void collapse_file(struct mm_struct *mm,
> >       XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
> >       int nr_none = 0, result = SCAN_SUCCEED;
> >       bool is_shmem = shmem_file(file);
> > -     int nr;
> > +     int nr, node;
> >
> >       VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
> >       VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
> >
> >       /* Only allocate from the target node */
> > -     gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
> > +     gfp = cc->gfp() | __GFP_THISNODE;
> > +     node = khugepaged_find_target_node(cc);
> >
> > -     new_page = khugepaged_alloc_page(hpage, gfp, node);
> > +     new_page = cc->alloc_hpage(cc, gfp, node);
> >       if (!new_page) {
> >               result = SCAN_ALLOC_HUGE_PAGE_FAIL;
> >               goto out;
> > @@ -1998,7 +2001,7 @@ static void collapse_file(struct mm_struct *mm,
> >                * Remove pte page tables, so we can re-fault the page as huge.
> >                */
> >               retract_page_tables(mapping, start);
> > -             *hpage = NULL;
> > +             cc->hpage = NULL;
> >
> >               khugepaged_pages_collapsed++;
> >       } else {
> > @@ -2045,14 +2048,14 @@ static void collapse_file(struct mm_struct *mm,
> >       unlock_page(new_page);
> >  out:
> >       VM_BUG_ON(!list_empty(&pagelist));
> > -     if (!IS_ERR_OR_NULL(*hpage))
> > -             mem_cgroup_uncharge(page_folio(*hpage));
> > +     if (!IS_ERR_OR_NULL(cc->hpage))
> > +             mem_cgroup_uncharge(page_folio(cc->hpage));
> >       /* TODO: tracepoints */
> >  }
> >
> >  static void khugepaged_scan_file(struct mm_struct *mm,
> > -             struct file *file, pgoff_t start, struct page **hpage,
> > -             struct collapse_control *cc)
> > +                              struct file *file, pgoff_t start,
> > +                              struct collapse_control *cc)
> >  {
> >       struct page *page = NULL;
> >       struct address_space *mapping = file->f_mapping;
> > @@ -2125,8 +2128,7 @@ static void khugepaged_scan_file(struct mm_struct *mm,
> >                       result = SCAN_EXCEED_NONE_PTE;
> >                       count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
> >               } else {
> > -                     node = khugepaged_find_target_node(cc);
> > -                     collapse_file(mm, file, start, hpage, node);
> > +                     collapse_file(mm, file, start, cc);
> >               }
> >       }
> >
> > @@ -2134,8 +2136,8 @@ static void khugepaged_scan_file(struct mm_struct *mm,
> >  }
> >  #else
> >  static void khugepaged_scan_file(struct mm_struct *mm,
> > -             struct file *file, pgoff_t start, struct page **hpage,
> > -             struct collapse_control *cc)
> > +                              struct file *file, pgoff_t start,
> > +                              struct collapse_control *cc)
> >  {
> >       BUILD_BUG();
> >  }
> > @@ -2146,7 +2148,6 @@ static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
> >  #endif
> >
> >  static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
> > -                                         struct page **hpage,
> >                                           struct collapse_control *cc)
> >       __releases(&khugepaged_mm_lock)
> >       __acquires(&khugepaged_mm_lock)
> > @@ -2223,12 +2224,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
> >
> >                               mmap_read_unlock(mm);
> >                               ret = 1;
> > -                             khugepaged_scan_file(mm, file, pgoff, hpage, cc);
> > +                             khugepaged_scan_file(mm, file, pgoff, cc);
> >                               fput(file);
> >                       } else {
> >                               ret = khugepaged_scan_pmd(mm, vma,
> > -                                             khugepaged_scan.address,
> > -                                             hpage, cc);
> > +                                             khugepaged_scan.address, cc);
> >                       }
> >                       /* move to next address */
> >                       khugepaged_scan.address += HPAGE_PMD_SIZE;
> > @@ -2286,15 +2286,15 @@ static int khugepaged_wait_event(void)
> >
> >  static void khugepaged_do_scan(struct collapse_control *cc)
> >  {
> > -     struct page *hpage = NULL;
> >       unsigned int progress = 0, pass_through_head = 0;
> >       unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
> >       bool wait = true;
> >
> > +     cc->hpage = NULL;
> >       lru_add_drain_all();
> >
> >       while (progress < pages) {
> > -             if (!khugepaged_prealloc_page(&hpage, &wait))
> > +             if (!khugepaged_prealloc_page(&cc->hpage, &wait))
> >                       break;
> >
> >               cond_resched();
> > @@ -2308,14 +2308,14 @@ static void khugepaged_do_scan(struct collapse_control *cc)
> >               if (khugepaged_has_work() &&
> >                   pass_through_head < 2)
> >                       progress += khugepaged_scan_mm_slot(pages - progress,
> > -                                                         &hpage, cc);
> > +                                                         cc);
> >               else
> >                       progress = pages;
> >               spin_unlock(&khugepaged_mm_lock);
> >       }
> >
> > -     if (!IS_ERR_OR_NULL(hpage))
> > -             put_page(hpage);
> > +     if (!IS_ERR_OR_NULL(cc->hpage))
> > +             put_page(cc->hpage);
> >  }
> >
> >  static bool khugepaged_should_wakeup(void)
> > @@ -2349,6 +2349,8 @@ static int khugepaged(void *none)
> >       struct mm_slot *mm_slot;
> >       struct collapse_control cc = {
> >               .last_target_node = NUMA_NO_NODE,
> > +             .gfp = &alloc_hugepage_khugepaged_gfpmask,
> > +             .alloc_hpage = &khugepaged_alloc_page,
>
> I'm also wondering whether we could avoid the gfp() hook, because logically
> that can be inlined into the ->alloc_hpage() hook, then we don't need gfp
> parameter anymore for ->alloc_hpage(), which seems to be a win-win?
>

As mentioned above, I would have liked to avoid this, but the gfp flags
are used in a couple of places (mem_cgroup_charge() and ->alloc_hpage()).

I could (?) make .gfp of type gfp_t and just update it on every
khugepaged scan (in case it changed) and also remove the gfp parameter
for ->alloc_hpage().

Thank you, Peter!

Zach

> --
> Peter Xu
>
Peter Xu April 28, 2022, 2:51 p.m. UTC | #3
On Wed, Apr 27, 2022 at 05:51:53PM -0700, Zach O'Keefe wrote:
> > > @@ -1110,13 +1115,14 @@ static void collapse_huge_page(struct mm_struct *mm,
> > >        */
> > >       mmap_read_unlock(mm);
> > >
> > > +     node = khugepaged_find_target_node(cc);
> > >       /* sched to specified node before huage page memory copy */
> > >       if (task_node(current) != node) {
> > >               cpumask = cpumask_of_node(node);
> > >               if (!cpumask_empty(cpumask))
> > >                       set_cpus_allowed_ptr(current, cpumask);
> > >       }
> > > -     new_page = khugepaged_alloc_page(hpage, gfp, node);
> > > +     new_page = cc->alloc_hpage(cc, gfp, node);
> >
> > AFAICT you removed all references of khugepaged_alloc_page() in this patch,
> > then you'd better drop the function for both NUMA and UMA.
> >
> 
> Sorry, I'm not sure I follow. In khugepaged context, logic WRT
> khugepaged_alloc_page() is unchanged - it's just called indirectly
> through ->alloc_hpage().

Ah you're right, sorry for the confusion.

> 
> > Said that, I think it's actually better to keep them, as they do things
> > useful.  For example,AFAICT this ->alloc_hpage() change can leak the hpage
> > alloacted for UMA already so that's why I think keeping them makes sense,
> > then iiuc the BUG_ON would trigger with UMA already.
> >
> > I saw that you've moved khugepaged_find_target_node() into this function
> > which looks definitely good, but if we could keep khugepaged_alloc_page()
> > and then IMHO we could even move khugepaged_find_target_node() into that
> > and drop the "node" parameter in khugepaged_alloc_page().
> >
> 
> I actually had done this, until commit 4272063db38c ("mm/khugepaged:
> sched to numa node when collapse huge page") which forced me to keep
> "node" visible in this function.

Right, I was looking at my local tree, and that patch seems to have been
added to -next only very recently.

I'm curious why it wasn't applied to file THPs too, if it is worthwhile,
since in that case it's also a suitable candidate to be moved into the same
hook.  I've asked in that thread instead.

Before that, feel free to keep your code as is.

> 
> > I'd even consider moving cc->gfp() all into it if possible, since the gfp
> > seems to be always with __GFP_THISNODE anyways.
> >
> 
> I would have liked to do this, but the gfp flags are referenced again
> when calling mem_cgroup_charge(), so I couldn't quite get rid of them
> from here.

Yeah, maybe we could move mem_cgroup_charge() into the hook too?  The code
below is duplicated between file & anon, and IMHO it's already a good
candidate for a new helper anyway:

	/* Only allocate from the target node */
	gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;

	new_page = khugepaged_alloc_page(hpage, gfp, node);
	if (!new_page) {
		result = SCAN_ALLOC_HUGE_PAGE_FAIL;
		goto out;
	}

	if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) {
		result = SCAN_CGROUP_CHARGE_FAIL;
		goto out;
	}
	count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);

If we want to generalize it, maybe we should return the "result" instead of
the new page, so the new page can be passed in as a pointer.

There's one mmap_read_unlock(mm) in the anonymous code, but I think that
can simply be moved above this chunk.

No strong opinion here, please feel free to think about what's the best
approach for landing this series.
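
One possible shape for such a helper, following the suggestion above (the
helper name and exact signature are assumptions, not something settled in
this thread):

	static int alloc_charge_hpage(struct mm_struct *mm, int node,
				      struct collapse_control *cc,
				      struct page **hpage)
	{
		/* Only allocate from the target node */
		gfp_t gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;

		*hpage = khugepaged_alloc_page(cc, gfp, node);
		if (!*hpage)
			return SCAN_ALLOC_HUGE_PAGE_FAIL;

		if (unlikely(mem_cgroup_charge(page_folio(*hpage), mm, gfp)))
			return SCAN_CGROUP_CHARGE_FAIL;

		count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC);
		return SCAN_SUCCEED;
	}

Both collapse_huge_page() and collapse_file() could then check the returned
result instead of open-coding the allocate-then-charge sequence.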

[...]

> 
> I could (?) make .gfp of type gfp_t and just update it on every
> khugepaged scan (in case it changed) and also remove the gfp parameter
> for ->alloc_hpage().

If that's the case I'd prefer you keep your code as-is; gfp() is perfectly
fine and it's lightweight, and I'm afraid that caching scheme could make
things complicated.

Thanks,
Zach O'Keefe April 28, 2022, 3:37 p.m. UTC | #4
On Thu, Apr 28, 2022 at 7:51 AM Peter Xu <peterx@redhat.com> wrote:
>
> On Wed, Apr 27, 2022 at 05:51:53PM -0700, Zach O'Keefe wrote:
> > > > @@ -1110,13 +1115,14 @@ static void collapse_huge_page(struct mm_struct *mm,
> > > >        */
> > > >       mmap_read_unlock(mm);
> > > >
> > > > +     node = khugepaged_find_target_node(cc);
> > > >       /* sched to specified node before huage page memory copy */
> > > >       if (task_node(current) != node) {
> > > >               cpumask = cpumask_of_node(node);
> > > >               if (!cpumask_empty(cpumask))
> > > >                       set_cpus_allowed_ptr(current, cpumask);
> > > >       }
> > > > -     new_page = khugepaged_alloc_page(hpage, gfp, node);
> > > > +     new_page = cc->alloc_hpage(cc, gfp, node);
> > >
> > > AFAICT you removed all references of khugepaged_alloc_page() in this patch,
> > > then you'd better drop the function for both NUMA and UMA.
> > >
> >
> > Sorry, I'm not sure I follow. In khugepaged context, logic WRT
> > khugepaged_alloc_page() is unchanged - it's just called indirectly
> > through ->alloc_hpage().
>
> Ah you're right, sorry for the confusion.
>
> >
> > > Said that, I think it's actually better to keep them, as they do things
> > > useful.  For example,AFAICT this ->alloc_hpage() change can leak the hpage
> > > alloacted for UMA already so that's why I think keeping them makes sense,
> > > then iiuc the BUG_ON would trigger with UMA already.
> > >
> > > I saw that you've moved khugepaged_find_target_node() into this function
> > > which looks definitely good, but if we could keep khugepaged_alloc_page()
> > > and then IMHO we could even move khugepaged_find_target_node() into that
> > > and drop the "node" parameter in khugepaged_alloc_page().
> > >
> >
> > I actually had done this, until commit 4272063db38c ("mm/khugepaged:
> > sched to numa node when collapse huge page") which forced me to keep
> > "node" visible in this function.
>
> Right, I was looking at my local tree and that patch seems to be very
> lately added into -next.
>
> I'm curious why it wasn't applying to file thps too if it is worthwhile,
> since if so that's also a suitable candidate to be moved into the same
> hook.  I've asked in that thread instead.
>
> Before that, feel free to keep your code as is.
>

Thanks for checking on that. On second thought, it seems like we would
actually want to move this sched logic into the khugepaged hook impl,
since we probably don't want to be moving the calling process around
when in madvise context.

> >
> > > I'd even consider moving cc->gfp() all into it if possible, since the gfp
> > > seems to be always with __GFP_THISNODE anyways.
> > >
> >
> > I would have liked to do this, but the gfp flags are referenced again
> > when calling mem_cgroup_charge(), so I couldn't quite get rid of them
> > from here.
>
> Yeah, maybe we could move mem_cgroup_charge() into the hook too?  As below
> codes are duplicated between file & anon and IMHO they're good candidate to
> a new helper already anyway:
>
>         /* Only allocate from the target node */
>         gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
>
>         new_page = khugepaged_alloc_page(hpage, gfp, node);
>         if (!new_page) {
>                 result = SCAN_ALLOC_HUGE_PAGE_FAIL;
>                 goto out;
>         }
>
>         if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) {
>                 result = SCAN_CGROUP_CHARGE_FAIL;
>                 goto out;
>         }
>         count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
>
> If we want to generalize it maybe we want to return the "result" instead of
> the new page though, so the newpage can be passed in as a pointer.
>
> There's one mmap_read_unlock(mm) for the anonymous code but I think that
> can simply be moved above the chunk.
>
> No strong opinion here, please feel free to think about what's the best
> approach for landing this series.
>

Thanks for the suggestion. I'll play around with it a little and see
what looks good.

> [...]
>
> >
> > I could (?) make .gfp of type gfp_t and just update it on every
> > khugepaged scan (in case it changed) and also remove the gfp parameter
> > for ->alloc_hpage().
>
> If that's the case I'd prefer you keep you code as-is; gfp() is perfectly
> fine and gfp() is light, I'm afraid that caching thing could make things
> complicated.
>

Ack.

Thanks again for your time and thoughts, Peter!

Best,
Zach

> Thanks,
>
> --
> Peter Xu
>
Peter Xu April 28, 2022, 3:52 p.m. UTC | #5
On Thu, Apr 28, 2022 at 08:37:06AM -0700, Zach O'Keefe wrote:
> On Thu, Apr 28, 2022 at 7:51 AM Peter Xu <peterx@redhat.com> wrote:
> >
> > On Wed, Apr 27, 2022 at 05:51:53PM -0700, Zach O'Keefe wrote:
> > > > > @@ -1110,13 +1115,14 @@ static void collapse_huge_page(struct mm_struct *mm,
> > > > >        */
> > > > >       mmap_read_unlock(mm);
> > > > >
> > > > > +     node = khugepaged_find_target_node(cc);
> > > > >       /* sched to specified node before huage page memory copy */
> > > > >       if (task_node(current) != node) {
> > > > >               cpumask = cpumask_of_node(node);
> > > > >               if (!cpumask_empty(cpumask))
> > > > >                       set_cpus_allowed_ptr(current, cpumask);
> > > > >       }
> > > > > -     new_page = khugepaged_alloc_page(hpage, gfp, node);
> > > > > +     new_page = cc->alloc_hpage(cc, gfp, node);
> > > >
> > > > AFAICT you removed all references of khugepaged_alloc_page() in this patch,
> > > > then you'd better drop the function for both NUMA and UMA.
> > > >
> > >
> > > Sorry, I'm not sure I follow. In khugepaged context, logic WRT
> > > khugepaged_alloc_page() is unchanged - it's just called indirectly
> > > through ->alloc_hpage().
> >
> > Ah you're right, sorry for the confusion.
> >
> > >
> > > > Said that, I think it's actually better to keep them, as they do things
> > > > useful.  For example,AFAICT this ->alloc_hpage() change can leak the hpage
> > > > alloacted for UMA already so that's why I think keeping them makes sense,
> > > > then iiuc the BUG_ON would trigger with UMA already.
> > > >
> > > > I saw that you've moved khugepaged_find_target_node() into this function
> > > > which looks definitely good, but if we could keep khugepaged_alloc_page()
> > > > and then IMHO we could even move khugepaged_find_target_node() into that
> > > > and drop the "node" parameter in khugepaged_alloc_page().
> > > >
> > >
> > > I actually had done this, until commit 4272063db38c ("mm/khugepaged:
> > > sched to numa node when collapse huge page") which forced me to keep
> > > "node" visible in this function.
> >
> > Right, I was looking at my local tree and that patch seems to be very
> > lately added into -next.
> >
> > I'm curious why it wasn't applying to file thps too if it is worthwhile,
> > since if so that's also a suitable candidate to be moved into the same
> > hook.  I've asked in that thread instead.
> >
> > Before that, feel free to keep your code as is.
> >
> 
> Thanks for checking on that. On second thought, it seems like we would
> actually want to move this sched logic into the khupaged hook impl
> since we probably don't want to be moving around the calling process
> if in madvise context.

Sounds correct; if it's moved over it must only be in the new helper (if
you're going to introduce it, like below :), and only for khugepaged.

Actually I'm really starting to question that idea more..  e.g. even for
khugepaged the target node can change rapidly depending on the owner of
the pages planned for collapse.

Whether it makes sense to bounce the khugepaged thread itself still seems
questionable to me, and it's definitely not a good idea for the madvise()
context.  The only proof in that patch was a whole-testbench result, but I'm
not sure how reliable that will be when applied to real-world scenarios.

> 
> > >
> > > > I'd even consider moving cc->gfp() all into it if possible, since the gfp
> > > > seems to be always with __GFP_THISNODE anyways.
> > > >
> > >
> > > I would have liked to do this, but the gfp flags are referenced again
> > > when calling mem_cgroup_charge(), so I couldn't quite get rid of them
> > > from here.
> >
> > Yeah, maybe we could move mem_cgroup_charge() into the hook too?  As below
> > codes are duplicated between file & anon and IMHO they're good candidate to
> > a new helper already anyway:
> >
> >         /* Only allocate from the target node */
> >         gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
> >
> >         new_page = khugepaged_alloc_page(hpage, gfp, node);
> >         if (!new_page) {
> >                 result = SCAN_ALLOC_HUGE_PAGE_FAIL;
> >                 goto out;
> >         }
> >
> >         if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) {
> >                 result = SCAN_CGROUP_CHARGE_FAIL;
> >                 goto out;
> >         }
> >         count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
> >
> > If we want to generalize it maybe we want to return the "result" instead of
> > the new page though, so the newpage can be passed in as a pointer.
> >
> > There's one mmap_read_unlock(mm) for the anonymous code but I think that
> > can simply be moved above the chunk.
> >
> > No strong opinion here, please feel free to think about what's the best
> > approach for landing this series.
> >
> 
> Thanks for the suggestion. I'll play around with it a little and see
> what looks good.

Sounds good!

If the new helper is welcomed then it can be a standalone prerequisite
patch to clean things up first.  Let's also see how it goes with the other
patch, because there's a chance you can also clean up
khugepaged_find_target_node() in the same patch along with all of the above.

Patch

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 9d42fa330812..c4962191d6e1 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -92,6 +92,11 @@  struct collapse_control {
 
 	/* Last target selected in khugepaged_find_target_node() for this scan */
 	int last_target_node;
+
+	struct page *hpage;
+	gfp_t (*gfp)(void);
+	struct page* (*alloc_hpage)(struct collapse_control *cc, gfp_t gfp,
+				    int node);
 };
 
 /**
@@ -877,21 +882,21 @@  static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
 	return true;
 }
 
-static struct page *
-khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
+static struct page *khugepaged_alloc_page(struct collapse_control *cc,
+					  gfp_t gfp, int node)
 {
-	VM_BUG_ON_PAGE(*hpage, *hpage);
+	VM_BUG_ON_PAGE(cc->hpage, cc->hpage);
 
-	*hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
-	if (unlikely(!*hpage)) {
+	cc->hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
+	if (unlikely(!cc->hpage)) {
 		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
-		*hpage = ERR_PTR(-ENOMEM);
+		cc->hpage = ERR_PTR(-ENOMEM);
 		return NULL;
 	}
 
-	prep_transhuge_page(*hpage);
+	prep_transhuge_page(cc->hpage);
 	count_vm_event(THP_COLLAPSE_ALLOC);
-	return *hpage;
+	return cc->hpage;
 }
 #else
 static int khugepaged_find_target_node(struct collapse_control *cc)
@@ -953,12 +958,12 @@  static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
 	return true;
 }
 
-static struct page *
-khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
+static struct page *khugepaged_alloc_page(struct collapse_control *cc,
+					  gfp_t gfp, int node)
 {
-	VM_BUG_ON(!*hpage);
+	VM_BUG_ON(!cc->hpage);
 
-	return  *hpage;
+	return cc->hpage;
 }
 #endif
 
@@ -1080,10 +1085,9 @@  static bool __collapse_huge_page_swapin(struct mm_struct *mm,
 	return true;
 }
 
-static void collapse_huge_page(struct mm_struct *mm,
-				   unsigned long address,
-				   struct page **hpage,
-				   int node, int referenced, int unmapped)
+static void collapse_huge_page(struct mm_struct *mm, unsigned long address,
+			       struct collapse_control *cc, int referenced,
+			       int unmapped)
 {
 	LIST_HEAD(compound_pagelist);
 	pmd_t *pmd, _pmd;
@@ -1096,11 +1100,12 @@  static void collapse_huge_page(struct mm_struct *mm,
 	struct mmu_notifier_range range;
 	gfp_t gfp;
 	const struct cpumask *cpumask;
+	int node;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
 	/* Only allocate from the target node */
-	gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
+	gfp = cc->gfp() | __GFP_THISNODE;
 
 	/*
 	 * Before allocating the hugepage, release the mmap_lock read lock.
@@ -1110,13 +1115,14 @@  static void collapse_huge_page(struct mm_struct *mm,
 	 */
 	mmap_read_unlock(mm);
 
+	node = khugepaged_find_target_node(cc);
 	/* sched to specified node before huage page memory copy */
 	if (task_node(current) != node) {
 		cpumask = cpumask_of_node(node);
 		if (!cpumask_empty(cpumask))
 			set_cpus_allowed_ptr(current, cpumask);
 	}
-	new_page = khugepaged_alloc_page(hpage, gfp, node);
+	new_page = cc->alloc_hpage(cc, gfp, node);
 	if (!new_page) {
 		result = SCAN_ALLOC_HUGE_PAGE_FAIL;
 		goto out_nolock;
@@ -1238,15 +1244,15 @@  static void collapse_huge_page(struct mm_struct *mm,
 	update_mmu_cache_pmd(vma, address, pmd);
 	spin_unlock(pmd_ptl);
 
-	*hpage = NULL;
+	cc->hpage = NULL;
 
 	khugepaged_pages_collapsed++;
 	result = SCAN_SUCCEED;
 out_up_write:
 	mmap_write_unlock(mm);
 out_nolock:
-	if (!IS_ERR_OR_NULL(*hpage))
-		mem_cgroup_uncharge(page_folio(*hpage));
+	if (!IS_ERR_OR_NULL(cc->hpage))
+		mem_cgroup_uncharge(page_folio(cc->hpage));
 	trace_mm_collapse_huge_page(mm, isolated, result);
 	return;
 }
@@ -1254,7 +1260,6 @@  static void collapse_huge_page(struct mm_struct *mm,
 static int khugepaged_scan_pmd(struct mm_struct *mm,
 			       struct vm_area_struct *vma,
 			       unsigned long address,
-			       struct page **hpage,
 			       struct collapse_control *cc)
 {
 	pmd_t *pmd;
@@ -1399,10 +1404,8 @@  static int khugepaged_scan_pmd(struct mm_struct *mm,
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
 	if (ret) {
-		node = khugepaged_find_target_node(cc);
 		/* collapse_huge_page will return with the mmap_lock released */
-		collapse_huge_page(mm, address, hpage, node,
-				referenced, unmapped);
+		collapse_huge_page(mm, address, cc, referenced, unmapped);
 	}
 out:
 	trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
@@ -1667,8 +1670,7 @@  static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
  * @mm: process address space where collapse happens
  * @file: file that collapse on
  * @start: collapse start address
- * @hpage: new allocated huge page for collapse
- * @node: appointed node the new huge page allocate from
+ * @cc: collapse context and scratchpad
  *
  * Basic scheme is simple, details are more complex:
  *  - allocate and lock a new huge page;
@@ -1686,8 +1688,8 @@  static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
  *    + unlock and free huge page;
  */
 static void collapse_file(struct mm_struct *mm,
-		struct file *file, pgoff_t start,
-		struct page **hpage, int node)
+			  struct file *file, pgoff_t start,
+			  struct collapse_control *cc)
 {
 	struct address_space *mapping = file->f_mapping;
 	gfp_t gfp;
@@ -1697,15 +1699,16 @@  static void collapse_file(struct mm_struct *mm,
 	XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
 	int nr_none = 0, result = SCAN_SUCCEED;
 	bool is_shmem = shmem_file(file);
-	int nr;
+	int nr, node;
 
 	VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
 	VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
 
 	/* Only allocate from the target node */
-	gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
+	gfp = cc->gfp() | __GFP_THISNODE;
+	node = khugepaged_find_target_node(cc);
 
-	new_page = khugepaged_alloc_page(hpage, gfp, node);
+	new_page = cc->alloc_hpage(cc, gfp, node);
 	if (!new_page) {
 		result = SCAN_ALLOC_HUGE_PAGE_FAIL;
 		goto out;
@@ -1998,7 +2001,7 @@  static void collapse_file(struct mm_struct *mm,
 		 * Remove pte page tables, so we can re-fault the page as huge.
 		 */
 		retract_page_tables(mapping, start);
-		*hpage = NULL;
+		cc->hpage = NULL;
 
 		khugepaged_pages_collapsed++;
 	} else {
@@ -2045,14 +2048,14 @@  static void collapse_file(struct mm_struct *mm,
 	unlock_page(new_page);
 out:
 	VM_BUG_ON(!list_empty(&pagelist));
-	if (!IS_ERR_OR_NULL(*hpage))
-		mem_cgroup_uncharge(page_folio(*hpage));
+	if (!IS_ERR_OR_NULL(cc->hpage))
+		mem_cgroup_uncharge(page_folio(cc->hpage));
 	/* TODO: tracepoints */
 }
 
 static void khugepaged_scan_file(struct mm_struct *mm,
-		struct file *file, pgoff_t start, struct page **hpage,
-		struct collapse_control *cc)
+				 struct file *file, pgoff_t start,
+				 struct collapse_control *cc)
 {
 	struct page *page = NULL;
 	struct address_space *mapping = file->f_mapping;
@@ -2125,8 +2128,7 @@  static void khugepaged_scan_file(struct mm_struct *mm,
 			result = SCAN_EXCEED_NONE_PTE;
 			count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
 		} else {
-			node = khugepaged_find_target_node(cc);
-			collapse_file(mm, file, start, hpage, node);
+			collapse_file(mm, file, start, cc);
 		}
 	}
 
@@ -2134,8 +2136,8 @@  static void khugepaged_scan_file(struct mm_struct *mm,
 }
 #else
 static void khugepaged_scan_file(struct mm_struct *mm,
-		struct file *file, pgoff_t start, struct page **hpage,
-		struct collapse_control *cc)
+				 struct file *file, pgoff_t start,
+				 struct collapse_control *cc)
 {
 	BUILD_BUG();
 }
@@ -2146,7 +2148,6 @@  static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
 #endif
 
 static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
-					    struct page **hpage,
 					    struct collapse_control *cc)
 	__releases(&khugepaged_mm_lock)
 	__acquires(&khugepaged_mm_lock)
@@ -2223,12 +2224,11 @@  static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
 
 				mmap_read_unlock(mm);
 				ret = 1;
-				khugepaged_scan_file(mm, file, pgoff, hpage, cc);
+				khugepaged_scan_file(mm, file, pgoff, cc);
 				fput(file);
 			} else {
 				ret = khugepaged_scan_pmd(mm, vma,
-						khugepaged_scan.address,
-						hpage, cc);
+						khugepaged_scan.address, cc);
 			}
 			/* move to next address */
 			khugepaged_scan.address += HPAGE_PMD_SIZE;
@@ -2286,15 +2286,15 @@  static int khugepaged_wait_event(void)
 
 static void khugepaged_do_scan(struct collapse_control *cc)
 {
-	struct page *hpage = NULL;
 	unsigned int progress = 0, pass_through_head = 0;
 	unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
 	bool wait = true;
 
+	cc->hpage = NULL;
 	lru_add_drain_all();
 
 	while (progress < pages) {
-		if (!khugepaged_prealloc_page(&hpage, &wait))
+		if (!khugepaged_prealloc_page(&cc->hpage, &wait))
 			break;
 
 		cond_resched();
@@ -2308,14 +2308,14 @@  static void khugepaged_do_scan(struct collapse_control *cc)
 		if (khugepaged_has_work() &&
 		    pass_through_head < 2)
 			progress += khugepaged_scan_mm_slot(pages - progress,
-							    &hpage, cc);
+							    cc);
 		else
 			progress = pages;
 		spin_unlock(&khugepaged_mm_lock);
 	}
 
-	if (!IS_ERR_OR_NULL(hpage))
-		put_page(hpage);
+	if (!IS_ERR_OR_NULL(cc->hpage))
+		put_page(cc->hpage);
 }
 
 static bool khugepaged_should_wakeup(void)
@@ -2349,6 +2349,8 @@  static int khugepaged(void *none)
 	struct mm_slot *mm_slot;
 	struct collapse_control cc = {
 		.last_target_node = NUMA_NO_NODE,
+		.gfp = &alloc_hugepage_khugepaged_gfpmask,
+		.alloc_hpage = &khugepaged_alloc_page,
 	};
 
 	set_freezable();