
[RFC,05/12] khugepaged: Generalize __collapse_huge_page_isolate()

Message ID 20241216165105.56185-6-dev.jain@arm.com
State New
Series khugepaged: Asynchronous mTHP collapse

Commit Message

Dev Jain Dec. 16, 2024, 4:50 p.m. UTC
Scale the scan range and the sysfs tunables (khugepaged_max_ptes_none,
khugepaged_max_ptes_shared) down according to the scan order, and isolate
the folios.

Signed-off-by: Dev Jain <dev.jain@arm.com>
---
 mm/khugepaged.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

Comments

Matthew Wilcox (Oracle) Dec. 17, 2024, 4:32 a.m. UTC | #1
On Mon, Dec 16, 2024 at 10:20:58PM +0530, Dev Jain wrote:
>  {
> -	struct page *page = NULL;
> -	struct folio *folio = NULL;
> -	pte_t *_pte;
> +	unsigned int max_ptes_shared = khugepaged_max_ptes_shared >> (HPAGE_PMD_ORDER - order);
> +	unsigned int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
>  	int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
> +	struct folio *folio = NULL;
> +	struct page *page = NULL;

why are you moving variables around unnecessarily?

>  	bool writable = false;
> +	pte_t *_pte;
>  
> -	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
> +
> +	for (_pte = pte; _pte < pte + (1UL << order);

spurious blank line


also you might want to finish off the page->folio conversion in this
function first; we have a vm_normal_folio() now.
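For reference, the folio variant mirrors vm_normal_page() (signature as in
current include/linux/mm.h; double-check your tree):

	struct folio *vm_normal_folio(struct vm_area_struct *vma,
				      unsigned long addr, pte_t pte);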
Dev Jain Dec. 17, 2024, 6:41 a.m. UTC | #2
On 17/12/24 10:02 am, Matthew Wilcox wrote:
> On Mon, Dec 16, 2024 at 10:20:58PM +0530, Dev Jain wrote:
>>   {
>> -	struct page *page = NULL;
>> -	struct folio *folio = NULL;
>> -	pte_t *_pte;
>> +	unsigned int max_ptes_shared = khugepaged_max_ptes_shared >> (HPAGE_PMD_ORDER - order);
>> +	unsigned int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
>>   	int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
>> +	struct folio *folio = NULL;
>> +	struct page *page = NULL;
> why are you moving variables around unnecessarily?

In a previous series, I moved code around and David asked me to arrange the
declarations in reverse Xmas tree order. I assumed (?) that doing so does not
spoil git history, so if this reordering feels like churn, I will revert.
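For reference, reverse Xmas tree ordering just means longest declaration line
first, shortest last, which is what the hunk above ends up with:

	unsigned int max_ptes_shared = khugepaged_max_ptes_shared >> (HPAGE_PMD_ORDER - order);
	unsigned int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
	int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
	struct folio *folio = NULL;
	struct page *page = NULL;
	bool writable = false;
	pte_t *_pte;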

>
>>   	bool writable = false;
>> +	pte_t *_pte;
>>   
>> -	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
>> +
>> +	for (_pte = pte; _pte < pte + (1UL << order);
> spurious blank line

My bad

>
>
> also you might want to finish off the page->folio conversion in this
> function first; we have a vm_normal_folio() now.

I did not add any code before we derive the folio...I'm sorry, I don't get what you mean...
Ryan Roberts Dec. 17, 2024, 5:09 p.m. UTC | #3
On 16/12/2024 16:50, Dev Jain wrote:
> Scale the scan range and the sysfs tunables (khugepaged_max_ptes_none,
> khugepaged_max_ptes_shared) down according to the scan order, and isolate
> the folios.
> 
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> ---
>  mm/khugepaged.c | 19 +++++++++++--------
>  1 file changed, 11 insertions(+), 8 deletions(-)
> 
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index f52dae7d5179..de044b1f83d4 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -564,15 +564,18 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  					unsigned long address,
>  					pte_t *pte,
>  					struct collapse_control *cc,
> -					struct list_head *compound_pagelist)
> +					struct list_head *compound_pagelist, int order)
>  {
> -	struct page *page = NULL;
> -	struct folio *folio = NULL;
> -	pte_t *_pte;
> +	unsigned int max_ptes_shared = khugepaged_max_ptes_shared >> (HPAGE_PMD_ORDER - order);
> +	unsigned int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);

This is implicitly rounding down. I think that's the right thing to do; it's
better to be conservative.
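E.g. (standalone userspace sketch; the 511 default and HPAGE_PMD_ORDER = 9
assume a 4K base page / 2M PMD config):

	#include <stdio.h>

	#define HPAGE_PMD_ORDER	9

	int main(void)
	{
		unsigned int max_ptes_none = 511;	/* default: HPAGE_PMD_NR - 1 */
		int order = 4;				/* 64K mTHP scan */

		/* prints 15, i.e. floor(511 / 32); exact scaling would be ~15.97 */
		printf("%u\n", max_ptes_none >> (HPAGE_PMD_ORDER - order));
		return 0;
	}

so the scaled threshold always rounds down, never up.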

>  	int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
> +	struct folio *folio = NULL;
> +	struct page *page = NULL;
>  	bool writable = false;
> +	pte_t *_pte;
>  
> -	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
> +
> +	for (_pte = pte; _pte < pte + (1UL << order);
>  	     _pte++, address += PAGE_SIZE) {
>  		pte_t pteval = ptep_get(_pte);
>  		if (pte_none(pteval) || (pte_present(pteval) &&
> @@ -580,7 +583,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  			++none_or_zero;
>  			if (!userfaultfd_armed(vma) &&
>  			    (!cc->is_khugepaged ||
> -			     none_or_zero <= khugepaged_max_ptes_none)) {
> +			     none_or_zero <= max_ptes_none)) {
>  				continue;
>  			} else {
>  				result = SCAN_EXCEED_NONE_PTE;
> @@ -609,7 +612,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  		if (folio_likely_mapped_shared(folio)) {
>  			++shared;
>  			if (cc->is_khugepaged &&
> -			    shared > khugepaged_max_ptes_shared) {
> +			    shared > max_ptes_shared) {
>  				result = SCAN_EXCEED_SHARED_PTE;
>  				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
>  				goto out;
> @@ -1200,7 +1203,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>  	pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
>  	if (pte) {
>  		result = __collapse_huge_page_isolate(vma, address, pte, cc,
> -						      &compound_pagelist);
> +						      &compound_pagelist, order);
>  		spin_unlock(pte_ptl);
>  	} else {
>  		result = SCAN_PMD_NULL;
Ryan Roberts Dec. 17, 2024, 5:14 p.m. UTC | #4
On 17/12/2024 06:41, Dev Jain wrote:
> 
> On 17/12/24 10:02 am, Matthew Wilcox wrote:
>> On Mon, Dec 16, 2024 at 10:20:58PM +0530, Dev Jain wrote:
>>>   {
>>> -    struct page *page = NULL;
>>> -    struct folio *folio = NULL;
>>> -    pte_t *_pte;
>>> +    unsigned int max_ptes_shared = khugepaged_max_ptes_shared >>
>>> (HPAGE_PMD_ORDER - order);
>>> +    unsigned int max_ptes_none = khugepaged_max_ptes_none >>
>>> (HPAGE_PMD_ORDER - order);
>>>       int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
>>> +    struct folio *folio = NULL;
>>> +    struct page *page = NULL;
>> why are you moving variables around unnecessarily?
> 
> In a previous series, I moved code around and David asked me to arrange the
> declarations in reverse Xmas tree order. I assumed (?) that doing so does not
> spoil git history, so if this reordering feels like churn, I will revert.
> 
>>
>>>       bool writable = false;
>>> +    pte_t *_pte;
>>>
>>> -    for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
>>> +
>>> +    for (_pte = pte; _pte < pte + (1UL << order);
>> spurious blank line
> 
> My bad
> 
>>
>>
>> also you might want to finish off the page->folio conversion in this
>> function first; we have a vm_normal_folio() now.
> 
> I did not add any code before we derive the folio...I'm sorry, I don't get what
> you mean...
> 

I think Matthew is suggesting helping out with the page to folio conversion
work while you are working on this function. It would amount to a patch that
does something like this:

----8<-----
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ffc4d5aef991..d94e05754140 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -568,7 +568,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 	unsigned int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
 	int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
 	struct folio *folio = NULL;
-	struct page *page = NULL;
 	bool writable = false;
 	pte_t *_pte;
 
@@ -597,13 +596,12 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 			result = SCAN_PTE_UFFD_WP;
 			goto out;
 		}
-		page = vm_normal_page(vma, address, pteval);
-		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
+		folio = vm_normal_folio(vma, address, pteval);
+		if (unlikely(!folio) || unlikely(folio_is_zone_device(folio))) {
 			result = SCAN_PAGE_NULL;
 			goto out;
 		}
 
-		folio = page_folio(page);
 		VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);
 
 		if (order != HPAGE_PMD_ORDER && folio_order(folio) >= order) {
----8<-----
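Note that the two helpers used above, vm_normal_folio() and
folio_is_zone_device(), already exist upstream, so the intermediate
page_folio() call can go away entirely.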

Patch

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index f52dae7d5179..de044b1f83d4 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -564,15 +564,18 @@  static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 					unsigned long address,
 					pte_t *pte,
 					struct collapse_control *cc,
-					struct list_head *compound_pagelist)
+					struct list_head *compound_pagelist, int order)
 {
-	struct page *page = NULL;
-	struct folio *folio = NULL;
-	pte_t *_pte;
+	unsigned int max_ptes_shared = khugepaged_max_ptes_shared >> (HPAGE_PMD_ORDER - order);
+	unsigned int max_ptes_none = khugepaged_max_ptes_none >> (HPAGE_PMD_ORDER - order);
 	int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
+	struct folio *folio = NULL;
+	struct page *page = NULL;
 	bool writable = false;
+	pte_t *_pte;
 
-	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
+
+	for (_pte = pte; _pte < pte + (1UL << order);
 	     _pte++, address += PAGE_SIZE) {
 		pte_t pteval = ptep_get(_pte);
 		if (pte_none(pteval) || (pte_present(pteval) &&
@@ -580,7 +583,7 @@  static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 			++none_or_zero;
 			if (!userfaultfd_armed(vma) &&
 			    (!cc->is_khugepaged ||
-			     none_or_zero <= khugepaged_max_ptes_none)) {
+			     none_or_zero <= max_ptes_none)) {
 				continue;
 			} else {
 				result = SCAN_EXCEED_NONE_PTE;
@@ -609,7 +612,7 @@  static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		if (folio_likely_mapped_shared(folio)) {
 			++shared;
 			if (cc->is_khugepaged &&
-			    shared > khugepaged_max_ptes_shared) {
+			    shared > max_ptes_shared) {
 				result = SCAN_EXCEED_SHARED_PTE;
 				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
 				goto out;
@@ -1200,7 +1203,7 @@  static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
 	if (pte) {
 		result = __collapse_huge_page_isolate(vma, address, pte, cc,
-						      &compound_pagelist);
+						      &compound_pagelist, order);
 		spin_unlock(pte_ptl);
 	} else {
 		result = SCAN_PMD_NULL;
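
A quick sanity check on the new loop bound (illustrative arithmetic, assuming
a 4K base page / 2M PMD config): for a full PMD collapse, order ==
HPAGE_PMD_ORDER == 9, so 1UL << order == 512 == HPAGE_PMD_NR and behaviour is
unchanged; for an order-4 (64K) mTHP the scan range shrinks to 16 PTEs.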