
[v2,1/2] mm/khugepaged: attempt to map anonymous pte-mapped THPs by pmds

Message ID 0919956ecd2b7052fa308a93397fd1e85806e091.1701917546.git.xuyu@linux.alibaba.com (mailing list archive)
State New
Series: attempt to map anonymous pte-mapped THPs by pmds

Commit Message

Xu Yu Dec. 7, 2023, 3:09 a.m. UTC
In the anonymous collapse path, khugepaged always collapses a pte-mapped
hugepage by allocating a new hugepage and copying into it.

In some scenarios, we only need to update the page tables that map an
anonymous pte-mapped THP, in the same way as for file/shmem-backed
pte-mapped THPs, as done in commit 58ac9a8993a1 ("mm/khugepaged:
attempt to map file/shmem-backed pte-mapped THPs by pmds").

The simplest scenario that satisfies the conditions, as David points out,
is when no subpage is PageAnonExclusive (the PTEs must be R/O); in that
case we can collapse into a R/O PMD without further action.

Let's start from this simplest scenario.

Signed-off-by: Xu Yu <xuyu@linux.alibaba.com>
---
 mm/khugepaged.c | 214 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 214 insertions(+)

Comments

Xu Yu Dec. 7, 2023, 7:47 a.m. UTC | #1
On 12/7/23 11:09 AM, Xu Yu wrote:
> In the anonymous collapse path, khugepaged always collapses pte-mapped
> hugepage by allocating and copying to a new hugepage.
> 
> In some scenarios, we can only update the mapping page tables for
> anonymous pte-mapped THPs, in the same way as file/shmem-backed
> pte-mapped THPs, as shown in commit 58ac9a8993a1 ("mm/khugepaged:
> attempt to map file/shmem-backed pte-mapped THPs by pmds")
> 
> The simplest scenario that satisfies the conditions, as David points out,
> is when no subpages are PageAnonExclusive (PTEs must be R/O), we can
> collapse into a R/O PMD without further action.
> 
> Let's start from this simplest scenario.
> 
> Signed-off-by: Xu Yu <xuyu@linux.alibaba.com>
> ---
>   mm/khugepaged.c | 214 ++++++++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 214 insertions(+)
> 
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 88433cc25d8a..85c7a2ab44ce 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1237,6 +1237,197 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>   	return result;
>   }
>   
> +static struct page *find_lock_pte_mapped_page(struct vm_area_struct *vma,
> +					      unsigned long addr, pmd_t *pmd)
> +{
> +	pte_t *pte, pteval;
> +	struct page *page = NULL;
> +
> +	pte = pte_offset_map(pmd, addr);
> +	if (!pte)
> +		return NULL;
> +
> +	pteval = ptep_get_lockless(pte);
> +	if (pte_none(pteval) || !pte_present(pteval))
> +		goto out;
> +
> +	page = vm_normal_page(vma, addr, pteval);
> +	if (unlikely(!page) || unlikely(is_zone_device_page(page)))
> +		goto out;
> +
> +	page = compound_head(page);
> +
> +	if (!trylock_page(page)) {
> +		page = NULL;
> +		goto out;
> +	}
> +
> +	if (!get_page_unless_zero(page)) {
> +		unlock_page(page);
> +		page = NULL;
> +		goto out;
> +	}
> +
> +out:
> +	pte_unmap(pte);
> +	return page;
> +}
> +
> +static int collapse_pte_mapped_anon_thp(struct mm_struct *mm,
> +				struct vm_area_struct *vma,
> +				unsigned long haddr, bool *mmap_locked,
> +				struct collapse_control *cc)
> +{
> +	struct mmu_notifier_range range;
> +	struct page *hpage;
> +	pte_t *start_pte, *pte;
> +	pmd_t *pmd, pmdval;
> +	spinlock_t *pml, *ptl;
> +	pgtable_t pgtable;
> +	unsigned long addr;
> +	int exclusive = 0;
> +	bool writable = false;
> +	int result, i;
> +
> +	/* Fast check before locking page if already PMD-mapped */
> +	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
> +	if (result == SCAN_PMD_MAPPED)
> +		return result;
> +
> +	hpage = find_lock_pte_mapped_page(vma, haddr, pmd);
> +	if (!hpage)
> +		return SCAN_PAGE_NULL;
> +	if (!PageHead(hpage)) {
> +		result = SCAN_FAIL;
> +		goto drop_hpage;
> +	}
> +	if (compound_order(hpage) != HPAGE_PMD_ORDER) {
> +		result = SCAN_PAGE_COMPOUND;
> +		goto drop_hpage;
> +	}
> +
> +	mmap_read_unlock(mm);
> +	*mmap_locked = false;
> +
> +	/* Prevent all access to pagetables */
> +	mmap_write_lock(mm);
> +
> +	result = hugepage_vma_revalidate(mm, haddr, true, &vma, cc);
> +	if (result != SCAN_SUCCEED)
> +		goto up_write;
> +
> +	result = check_pmd_still_valid(mm, haddr, pmd);
> +	if (result != SCAN_SUCCEED)
> +		goto up_write;
> +
> +	/* Recheck with mmap write lock */
> +	result = SCAN_SUCCEED;
> +	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
> +	if (!start_pte)
> +		goto drop_hpage;
                      ^^^^^^^^^^ should be up_write.
> +	for (i = 0, addr = haddr, pte = start_pte;
> +	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
> +		struct page *page;
> +		pte_t pteval = ptep_get(pte);
> +
> +		if (pte_none(pteval) || !pte_present(pteval)) {
> +			result = SCAN_PTE_NON_PRESENT;
> +			break;
> +		}
> +
> +		if (pte_uffd_wp(pteval)) {
> +			result = SCAN_PTE_UFFD_WP;
> +			break;
> +		}
> +
> +		if (pte_write(pteval))
> +			writable = true;
> +
> +		page = vm_normal_page(vma, addr, pteval);
> +
> +		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
> +			result = SCAN_PAGE_NULL;
> +			break;
> +		}
> +
> +		if (hpage + i != page) {
> +			result = SCAN_FAIL;
> +			break;
> +		}
> +
> +		if (PageAnonExclusive(page))
> +			exclusive++;
> +	}
> +	pte_unmap_unlock(start_pte, ptl);
> +	if (result != SCAN_SUCCEED)
> +		goto drop_hpage;
                      ^^^^^^^^^^ should be up_write.
> +
> +	/*
> +	 * Case 1:
> +	 * No subpages are PageAnonExclusive (PTEs must be R/O), we can
> +	 * collapse into a R/O PMD without further action.
> +	 */
> +	if (!(exclusive == 0 && !writable))
> +		goto drop_hpage;
                      ^^^^^^^^^^ should be up_write.
> +
> +	/* Collapse pmd entry */
> +	vma_start_write(vma);
> +	anon_vma_lock_write(vma->anon_vma);
> +
> +	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
> +				haddr, haddr + HPAGE_PMD_SIZE);
> +	mmu_notifier_invalidate_range_start(&range);
> +
> +	pml = pmd_lock(mm, pmd); /* probably unnecessary */
> +	pmdval = pmdp_collapse_flush(vma, haddr, pmd);
> +	spin_unlock(pml);
> +	mmu_notifier_invalidate_range_end(&range);
> +	tlb_remove_table_sync_one();
> +
> +	anon_vma_unlock_write(vma->anon_vma);
> +
> +	start_pte = pte_offset_map_lock(mm, &pmdval, haddr, &ptl);
> +	if (!start_pte)
> +		goto rollback;
> +	for (i = 0, addr = haddr, pte = start_pte;
> +	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
> +		struct page *page;
> +		pte_t pteval = ptep_get(pte);
> +
> +		page = vm_normal_page(vma, addr, pteval);
> +		page_remove_rmap(page, vma, false);
> +	}
> +	pte_unmap_unlock(start_pte, ptl);
> +
> +	/* Install pmd entry */
> +	pgtable = pmd_pgtable(pmdval);
> +	pmdval = mk_huge_pmd(hpage, vma->vm_page_prot);
> +	spin_lock(pml);
> +	page_add_anon_rmap(hpage, vma, haddr, RMAP_COMPOUND);
> +	pgtable_trans_huge_deposit(mm, pmd, pgtable);
> +	set_pmd_at(mm, haddr, pmd, pmdval);
> +	update_mmu_cache_pmd(vma, haddr, pmd);
> +	spin_unlock(pml);
> +
> +	result = SCAN_SUCCEED;
> +	goto up_write;
> +
> +rollback:
> +	spin_lock(pml);
> +	pmd_populate(mm, pmd, pmd_pgtable(pmdval));
> +	spin_unlock(pml);
> +
> +up_write:
> +	mmap_write_unlock(mm);
> +
> +drop_hpage:
> +	unlock_page(hpage);
> +	put_page(hpage);
> +
> +	/* TODO: tracepoints */
> +	return result;
> +}
> +
>   static int hpage_collapse_scan_pmd(struct mm_struct *mm,
>   				   struct vm_area_struct *vma,
>   				   unsigned long address, bool *mmap_locked,
> @@ -1251,6 +1442,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
>   	spinlock_t *ptl;
>   	int node = NUMA_NO_NODE, unmapped = 0;
>   	bool writable = false;
> +	int exclusive = 0;
> +	bool is_hpage = false;
>   
>   	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
>   
> @@ -1333,8 +1526,14 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
>   			}
>   		}
>   
> +		if (PageAnonExclusive(page))
> +			exclusive++;
> +
>   		page = compound_head(page);
>   
> +		if (compound_order(page) == HPAGE_PMD_ORDER)
> +			is_hpage = true;
> +
>   		/*
>   		 * Record which node the original page is from and save this
>   		 * information to cc->node_load[].
> @@ -1396,7 +1595,22 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
>   	}
>   out_unmap:
>   	pte_unmap_unlock(pte, ptl);
> +
> +	if (is_hpage && (exclusive == 0 && !writable)) {
> +		int res;
> +
> +		res = collapse_pte_mapped_anon_thp(mm, vma, address,
> +						   mmap_locked, cc);
> +		if (res == SCAN_PMD_MAPPED || res == SCAN_SUCCEED) {
> +			result = res;
> +			goto out;
> +		}
> +
> +	}
> +
>   	if (result == SCAN_SUCCEED) {
> +		if (!*mmap_locked)
> +			mmap_read_lock(mm);
>   		result = collapse_huge_page(mm, address, referenced,
>   					    unmapped, cc);
>   		/* collapse_huge_page will return with the mmap_lock released */
David Hildenbrand Dec. 7, 2023, 10:37 a.m. UTC | #2
On 07.12.23 04:09, Xu Yu wrote:
> In the anonymous collapse path, khugepaged always collapses pte-mapped
> hugepage by allocating and copying to a new hugepage.
> 
> In some scenarios, we can only update the mapping page tables for
> anonymous pte-mapped THPs, in the same way as file/shmem-backed
> pte-mapped THPs, as shown in commit 58ac9a8993a1 ("mm/khugepaged:
> attempt to map file/shmem-backed pte-mapped THPs by pmds")
> 
> The simplest scenario that satisfies the conditions, as David points out,
> is when no subpages are PageAnonExclusive (PTEs must be R/O), we can
> collapse into a R/O PMD without further action.
> 
> Let's start from this simplest scenario.
> 
> Signed-off-by: Xu Yu <xuyu@linux.alibaba.com>
> ---
>   mm/khugepaged.c | 214 ++++++++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 214 insertions(+)
> 
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 88433cc25d8a..85c7a2ab44ce 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1237,6 +1237,197 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>   	return result;
>   }
>   
> +static struct page *find_lock_pte_mapped_page(struct vm_area_struct *vma,
> +					      unsigned long addr, pmd_t *pmd)
> +{
> +	pte_t *pte, pteval;
> +	struct page *page = NULL;
> +
> +	pte = pte_offset_map(pmd, addr);
> +	if (!pte)
> +		return NULL;
> +
> +	pteval = ptep_get_lockless(pte);
> +	if (pte_none(pteval) || !pte_present(pteval))
> +		goto out;
> +
> +	page = vm_normal_page(vma, addr, pteval);
> +	if (unlikely(!page) || unlikely(is_zone_device_page(page)))
> +		goto out;
> +
> +	page = compound_head(page);
> +
> +	if (!trylock_page(page)) {
> +		page = NULL;
> +		goto out;
> +	}
> +
> +	if (!get_page_unless_zero(page)) {
> +		unlock_page(page);
> +		page = NULL;
> +		goto out;
> +	}
> +
> +out:
> +	pte_unmap(pte);
> +	return page;
> +}
> +
> +static int collapse_pte_mapped_anon_thp(struct mm_struct *mm,
> +				struct vm_area_struct *vma,
> +				unsigned long haddr, bool *mmap_locked,
> +				struct collapse_control *cc)
> +{
> +	struct mmu_notifier_range range;
> +	struct page *hpage;
> +	pte_t *start_pte, *pte;
> +	pmd_t *pmd, pmdval;
> +	spinlock_t *pml, *ptl;
> +	pgtable_t pgtable;
> +	unsigned long addr;
> +	int exclusive = 0;
> +	bool writable = false;
> +	int result, i;
> +
> +	/* Fast check before locking page if already PMD-mapped */
> +	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
> +	if (result == SCAN_PMD_MAPPED)
> +		return result;
> +
> +	hpage = find_lock_pte_mapped_page(vma, haddr, pmd);
> +	if (!hpage)
> +		return SCAN_PAGE_NULL;
> +	if (!PageHead(hpage)) {
> +		result = SCAN_FAIL;
> +		goto drop_hpage;
> +	}
> +	if (compound_order(hpage) != HPAGE_PMD_ORDER) {
> +		result = SCAN_PAGE_COMPOUND;
> +		goto drop_hpage;
> +	}
> +
> +	mmap_read_unlock(mm);
> +	*mmap_locked = false;
> +
> +	/* Prevent all access to pagetables */
> +	mmap_write_lock(mm);
> +
> +	result = hugepage_vma_revalidate(mm, haddr, true, &vma, cc);
> +	if (result != SCAN_SUCCEED)
> +		goto up_write;
> +
> +	result = check_pmd_still_valid(mm, haddr, pmd);
> +	if (result != SCAN_SUCCEED)
> +		goto up_write;
> +
> +	/* Recheck with mmap write lock */
> +	result = SCAN_SUCCEED;
> +	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
> +	if (!start_pte)
> +		goto drop_hpage;
> +	for (i = 0, addr = haddr, pte = start_pte;
> +	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
> +		struct page *page;
> +		pte_t pteval = ptep_get(pte);
> +
> +		if (pte_none(pteval) || !pte_present(pteval)) {
> +			result = SCAN_PTE_NON_PRESENT;
> +			break;
> +		}
> +
> +		if (pte_uffd_wp(pteval)) {
> +			result = SCAN_PTE_UFFD_WP;
> +			break;
> +		}
> +
> +		if (pte_write(pteval))
> +			writable = true;
> +
> +		page = vm_normal_page(vma, addr, pteval);
> +
> +		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
> +			result = SCAN_PAGE_NULL;
> +			break;
> +		}
> +
> +		if (hpage + i != page) {
> +			result = SCAN_FAIL;
> +			break;
> +		}
> +
> +		if (PageAnonExclusive(page))
> +			exclusive++;
> +	}
> +	pte_unmap_unlock(start_pte, ptl);
> +	if (result != SCAN_SUCCEED)
> +		goto drop_hpage;
> +

I'm wondering if anybody can now modify the page table. Likely you also 
need the VMA write lock. The mmap write lock is no longer sufficient.

> +	/*
> +	 * Case 1:
> +	 * No subpages are PageAnonExclusive (PTEs must be R/O), we can
> +	 * collapse into a R/O PMD without further action.
> +	 */
> +	if (!(exclusive == 0 && !writable))
> +		goto drop_hpage;
> +
> +	/* Collapse pmd entry */
> +	vma_start_write(vma);
> +	anon_vma_lock_write(vma->anon_vma);
> +
> +	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
> +				haddr, haddr + HPAGE_PMD_SIZE);
> +	mmu_notifier_invalidate_range_start(&range);
> +
> +	pml = pmd_lock(mm, pmd); /* probably unnecessary */
> +	pmdval = pmdp_collapse_flush(vma, haddr, pmd);
> +	spin_unlock(pml);
> +	mmu_notifier_invalidate_range_end(&range);
> +	tlb_remove_table_sync_one();
> +
> +	anon_vma_unlock_write(vma->anon_vma);
> +
> +	start_pte = pte_offset_map_lock(mm, &pmdval, haddr, &ptl);
> +	if (!start_pte)
> +		goto rollback;

How can that happen?

> +	for (i = 0, addr = haddr, pte = start_pte;
> +	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
> +		struct page *page;
> +		pte_t pteval = ptep_get(pte);
> +
> +		page = vm_normal_page(vma, addr, pteval);
> +		page_remove_rmap(page, vma, false);
> +	}
> +	pte_unmap_unlock(start_pte, ptl);
> +
> +	/* Install pmd entry */
> +	pgtable = pmd_pgtable(pmdval);
> +	pmdval = mk_huge_pmd(hpage, vma->vm_page_prot);
> +	spin_lock(pml);
> +	page_add_anon_rmap(hpage, vma, haddr, RMAP_COMPOUND);
> +	pgtable_trans_huge_deposit(mm, pmd, pgtable);
> +	set_pmd_at(mm, haddr, pmd, pmdval);
> +	update_mmu_cache_pmd(vma, haddr, pmd);
> +	spin_unlock(pml);

1) Looks like you are missing the refcount update.

You should take one reference before adding the new rmap, and drop the 
other references when dropping the rmaps.

2) You should drop the old rmaps after obtaining the new rmap (+reference)

3) You should use folios in the new code.

4) You'll soon be able to use rmap batching [1] and avoid looping over
    the pages once more manually.


Based on [1], the code flow likely should look something like this:


first_page = &folio->page;
...
folio_get(folio);
folio_add_anon_rmap_pmd(folio, first_page, RMAP_NONE);
...
folio_remove_rmap_ptes(folio, first_page, HPAGE_PMD_NR, vma);
folio_ref_sub(folio, HPAGE_PMD_NR);
...


[1] https://lkml.kernel.org/r/20231204142146.91437-24-david@redhat.com
Xu Yu Dec. 18, 2023, 2:45 a.m. UTC | #3
On 12/7/23 6:37 PM, David Hildenbrand wrote:
> On 07.12.23 04:09, Xu Yu wrote:
>> In the anonymous collapse path, khugepaged always collapses pte-mapped
>> hugepage by allocating and copying to a new hugepage.
>>
>> In some scenarios, we can only update the mapping page tables for
>> anonymous pte-mapped THPs, in the same way as file/shmem-backed
>> pte-mapped THPs, as shown in commit 58ac9a8993a1 ("mm/khugepaged:
>> attempt to map file/shmem-backed pte-mapped THPs by pmds")
>>
>> The simplest scenario that satisfies the conditions, as David points out,
>> is when no subpages are PageAnonExclusive (PTEs must be R/O), we can
>> collapse into a R/O PMD without further action.
>>
>> Let's start from this simplest scenario.
>>
>> Signed-off-by: Xu Yu <xuyu@linux.alibaba.com>
>> ---
>>   mm/khugepaged.c | 214 ++++++++++++++++++++++++++++++++++++++++++++++++
>>   1 file changed, 214 insertions(+)
>>
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index 88433cc25d8a..85c7a2ab44ce 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -1237,6 +1237,197 @@ static int collapse_huge_page(struct mm_struct *mm, 
>> unsigned long address,
>>       return result;
>>   }
>> +static struct page *find_lock_pte_mapped_page(struct vm_area_struct *vma,
>> +                          unsigned long addr, pmd_t *pmd)
>> +{
>> +    pte_t *pte, pteval;
>> +    struct page *page = NULL;
>> +
>> +    pte = pte_offset_map(pmd, addr);
>> +    if (!pte)
>> +        return NULL;
>> +
>> +    pteval = ptep_get_lockless(pte);
>> +    if (pte_none(pteval) || !pte_present(pteval))
>> +        goto out;
>> +
>> +    page = vm_normal_page(vma, addr, pteval);
>> +    if (unlikely(!page) || unlikely(is_zone_device_page(page)))
>> +        goto out;
>> +
>> +    page = compound_head(page);
>> +
>> +    if (!trylock_page(page)) {
>> +        page = NULL;
>> +        goto out;
>> +    }
>> +
>> +    if (!get_page_unless_zero(page)) {
>> +        unlock_page(page);
>> +        page = NULL;
>> +        goto out;
>> +    }
>> +
>> +out:
>> +    pte_unmap(pte);
>> +    return page;
>> +}
>> +
>> +static int collapse_pte_mapped_anon_thp(struct mm_struct *mm,
>> +                struct vm_area_struct *vma,
>> +                unsigned long haddr, bool *mmap_locked,
>> +                struct collapse_control *cc)
>> +{
>> +    struct mmu_notifier_range range;
>> +    struct page *hpage;
>> +    pte_t *start_pte, *pte;
>> +    pmd_t *pmd, pmdval;
>> +    spinlock_t *pml, *ptl;
>> +    pgtable_t pgtable;
>> +    unsigned long addr;
>> +    int exclusive = 0;
>> +    bool writable = false;
>> +    int result, i;
>> +
>> +    /* Fast check before locking page if already PMD-mapped */
>> +    result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
>> +    if (result == SCAN_PMD_MAPPED)
>> +        return result;
>> +
>> +    hpage = find_lock_pte_mapped_page(vma, haddr, pmd);
>> +    if (!hpage)
>> +        return SCAN_PAGE_NULL;
>> +    if (!PageHead(hpage)) {
>> +        result = SCAN_FAIL;
>> +        goto drop_hpage;
>> +    }
>> +    if (compound_order(hpage) != HPAGE_PMD_ORDER) {
>> +        result = SCAN_PAGE_COMPOUND;
>> +        goto drop_hpage;
>> +    }
>> +
>> +    mmap_read_unlock(mm);
>> +    *mmap_locked = false;
>> +
>> +    /* Prevent all access to pagetables */
>> +    mmap_write_lock(mm);
>> +
>> +    result = hugepage_vma_revalidate(mm, haddr, true, &vma, cc);
>> +    if (result != SCAN_SUCCEED)
>> +        goto up_write;
>> +
>> +    result = check_pmd_still_valid(mm, haddr, pmd);
>> +    if (result != SCAN_SUCCEED)
>> +        goto up_write;
>> +
>> +    /* Recheck with mmap write lock */
>> +    result = SCAN_SUCCEED;
>> +    start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
>> +    if (!start_pte)
>> +        goto drop_hpage;
>> +    for (i = 0, addr = haddr, pte = start_pte;
>> +         i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
>> +        struct page *page;
>> +        pte_t pteval = ptep_get(pte);
>> +
>> +        if (pte_none(pteval) || !pte_present(pteval)) {
>> +            result = SCAN_PTE_NON_PRESENT;
>> +            break;
>> +        }
>> +
>> +        if (pte_uffd_wp(pteval)) {
>> +            result = SCAN_PTE_UFFD_WP;
>> +            break;
>> +        }
>> +
>> +        if (pte_write(pteval))
>> +            writable = true;
>> +
>> +        page = vm_normal_page(vma, addr, pteval);
>> +
>> +        if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
>> +            result = SCAN_PAGE_NULL;
>> +            break;
>> +        }
>> +
>> +        if (hpage + i != page) {
>> +            result = SCAN_FAIL;
>> +            break;
>> +        }
>> +
>> +        if (PageAnonExclusive(page))
>> +            exclusive++;
>> +    }
>> +    pte_unmap_unlock(start_pte, ptl);
>> +    if (result != SCAN_SUCCEED)
>> +        goto drop_hpage;
>> +
> 
> I'm wondering if anybody can now modify the page table. Likely you also need the 
> VMA write lock. The mmap write lock is no longer sufficient.
> 
>> +    /*
>> +     * Case 1:
>> +     * No subpages are PageAnonExclusive (PTEs must be R/O), we can
>> +     * collapse into a R/O PMD without further action.
>> +     */
>> +    if (!(exclusive == 0 && !writable))
>> +        goto drop_hpage;
>> +
>> +    /* Collapse pmd entry */
>> +    vma_start_write(vma);

Will move vma_start_write() to the front, just after mmap write lock.
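
For example, the beginning of collapse_pte_mapped_anon_thp() could then look
roughly like this (just a sketch, not the final v3 code; since
hugepage_vma_revalidate() may refresh the vma pointer after the mmap lock is
re-taken, the VMA write lock is taken on the revalidated vma):

	mmap_write_lock(mm);

	result = hugepage_vma_revalidate(mm, haddr, true, &vma, cc);
	if (result != SCAN_SUCCEED)
		goto up_write;
	/* exclude page-table walkers that only hold the VMA read lock */
	vma_start_write(vma);

	result = check_pmd_still_valid(mm, haddr, pmd);
	if (result != SCAN_SUCCEED)
		goto up_write;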

>> +    anon_vma_lock_write(vma->anon_vma);
>> +
>> +    mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
>> +                haddr, haddr + HPAGE_PMD_SIZE);
>> +    mmu_notifier_invalidate_range_start(&range);
>> +
>> +    pml = pmd_lock(mm, pmd); /* probably unnecessary */
>> +    pmdval = pmdp_collapse_flush(vma, haddr, pmd);
>> +    spin_unlock(pml);
>> +    mmu_notifier_invalidate_range_end(&range);
>> +    tlb_remove_table_sync_one();
>> +
>> +    anon_vma_unlock_write(vma->anon_vma);
>> +
>> +    start_pte = pte_offset_map_lock(mm, &pmdval, haddr, &ptl);
>> +    if (!start_pte)
>> +        goto rollback;
> 
> How can that happen?

I referred to the abort logic in collapse_pte_mapped_thp(); I'm not sure whether
anyone can modify the page table at the same time, so I added a redundant check.

If mmap write lock + vma write lock + page lock is sufficient, I should delete
this check.

> 
>> +    for (i = 0, addr = haddr, pte = start_pte;
>> +         i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
>> +        struct page *page;
>> +        pte_t pteval = ptep_get(pte);
>> +
>> +        page = vm_normal_page(vma, addr, pteval);
>> +        page_remove_rmap(page, vma, false);
>> +    }
>> +    pte_unmap_unlock(start_pte, ptl);
>> +
>> +    /* Install pmd entry */
>> +    pgtable = pmd_pgtable(pmdval);
>> +    pmdval = mk_huge_pmd(hpage, vma->vm_page_prot);
>> +    spin_lock(pml);
>> +    page_add_anon_rmap(hpage, vma, haddr, RMAP_COMPOUND);
>> +    pgtable_trans_huge_deposit(mm, pmd, pgtable);
>> +    set_pmd_at(mm, haddr, pmd, pmdval);
>> +    update_mmu_cache_pmd(vma, haddr, pmd);
>> +    spin_unlock(pml);
> 
> 1) Looks like you are missing to update the refcount.
> 
> You should take one reference before adding the new rmap, and drop the other 
> references when dropping the rmaps.
> 
> 2) You should drop the old rmaps after obtaining the new rmap (+reference)
> 
> 3) You should use folios in the new code.

Many thanks! Will do in v3.

> 
> 4) You'll soon be able to use rmap batching [1] and avoid looping over
>     the pages once more manually.
> 
> 
> Based on [1], the code flow likely should look something like this:

Thanks again! I notice that [1] is part of the patchset ("mm/rmap: interface
overhaul"), which is still under review. Should I wait for that patchset to be
merged into mm-unstable or mm-stable, or can I send out v3 without this
helper first?

If you don't mind, I will send out v3 first.

> 
> 
> first_page = &folio->page;
> ...
> folio_get(folio);
> folio_add_anon_rmap_pmd(folio, first_page, RMAP_NONE);
> ...
> folio_remove_rmap_ptes(folio, first_page, HPAGE_PMD_NR, vma);
> folio_ref_sub(folio, HPAGE_PMD_NR);
> ...
> 
> 
> [1] https://lkml.kernel.org/r/20231204142146.91437-24-david@redhat.com
>
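
Putting points 1)-3) together with the flow above, the teardown/install step in
v3 might end up roughly like the following (just a sketch; folio_add_anon_rmap_pmd()
and folio_remove_rmap_ptes() are the helpers from the rmap overhaul series [1],
so their final signatures may differ):

	struct folio *folio = page_folio(hpage);
	struct page *first_page = &folio->page;

	/* 1) take the reference and rmap for the new PMD mapping first */
	folio_get(folio);
	folio_add_anon_rmap_pmd(folio, first_page, RMAP_NONE);

	/* 2) only then drop the old PTE rmaps and their per-page references */
	folio_remove_rmap_ptes(folio, first_page, HPAGE_PMD_NR, vma);
	folio_ref_sub(folio, HPAGE_PMD_NR);

	/* 3) install the huge PMD, reusing the old page table as before */
	pgtable = pmd_pgtable(pmdval);
	pmdval = mk_huge_pmd(first_page, vma->vm_page_prot);
	spin_lock(pml);
	pgtable_trans_huge_deposit(mm, pmd, pgtable);
	set_pmd_at(mm, haddr, pmd, pmdval);
	update_mmu_cache_pmd(vma, haddr, pmd);
	spin_unlock(pml);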

Patch

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 88433cc25d8a..85c7a2ab44ce 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1237,6 +1237,197 @@  static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	return result;
 }
 
+static struct page *find_lock_pte_mapped_page(struct vm_area_struct *vma,
+					      unsigned long addr, pmd_t *pmd)
+{
+	pte_t *pte, pteval;
+	struct page *page = NULL;
+
+	pte = pte_offset_map(pmd, addr);
+	if (!pte)
+		return NULL;
+
+	pteval = ptep_get_lockless(pte);
+	if (pte_none(pteval) || !pte_present(pteval))
+		goto out;
+
+	page = vm_normal_page(vma, addr, pteval);
+	if (unlikely(!page) || unlikely(is_zone_device_page(page)))
+		goto out;
+
+	page = compound_head(page);
+
+	if (!trylock_page(page)) {
+		page = NULL;
+		goto out;
+	}
+
+	if (!get_page_unless_zero(page)) {
+		unlock_page(page);
+		page = NULL;
+		goto out;
+	}
+
+out:
+	pte_unmap(pte);
+	return page;
+}
+
+static int collapse_pte_mapped_anon_thp(struct mm_struct *mm,
+				struct vm_area_struct *vma,
+				unsigned long haddr, bool *mmap_locked,
+				struct collapse_control *cc)
+{
+	struct mmu_notifier_range range;
+	struct page *hpage;
+	pte_t *start_pte, *pte;
+	pmd_t *pmd, pmdval;
+	spinlock_t *pml, *ptl;
+	pgtable_t pgtable;
+	unsigned long addr;
+	int exclusive = 0;
+	bool writable = false;
+	int result, i;
+
+	/* Fast check before locking page if already PMD-mapped */
+	result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
+	if (result == SCAN_PMD_MAPPED)
+		return result;
+
+	hpage = find_lock_pte_mapped_page(vma, haddr, pmd);
+	if (!hpage)
+		return SCAN_PAGE_NULL;
+	if (!PageHead(hpage)) {
+		result = SCAN_FAIL;
+		goto drop_hpage;
+	}
+	if (compound_order(hpage) != HPAGE_PMD_ORDER) {
+		result = SCAN_PAGE_COMPOUND;
+		goto drop_hpage;
+	}
+
+	mmap_read_unlock(mm);
+	*mmap_locked = false;
+
+	/* Prevent all access to pagetables */
+	mmap_write_lock(mm);
+
+	result = hugepage_vma_revalidate(mm, haddr, true, &vma, cc);
+	if (result != SCAN_SUCCEED)
+		goto up_write;
+
+	result = check_pmd_still_valid(mm, haddr, pmd);
+	if (result != SCAN_SUCCEED)
+		goto up_write;
+
+	/* Recheck with mmap write lock */
+	result = SCAN_SUCCEED;
+	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+	if (!start_pte)
+		goto drop_hpage;
+	for (i = 0, addr = haddr, pte = start_pte;
+	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+		struct page *page;
+		pte_t pteval = ptep_get(pte);
+
+		if (pte_none(pteval) || !pte_present(pteval)) {
+			result = SCAN_PTE_NON_PRESENT;
+			break;
+		}
+
+		if (pte_uffd_wp(pteval)) {
+			result = SCAN_PTE_UFFD_WP;
+			break;
+		}
+
+		if (pte_write(pteval))
+			writable = true;
+
+		page = vm_normal_page(vma, addr, pteval);
+
+		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
+			result = SCAN_PAGE_NULL;
+			break;
+		}
+
+		if (hpage + i != page) {
+			result = SCAN_FAIL;
+			break;
+		}
+
+		if (PageAnonExclusive(page))
+			exclusive++;
+	}
+	pte_unmap_unlock(start_pte, ptl);
+	if (result != SCAN_SUCCEED)
+		goto drop_hpage;
+
+	/*
+	 * Case 1:
+	 * No subpages are PageAnonExclusive (PTEs must be R/O), we can
+	 * collapse into a R/O PMD without further action.
+	 */
+	if (!(exclusive == 0 && !writable))
+		goto drop_hpage;
+
+	/* Collapse pmd entry */
+	vma_start_write(vma);
+	anon_vma_lock_write(vma->anon_vma);
+
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
+				haddr, haddr + HPAGE_PMD_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
+
+	pml = pmd_lock(mm, pmd); /* probably unnecessary */
+	pmdval = pmdp_collapse_flush(vma, haddr, pmd);
+	spin_unlock(pml);
+	mmu_notifier_invalidate_range_end(&range);
+	tlb_remove_table_sync_one();
+
+	anon_vma_unlock_write(vma->anon_vma);
+
+	start_pte = pte_offset_map_lock(mm, &pmdval, haddr, &ptl);
+	if (!start_pte)
+		goto rollback;
+	for (i = 0, addr = haddr, pte = start_pte;
+	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+		struct page *page;
+		pte_t pteval = ptep_get(pte);
+
+		page = vm_normal_page(vma, addr, pteval);
+		page_remove_rmap(page, vma, false);
+	}
+	pte_unmap_unlock(start_pte, ptl);
+
+	/* Install pmd entry */
+	pgtable = pmd_pgtable(pmdval);
+	pmdval = mk_huge_pmd(hpage, vma->vm_page_prot);
+	spin_lock(pml);
+	page_add_anon_rmap(hpage, vma, haddr, RMAP_COMPOUND);
+	pgtable_trans_huge_deposit(mm, pmd, pgtable);
+	set_pmd_at(mm, haddr, pmd, pmdval);
+	update_mmu_cache_pmd(vma, haddr, pmd);
+	spin_unlock(pml);
+
+	result = SCAN_SUCCEED;
+	goto up_write;
+
+rollback:
+	spin_lock(pml);
+	pmd_populate(mm, pmd, pmd_pgtable(pmdval));
+	spin_unlock(pml);
+
+up_write:
+	mmap_write_unlock(mm);
+
+drop_hpage:
+	unlock_page(hpage);
+	put_page(hpage);
+
+	/* TODO: tracepoints */
+	return result;
+}
+
 static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 				   struct vm_area_struct *vma,
 				   unsigned long address, bool *mmap_locked,
@@ -1251,6 +1442,8 @@  static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 	spinlock_t *ptl;
 	int node = NUMA_NO_NODE, unmapped = 0;
 	bool writable = false;
+	int exclusive = 0;
+	bool is_hpage = false;
 
 	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
 
@@ -1333,8 +1526,14 @@  static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 			}
 		}
 
+		if (PageAnonExclusive(page))
+			exclusive++;
+
 		page = compound_head(page);
 
+		if (compound_order(page) == HPAGE_PMD_ORDER)
+			is_hpage = true;
+
 		/*
 		 * Record which node the original page is from and save this
 		 * information to cc->node_load[].
@@ -1396,7 +1595,22 @@  static int hpage_collapse_scan_pmd(struct mm_struct *mm,
 	}
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
+
+	if (is_hpage && (exclusive == 0 && !writable)) {
+		int res;
+
+		res = collapse_pte_mapped_anon_thp(mm, vma, address,
+						   mmap_locked, cc);
+		if (res == SCAN_PMD_MAPPED || res == SCAN_SUCCEED) {
+			result = res;
+			goto out;
+		}
+
+	}
+
 	if (result == SCAN_SUCCEED) {
+		if (!*mmap_locked)
+			mmap_read_lock(mm);
 		result = collapse_huge_page(mm, address, referenced,
 					    unmapped, cc);
 		/* collapse_huge_page will return with the mmap_lock released */