Message ID | 20240911065600.1002644-2-dev.jain@arm.com (mailing list archive)
---|---
State | New
Series | Do not shatter hugezeropage on wp-fault
On 11.09.24 08:55, Dev Jain wrote: > In preparation for the second patch, abstract away the THP allocation > logic present in the create_huge_pmd() path, which corresponds to the > faulting case when no page is present. > > There should be no functional change as a result of applying > this patch. > Hi, > Signed-off-by: Dev Jain <dev.jain@arm.com> > --- > mm/huge_memory.c | 110 +++++++++++++++++++++++++++++------------------ > 1 file changed, 67 insertions(+), 43 deletions(-) > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > index 67c86a5d64a6..b96a1ff2bf40 100644 > --- a/mm/huge_memory.c > +++ b/mm/huge_memory.c > @@ -943,47 +943,88 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, > } > EXPORT_SYMBOL_GPL(thp_get_unmapped_area); > > -static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, > - struct page *page, gfp_t gfp) > +static struct folio *pmd_thp_fault_alloc(gfp_t gfp, struct vm_area_struct *vma, > + unsigned long haddr, unsigned long addr) I suggest calling this something like "vma_alloc_anon_folio_pmd()"? Then it's more consistent with vma_alloc_folio(). Also, likely we should just only pass in "addr" and calculate "haddr" ourselves, it's cheap and reduces the number of function parameters. > { > - struct vm_area_struct *vma = vmf->vma; > - struct folio *folio = page_folio(page); > - pgtable_t pgtable; > - unsigned long haddr = vmf->address & HPAGE_PMD_MASK; > - vm_fault_t ret = 0; > + const int order = HPAGE_PMD_ORDER; > + struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true); > > - VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); > + if (unlikely(!folio)) { > + count_vm_event(THP_FAULT_FALLBACK); > + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); > + goto out; > + } > > + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); > if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { > folio_put(folio); > count_vm_event(THP_FAULT_FALLBACK); > count_vm_event(THP_FAULT_FALLBACK_CHARGE); > - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); > - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); > - return VM_FAULT_FALLBACK; > + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); > + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); > + goto out; > } > folio_throttle_swaprate(folio, gfp); > > - pgtable = pte_alloc_one(vma->vm_mm); > - if (unlikely(!pgtable)) { > - ret = VM_FAULT_OOM; > - goto release; > - } > - > - folio_zero_user(folio, vmf->address); > + folio_zero_user(folio, addr); > /* > * The memory barrier inside __folio_mark_uptodate makes sure that > * folio_zero_user writes become visible before the set_pmd_at() > * write. > */ > __folio_mark_uptodate(folio); > +out: > + return folio; > +} > + > +static void __pmd_thp_fault_success_stats(struct vm_area_struct *vma) > +{ > + count_vm_event(THP_FAULT_ALLOC); > + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); > + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); > +} Why isn't that moved into map_pmd_thp() Note that in this patch you do: map_pmd_thp(folio, vmf, vma, haddr); spin_unlock(vmf->ptl); __pmd_thp_fault_success_stats(vma); But in patch #2 map_pmd_thp(folio, vmf, vma, haddr); __pmd_thp_fault_success_stats(vma); goto unlock; release: folio_put(folio); unlock: spin_unlock(vmf->ptl); Please make that consistent, meaning: 1) Inline __pmd_thp_fault_success_stats() into map_pmd_thp(). No need to have the separated out. 2) Either do the PTL unlocking in __pmd_thp_fault_success_stats() or in the caller. 
In the caller is likely easiest. Adjusting the counters should be cheap, if not we could revisit this later with real data. > + > +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf, > + struct vm_area_struct *vma, unsigned long haddr) > +{ > + pmd_t entry; > + > + entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); > + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); > + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); > + folio_add_lru_vma(folio, vma); > + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); > + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); It's quite weird to see a mixture of haddr and vmf->address, and likely this mixture is wrong or not not required. Looking at arc's update_mmu_cache_pmd() implementation, I cannot see how passing in the unaligned address would do the right thing. But maybe arc also doesn't trigger that code path ... who knows :) Staring at some other update_mmu_cache_pmd() users, it's quite inconsistent. Primarily only do_huge_pmd_numa_page() and __do_huge_pmd_anonymous_page() use the unaligned address. The others seem to use the aligned address ... as one would expect when modifying a PMD. I suggest to change this function to *not* pass in the vmf, and rename it to something like: static void folio_map_anon_pmd(struct folio *folio, struct vm_area_struct *vma, pmd_t *pmd, unsigned long haddr) Then use haddr also to do the update_mmu_cache_pmd(). > + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); > + mm_inc_nr_ptes(vma->vm_mm); > +} > + > +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) > +{ > + struct vm_area_struct *vma = vmf->vma; > + struct folio *folio; > + pgtable_t pgtable; > + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; > + vm_fault_t ret = 0; > + gfp_t gfp = vma_thp_gfp_mask(vma); Nit: While at it, try to use reverse christmas-tree where possible, makes things more reasible. You could make haddr const. struct vm_area_struct *vma = vmf->vma; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; gfp_t gfp = vma_thp_gfp_mask(vma); struct folio *folio; vm_fault_t ret = 0; ... > + > + folio = pmd_thp_fault_alloc(gfp, vma, haddr, vmf->address); > + if (unlikely(!folio)) { > + ret = VM_FAULT_FALLBACK; > + goto release; > + } > + > + pgtable = pte_alloc_one(vma->vm_mm); > + if (unlikely(!pgtable)) { > + ret = VM_FAULT_OOM; > + goto release; > + } > > vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); > + Nit Unrelated change. > if (unlikely(!pmd_none(*vmf->pmd))) { > goto unlock_release;
On 11.09.24 11:27, David Hildenbrand wrote: > On 11.09.24 08:55, Dev Jain wrote: >> In preparation for the second patch, abstract away the THP allocation >> logic present in the create_huge_pmd() path, which corresponds to the >> faulting case when no page is present. >> >> There should be no functional change as a result of applying >> this patch. >> > > Hi, > >> Signed-off-by: Dev Jain <dev.jain@arm.com> >> --- >> mm/huge_memory.c | 110 +++++++++++++++++++++++++++++------------------ >> 1 file changed, 67 insertions(+), 43 deletions(-) >> >> diff --git a/mm/huge_memory.c b/mm/huge_memory.c >> index 67c86a5d64a6..b96a1ff2bf40 100644 >> --- a/mm/huge_memory.c >> +++ b/mm/huge_memory.c >> @@ -943,47 +943,88 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, >> } >> EXPORT_SYMBOL_GPL(thp_get_unmapped_area); >> >> -static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, >> - struct page *page, gfp_t gfp) >> +static struct folio *pmd_thp_fault_alloc(gfp_t gfp, struct vm_area_struct *vma, >> + unsigned long haddr, unsigned long addr) > > I suggest calling this something like "vma_alloc_anon_folio_pmd()"? Then > it's more consistent with vma_alloc_folio(). > > Also, likely we should just only pass in "addr" and calculate "haddr" > ourselves, it's cheap and reduces the number of function parameters. > >> { >> - struct vm_area_struct *vma = vmf->vma; >> - struct folio *folio = page_folio(page); >> - pgtable_t pgtable; >> - unsigned long haddr = vmf->address & HPAGE_PMD_MASK; >> - vm_fault_t ret = 0; >> + const int order = HPAGE_PMD_ORDER; >> + struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true); >> >> - VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); >> + if (unlikely(!folio)) { >> + count_vm_event(THP_FAULT_FALLBACK); >> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); >> + goto out; >> + } >> >> + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); >> if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { >> folio_put(folio); >> count_vm_event(THP_FAULT_FALLBACK); >> count_vm_event(THP_FAULT_FALLBACK_CHARGE); >> - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); >> - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); >> - return VM_FAULT_FALLBACK; >> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); >> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); >> + goto out; >> } >> folio_throttle_swaprate(folio, gfp); >> >> - pgtable = pte_alloc_one(vma->vm_mm); >> - if (unlikely(!pgtable)) { >> - ret = VM_FAULT_OOM; >> - goto release; >> - } >> - >> - folio_zero_user(folio, vmf->address); >> + folio_zero_user(folio, addr); >> /* >> * The memory barrier inside __folio_mark_uptodate makes sure that >> * folio_zero_user writes become visible before the set_pmd_at() >> * write. 
>> */ >> __folio_mark_uptodate(folio); >> +out: >> + return folio; >> +} >> + >> +static void __pmd_thp_fault_success_stats(struct vm_area_struct *vma) >> +{ >> + count_vm_event(THP_FAULT_ALLOC); >> + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); >> + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); >> +} > > Why isn't that moved into map_pmd_thp() > > Note that in this patch you do: > > map_pmd_thp(folio, vmf, vma, haddr); > spin_unlock(vmf->ptl); > __pmd_thp_fault_success_stats(vma); > > But in patch #2 > > map_pmd_thp(folio, vmf, vma, haddr); > __pmd_thp_fault_success_stats(vma); > goto unlock; > release: > folio_put(folio); > unlock: > spin_unlock(vmf->ptl); > > Please make that consistent, meaning: > > 1) Inline __pmd_thp_fault_success_stats() into map_pmd_thp(). No need to > have the separated out. > > 2) Either do the PTL unlocking in __pmd_thp_fault_success_stats() or in > the caller. In the caller is likely easiest. Adjusting the counters > should be cheap, if not we could revisit this later with real data. > >> + >> +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf, >> + struct vm_area_struct *vma, unsigned long haddr) >> +{ >> + pmd_t entry; >> + >> + entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); >> + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); >> + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); >> + folio_add_lru_vma(folio, vma); >> + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); >> + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); > > It's quite weird to see a mixture of haddr and vmf->address, and likely > this mixture is wrong or not not required. > > Looking at arc's update_mmu_cache_pmd() implementation, I cannot see how > passing in the unaligned address would do the right thing. But maybe arc > also doesn't trigger that code path ... who knows :) > > > Staring at some other update_mmu_cache_pmd() users, it's quite > inconsistent. Primarily only do_huge_pmd_numa_page() and > __do_huge_pmd_anonymous_page() use the unaligned address. The others > seem to use the aligned address ... as one would expect when modifying a > PMD. > > > I suggest to change this function to *not* pass in the vmf, and rename > it to something like: > > static void folio_map_anon_pmd(struct folio *folio, struct > vm_area_struct *vma, pmd_t *pmd, unsigned long haddr) ... or better "map_anon_folio_pmd" so it better matches vma_alloc_folio_pmd() suggested above. Could also do vma_map_anon_folio_pmd() :)
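Taken together, the comments above point at a mapping helper of roughly the shape sketched below. This is only an illustration of the suggested direction: the name map_anon_folio_pmd(), the pmd_t * parameter, the consistent use of haddr for update_mmu_cache_pmd() and the folded-in success counters all come from the review remarks, not from the code that was eventually merged.

/* Sketch of the suggested helper; signature follows the review comments. */
static void map_anon_folio_pmd(struct folio *folio, struct vm_area_struct *vma,
			       pmd_t *pmd, unsigned long haddr)
{
	pmd_t entry;

	entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
	folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
	folio_add_lru_vma(folio, vma);
	set_pmd_at(vma->vm_mm, haddr, pmd, entry);
	/* Use the PMD-aligned address here as well, as suggested above. */
	update_mmu_cache_pmd(vma, haddr, pmd);
	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
	mm_inc_nr_ptes(vma->vm_mm);
	/* Success accounting inlined instead of a separate stats helper. */
	count_vm_event(THP_FAULT_ALLOC);
	count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
	count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
}

With this shape, the caller only has to take care of pgtable_trans_huge_deposit() and of dropping the PTL after the call, which keeps patch 1 and patch 2 consistent.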
On 2024/9/11 14:55, Dev Jain wrote: > In preparation for the second patch, abstract away the THP allocation > logic present in the create_huge_pmd() path, which corresponds to the > faulting case when no page is present. > > There should be no functional change as a result of applying > this patch. > > Signed-off-by: Dev Jain <dev.jain@arm.com> > --- > mm/huge_memory.c | 110 +++++++++++++++++++++++++++++------------------ > 1 file changed, 67 insertions(+), 43 deletions(-) > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > index 67c86a5d64a6..b96a1ff2bf40 100644 > --- a/mm/huge_memory.c > +++ b/mm/huge_memory.c > @@ -943,47 +943,88 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, > } > EXPORT_SYMBOL_GPL(thp_get_unmapped_area); > > -static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, > - struct page *page, gfp_t gfp) > +static struct folio *pmd_thp_fault_alloc(gfp_t gfp, struct vm_area_struct *vma, > + unsigned long haddr, unsigned long addr) > { > - struct vm_area_struct *vma = vmf->vma; > - struct folio *folio = page_folio(page); > - pgtable_t pgtable; > - unsigned long haddr = vmf->address & HPAGE_PMD_MASK; > - vm_fault_t ret = 0; > + const int order = HPAGE_PMD_ORDER; Maybe move vma_thp_gfp_mask() into this function too. > + struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true); > > - VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); > + if (unlikely(!folio)) { > + count_vm_event(THP_FAULT_FALLBACK); > + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); > + goto out; > + } > > + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); > if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { > folio_put(folio); > count_vm_event(THP_FAULT_FALLBACK); > count_vm_event(THP_FAULT_FALLBACK_CHARGE); > - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); > - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); > - return VM_FAULT_FALLBACK; > + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); > + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); > + goto out; We need to return NULL here as folio not set to null, > } > folio_throttle_swaprate(folio, gfp); > > - pgtable = pte_alloc_one(vma->vm_mm); > - if (unlikely(!pgtable)) { > - ret = VM_FAULT_OOM; > - goto release; > - } > - > - folio_zero_user(folio, vmf->address); > + folio_zero_user(folio, addr); > /* > * The memory barrier inside __folio_mark_uptodate makes sure that > * folio_zero_user writes become visible before the set_pmd_at() > * write. > */ > __folio_mark_uptodate(folio); > +out: > + return folio; > +} > +
On 9/11/24 14:57, David Hildenbrand wrote: > On 11.09.24 08:55, Dev Jain wrote: >> In preparation for the second patch, abstract away the THP allocation >> logic present in the create_huge_pmd() path, which corresponds to the >> faulting case when no page is present. >> >> There should be no functional change as a result of applying >> this patch. >> > > Hi, > >> Signed-off-by: Dev Jain <dev.jain@arm.com> >> --- >> mm/huge_memory.c | 110 +++++++++++++++++++++++++++++------------------ >> 1 file changed, 67 insertions(+), 43 deletions(-) >> >> diff --git a/mm/huge_memory.c b/mm/huge_memory.c >> index 67c86a5d64a6..b96a1ff2bf40 100644 >> --- a/mm/huge_memory.c >> +++ b/mm/huge_memory.c >> @@ -943,47 +943,88 @@ unsigned long thp_get_unmapped_area(struct file >> *filp, unsigned long addr, >> } >> EXPORT_SYMBOL_GPL(thp_get_unmapped_area); >> -static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, >> - struct page *page, gfp_t gfp) >> +static struct folio *pmd_thp_fault_alloc(gfp_t gfp, struct >> vm_area_struct *vma, >> + unsigned long haddr, unsigned long addr) > > I suggest calling this something like "vma_alloc_anon_folio_pmd()"? > Then it's more consistent with vma_alloc_folio(). > > Also, likely we should just only pass in "addr" and calculate "haddr" > ourselves, it's cheap and reduces the number of function parameters. Makes sense, thanks. > >> { >> - struct vm_area_struct *vma = vmf->vma; >> - struct folio *folio = page_folio(page); >> - pgtable_t pgtable; >> - unsigned long haddr = vmf->address & HPAGE_PMD_MASK; >> - vm_fault_t ret = 0; >> + const int order = HPAGE_PMD_ORDER; >> + struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, >> true); >> - VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); >> + if (unlikely(!folio)) { >> + count_vm_event(THP_FAULT_FALLBACK); >> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); >> + goto out; >> + } >> + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); >> if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { >> folio_put(folio); >> count_vm_event(THP_FAULT_FALLBACK); >> count_vm_event(THP_FAULT_FALLBACK_CHARGE); >> - count_mthp_stat(HPAGE_PMD_ORDER, >> MTHP_STAT_ANON_FAULT_FALLBACK); >> - count_mthp_stat(HPAGE_PMD_ORDER, >> MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); >> - return VM_FAULT_FALLBACK; >> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); >> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); >> + goto out; >> } >> folio_throttle_swaprate(folio, gfp); >> - pgtable = pte_alloc_one(vma->vm_mm); >> - if (unlikely(!pgtable)) { >> - ret = VM_FAULT_OOM; >> - goto release; >> - } >> - >> - folio_zero_user(folio, vmf->address); >> + folio_zero_user(folio, addr); >> /* >> * The memory barrier inside __folio_mark_uptodate makes sure that >> * folio_zero_user writes become visible before the set_pmd_at() >> * write. 
>> */ >> __folio_mark_uptodate(folio); >> +out: >> + return folio; >> +} >> + >> +static void __pmd_thp_fault_success_stats(struct vm_area_struct *vma) >> +{ >> + count_vm_event(THP_FAULT_ALLOC); >> + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); >> + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); >> +} > > Why isn't that moved into map_pmd_thp() > > Note that in this patch you do: > > map_pmd_thp(folio, vmf, vma, haddr); > spin_unlock(vmf->ptl); > __pmd_thp_fault_success_stats(vma); > > But in patch #2 > > map_pmd_thp(folio, vmf, vma, haddr); > __pmd_thp_fault_success_stats(vma); > goto unlock; > release: > folio_put(folio); > unlock: > spin_unlock(vmf->ptl); Yes, while writing it I knew about this inconsistency, but I wanted to reduce latency by dropping the lock before. But in do_huge_zero_wp_pmd(), I couldn't figure out a way to call the stat function after dropping the lock, without I guess, introducing too many labels and goto jumps and the like. In the current code, the lock gets dropped first. > > Please make that consistent, meaning: > > 1) Inline __pmd_thp_fault_success_stats() into map_pmd_thp(). No need > to have the separated out. > > 2) Either do the PTL unlocking in __pmd_thp_fault_success_stats() or in > the caller. In the caller is likely easiest. Adjusting the counters > should be cheap, if not we could revisit this later with real data. I will then call it in map_pmd_thp(), that is cleaner...I did find occurrences of these stat computations after taking the lock, for example, in do_swap_page(): count_vm_event(PGMAJFAULT) so I guess it should be alright. > >> + >> +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf, >> + struct vm_area_struct *vma, unsigned long haddr) >> +{ >> + pmd_t entry; >> + >> + entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); >> + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); >> + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); >> + folio_add_lru_vma(folio, vma); >> + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); >> + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); > > It's quite weird to see a mixture of haddr and vmf->address, and > likely this mixture is wrong or not not required. > > Looking at arc's update_mmu_cache_pmd() implementation, I cannot see > how passing in the unaligned address would do the right thing. But > maybe arc also doesn't trigger that code path ... who knows :) > > > Staring at some other update_mmu_cache_pmd() users, it's quite > inconsistent. Primarily only do_huge_pmd_numa_page() and > __do_huge_pmd_anonymous_page() use the unaligned address. The others > seem to use the aligned address ... as one would expect when modifying > a PMD. > > > I suggest to change this function to *not* pass in the vmf, and rename > it to something like: > > static void folio_map_anon_pmd(struct folio *folio, struct > vm_area_struct *vma, pmd_t *pmd, unsigned long haddr) > > Then use haddr also to do the update_mmu_cache_pmd(). The code I changed, already was passing vmf->address to update_mmu_cache_pmd(). I did not change any of the haddr and vmf->address semantics, so really can't comment on this. I agree with the name change; vmf will be required for set_pmd_at(vmf->pmd), so I should just pass pmd? 
> >> + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); >> + mm_inc_nr_ptes(vma->vm_mm); >> +} >> + >> +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) >> +{ >> + struct vm_area_struct *vma = vmf->vma; >> + struct folio *folio; >> + pgtable_t pgtable; >> + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; >> + vm_fault_t ret = 0; >> + gfp_t gfp = vma_thp_gfp_mask(vma); > > Nit: While at it, try to use reverse christmas-tree where possible, > makes things more reasible. You could make haddr const. > > struct vm_area_struct *vma = vmf->vma; > unsigned long haddr = vmf->address & HPAGE_PMD_MASK; > gfp_t gfp = vma_thp_gfp_mask(vma); > struct folio *folio; > vm_fault_t ret = 0; Okay. > ... > >> + >> + folio = pmd_thp_fault_alloc(gfp, vma, haddr, vmf->address); >> + if (unlikely(!folio)) { >> + ret = VM_FAULT_FALLBACK; >> + goto release; >> + } >> + >> + pgtable = pte_alloc_one(vma->vm_mm); >> + if (unlikely(!pgtable)) { >> + ret = VM_FAULT_OOM; >> + goto release; >> + } >> vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); >> + > > Nit Unrelated change. Are you asking me to align this line with the below line? > >> if (unlikely(!pmd_none(*vmf->pmd))) { >> goto unlock_release; > >
On 9/11/24 14:59, David Hildenbrand wrote: > On 11.09.24 11:27, David Hildenbrand wrote: >> On 11.09.24 08:55, Dev Jain wrote: >>> In preparation for the second patch, abstract away the THP allocation >>> logic present in the create_huge_pmd() path, which corresponds to the >>> faulting case when no page is present. >>> >>> There should be no functional change as a result of applying >>> this patch. >>> >> >> Hi, >> >>> Signed-off-by: Dev Jain <dev.jain@arm.com> >>> --- >>> mm/huge_memory.c | 110 >>> +++++++++++++++++++++++++++++------------------ >>> 1 file changed, 67 insertions(+), 43 deletions(-) >>> >>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c >>> index 67c86a5d64a6..b96a1ff2bf40 100644 >>> --- a/mm/huge_memory.c >>> +++ b/mm/huge_memory.c >>> @@ -943,47 +943,88 @@ unsigned long thp_get_unmapped_area(struct >>> file *filp, unsigned long addr, >>> } >>> EXPORT_SYMBOL_GPL(thp_get_unmapped_area); >>> -static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault >>> *vmf, >>> - struct page *page, gfp_t gfp) >>> +static struct folio *pmd_thp_fault_alloc(gfp_t gfp, struct >>> vm_area_struct *vma, >>> + unsigned long haddr, unsigned long addr) >> >> I suggest calling this something like "vma_alloc_anon_folio_pmd()"? Then >> it's more consistent with vma_alloc_folio(). >> >> Also, likely we should just only pass in "addr" and calculate "haddr" >> ourselves, it's cheap and reduces the number of function parameters. >> >>> { >>> - struct vm_area_struct *vma = vmf->vma; >>> - struct folio *folio = page_folio(page); >>> - pgtable_t pgtable; >>> - unsigned long haddr = vmf->address & HPAGE_PMD_MASK; >>> - vm_fault_t ret = 0; >>> + const int order = HPAGE_PMD_ORDER; >>> + struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, >>> true); >>> - VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); >>> + if (unlikely(!folio)) { >>> + count_vm_event(THP_FAULT_FALLBACK); >>> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); >>> + goto out; >>> + } >>> + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); >>> if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { >>> folio_put(folio); >>> count_vm_event(THP_FAULT_FALLBACK); >>> count_vm_event(THP_FAULT_FALLBACK_CHARGE); >>> - count_mthp_stat(HPAGE_PMD_ORDER, >>> MTHP_STAT_ANON_FAULT_FALLBACK); >>> - count_mthp_stat(HPAGE_PMD_ORDER, >>> MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); >>> - return VM_FAULT_FALLBACK; >>> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); >>> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); >>> + goto out; >>> } >>> folio_throttle_swaprate(folio, gfp); >>> - pgtable = pte_alloc_one(vma->vm_mm); >>> - if (unlikely(!pgtable)) { >>> - ret = VM_FAULT_OOM; >>> - goto release; >>> - } >>> - >>> - folio_zero_user(folio, vmf->address); >>> + folio_zero_user(folio, addr); >>> /* >>> * The memory barrier inside __folio_mark_uptodate makes sure >>> that >>> * folio_zero_user writes become visible before the set_pmd_at() >>> * write. 
>>> */ >>> __folio_mark_uptodate(folio); >>> +out: >>> + return folio; >>> +} >>> + >>> +static void __pmd_thp_fault_success_stats(struct vm_area_struct *vma) >>> +{ >>> + count_vm_event(THP_FAULT_ALLOC); >>> + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); >>> + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); >>> +} >> >> Why isn't that moved into map_pmd_thp() >> >> Note that in this patch you do: >> >> map_pmd_thp(folio, vmf, vma, haddr); >> spin_unlock(vmf->ptl); >> __pmd_thp_fault_success_stats(vma); >> >> But in patch #2 >> >> map_pmd_thp(folio, vmf, vma, haddr); >> __pmd_thp_fault_success_stats(vma); >> goto unlock; >> release: >> folio_put(folio); >> unlock: >> spin_unlock(vmf->ptl); >> >> Please make that consistent, meaning: >> >> 1) Inline __pmd_thp_fault_success_stats() into map_pmd_thp(). No need to >> have the separated out. >> >> 2) Either do the PTL unlocking in __pmd_thp_fault_success_stats() or in >> the caller. In the caller is likely easiest. Adjusting the counters >> should be cheap, if not we could revisit this later with real data. >> >>> + >>> +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf, >>> + struct vm_area_struct *vma, unsigned long haddr) >>> +{ >>> + pmd_t entry; >>> + >>> + entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); >>> + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); >>> + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); >>> + folio_add_lru_vma(folio, vma); >>> + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); >>> + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); >> >> It's quite weird to see a mixture of haddr and vmf->address, and likely >> this mixture is wrong or not not required. >> >> Looking at arc's update_mmu_cache_pmd() implementation, I cannot see how >> passing in the unaligned address would do the right thing. But maybe arc >> also doesn't trigger that code path ... who knows :) >> >> >> Staring at some other update_mmu_cache_pmd() users, it's quite >> inconsistent. Primarily only do_huge_pmd_numa_page() and >> __do_huge_pmd_anonymous_page() use the unaligned address. The others >> seem to use the aligned address ... as one would expect when modifying a >> PMD. >> >> >> I suggest to change this function to *not* pass in the vmf, and rename >> it to something like: >> >> static void folio_map_anon_pmd(struct folio *folio, struct >> vm_area_struct *vma, pmd_t *pmd, unsigned long haddr) > > ... or better "map_anon_folio_pmd" so it better matches > vma_alloc_folio_pmd() suggested above. I'll vote for this. > >
On 9/11/24 16:22, Kefeng Wang wrote: > > > On 2024/9/11 14:55, Dev Jain wrote: >> In preparation for the second patch, abstract away the THP allocation >> logic present in the create_huge_pmd() path, which corresponds to the >> faulting case when no page is present. >> >> There should be no functional change as a result of applying >> this patch. >> >> Signed-off-by: Dev Jain <dev.jain@arm.com> >> --- >> mm/huge_memory.c | 110 +++++++++++++++++++++++++++++------------------ >> 1 file changed, 67 insertions(+), 43 deletions(-) >> >> diff --git a/mm/huge_memory.c b/mm/huge_memory.c >> index 67c86a5d64a6..b96a1ff2bf40 100644 >> --- a/mm/huge_memory.c >> +++ b/mm/huge_memory.c >> @@ -943,47 +943,88 @@ unsigned long thp_get_unmapped_area(struct file >> *filp, unsigned long addr, >> } >> EXPORT_SYMBOL_GPL(thp_get_unmapped_area); >> -static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, >> - struct page *page, gfp_t gfp) >> +static struct folio *pmd_thp_fault_alloc(gfp_t gfp, struct >> vm_area_struct *vma, >> + unsigned long haddr, unsigned long addr) >> { >> - struct vm_area_struct *vma = vmf->vma; >> - struct folio *folio = page_folio(page); >> - pgtable_t pgtable; >> - unsigned long haddr = vmf->address & HPAGE_PMD_MASK; >> - vm_fault_t ret = 0; >> + const int order = HPAGE_PMD_ORDER; > > Maybe move vma_thp_gfp_mask() into this function too. That's better, thanks. > >> + struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, >> true); >> - VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); >> + if (unlikely(!folio)) { >> + count_vm_event(THP_FAULT_FALLBACK); >> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); >> + goto out; >> + } >> + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); >> if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { >> folio_put(folio); >> count_vm_event(THP_FAULT_FALLBACK); >> count_vm_event(THP_FAULT_FALLBACK_CHARGE); >> - count_mthp_stat(HPAGE_PMD_ORDER, >> MTHP_STAT_ANON_FAULT_FALLBACK); >> - count_mthp_stat(HPAGE_PMD_ORDER, >> MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); >> - return VM_FAULT_FALLBACK; >> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); >> + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); >> + goto out; > > We need to return NULL here as folio not set to null, My bad for assuming that folio_put() also sets folio to NULL. I read through the code path for that and I guess it does not. Thanks. > >> } >> folio_throttle_swaprate(folio, gfp); >> - pgtable = pte_alloc_one(vma->vm_mm); >> - if (unlikely(!pgtable)) { >> - ret = VM_FAULT_OOM; >> - goto release; >> - } >> - >> - folio_zero_user(folio, vmf->address); >> + folio_zero_user(folio, addr); >> /* >> * The memory barrier inside __folio_mark_uptodate makes sure that >> * folio_zero_user writes become visible before the set_pmd_at() >> * write. >> */ >> __folio_mark_uptodate(folio); >> +out: >> + return folio; >> +} >> +
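Combining the feedback acknowledged so far (the rename, passing only addr, moving vma_thp_gfp_mask() into the helper, and returning NULL after the failed memcg charge), the allocation helper could end up looking roughly like the sketch below; it is illustrative rather than the code that was finally applied.

static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
					      unsigned long addr)
{
	unsigned long haddr = addr & HPAGE_PMD_MASK;
	const int order = HPAGE_PMD_ORDER;
	gfp_t gfp = vma_thp_gfp_mask(vma);
	struct folio *folio;

	folio = vma_alloc_folio(gfp, order, vma, haddr, true);
	if (unlikely(!folio)) {
		count_vm_event(THP_FAULT_FALLBACK);
		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
		return NULL;
	}

	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
	if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
		folio_put(folio);
		count_vm_event(THP_FAULT_FALLBACK);
		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
		/* folio_put() does not NULL the local pointer; return NULL explicitly. */
		return NULL;
	}
	folio_throttle_swaprate(folio, gfp);

	folio_zero_user(folio, addr);
	/*
	 * The memory barrier inside __folio_mark_uptodate makes sure that
	 * folio_zero_user writes become visible before the set_pmd_at()
	 * write.
	 */
	__folio_mark_uptodate(folio);
	return folio;
}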
[...] >> >>> + >>> +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf, >>> + struct vm_area_struct *vma, unsigned long haddr) >>> +{ >>> + pmd_t entry; >>> + >>> + entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); >>> + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); >>> + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); >>> + folio_add_lru_vma(folio, vma); >>> + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); >>> + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); >> >> It's quite weird to see a mixture of haddr and vmf->address, and >> likely this mixture is wrong or not not required. >> >> Looking at arc's update_mmu_cache_pmd() implementation, I cannot see >> how passing in the unaligned address would do the right thing. But >> maybe arc also doesn't trigger that code path ... who knows :) >> >> >> Staring at some other update_mmu_cache_pmd() users, it's quite >> inconsistent. Primarily only do_huge_pmd_numa_page() and >> __do_huge_pmd_anonymous_page() use the unaligned address. The others >> seem to use the aligned address ... as one would expect when modifying >> a PMD. >> >> >> I suggest to change this function to *not* pass in the vmf, and rename >> it to something like: >> >> static void folio_map_anon_pmd(struct folio *folio, struct >> vm_area_struct *vma, pmd_t *pmd, unsigned long haddr) >> >> Then use haddr also to do the update_mmu_cache_pmd(). > > The code I changed, already was passing vmf->address to > update_mmu_cache_pmd(). > I did not change any of the haddr and vmf->address semantics, so really > can't comment > on this. I'm not saying that you changed it. I say, "we touch it we fix it" :). We could do that in a separate cleanup patch upfront. > I agree with the name change; vmf will be required for > set_pmd_at(vmf->pmd), so I should > just pass pmd? >> >>> + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); >>> + mm_inc_nr_ptes(vma->vm_mm); >>> +} >>> + >>> +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) >>> +{ >>> + struct vm_area_struct *vma = vmf->vma; >>> + struct folio *folio; >>> + pgtable_t pgtable; >>> + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; >>> + vm_fault_t ret = 0; >>> + gfp_t gfp = vma_thp_gfp_mask(vma); >> >> Nit: While at it, try to use reverse christmas-tree where possible, >> makes things more reasible. You could make haddr const. >> >> struct vm_area_struct *vma = vmf->vma; >> unsigned long haddr = vmf->address & HPAGE_PMD_MASK; >> gfp_t gfp = vma_thp_gfp_mask(vma); >> struct folio *folio; >> vm_fault_t ret = 0; > > Okay. >> ... >> >>> + >>> + folio = pmd_thp_fault_alloc(gfp, vma, haddr, vmf->address); >>> + if (unlikely(!folio)) { >>> + ret = VM_FAULT_FALLBACK; >>> + goto release; >>> + } >>> + >>> + pgtable = pte_alloc_one(vma->vm_mm); >>> + if (unlikely(!pgtable)) { >>> + ret = VM_FAULT_OOM; >>> + goto release; >>> + } >>> vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); >>> + >> >> Nit Unrelated change. > > Are you asking me to align this line with the below line? No, just pointing out that you add a newline in a code section you don't really touch. Sometimes that's deliberate, sometimes not (e.g., going back and forth while reworking the code and accidentally leaving that in). We try to avoid such unrelated changes because it adds noise to the patches. If it was deliberate, feel free to leave it in.
On 9/11/24 14:57, David Hildenbrand wrote: > On 11.09.24 08:55, Dev Jain wrote: >> In preparation for the second patch, abstract away the THP allocation >> logic present in the create_huge_pmd() path, which corresponds to the >> faulting case when no page is present. >> >> There should be no functional change as a result of applying >> this patch. >> > > [...] > >> + >> +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf, >> + struct vm_area_struct *vma, unsigned long haddr) >> +{ >> + pmd_t entry; >> + >> + entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); >> + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); >> + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); >> + folio_add_lru_vma(folio, vma); >> + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); >> + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); > > It's quite weird to see a mixture of haddr and vmf->address, and > likely this mixture is wrong or not not required. > > Looking at arc's update_mmu_cache_pmd() implementation, I cannot see > how passing in the unaligned address would do the right thing. But > maybe arc also doesn't trigger that code path ... who knows :) If I am reading correctly, arch/arc/mm/tlb.c: update_mmu_cache_pmd() calls update_mmu_cache_range() which is already expecting an unaligned address? But... > > > Staring at some other update_mmu_cache_pmd() users, it's quite > inconsistent. Primarily only do_huge_pmd_numa_page() and > __do_huge_pmd_anonymous_page() use the unaligned address. The others > seem to use the aligned address ... as one would expect when modifying > a PMD. Looking at riscv: update_mmu_cache_pmd()->update_mmu_cache()->update_mmu_cache_range(). The argument getting passed to local_flush_tlb_page() seems like, should expect an aligned address. > > > I suggest to change this function to *not* pass in the vmf, and rename > it to something like: > > static void folio_map_anon_pmd(struct folio *folio, struct > vm_area_struct *vma, pmd_t *pmd, unsigned long haddr) > > Then use haddr also to do the update_mmu_cache_pmd(). > >> + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); >> + mm_inc_nr_ptes(vma->vm_mm); >> +} >> + >> +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) >> +{ >> + struct vm_area_struct *vma = vmf->vma; >> + struct folio *folio; >> + pgtable_t pgtable; >> + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; >> + vm_fault_t ret = 0; >> + gfp_t gfp = vma_thp_gfp_mask(vma); > > Nit: While at it, try to use reverse christmas-tree where possible, > makes things more reasible. You could make haddr const. > > struct vm_area_struct *vma = vmf->vma; > unsigned long haddr = vmf->address & HPAGE_PMD_MASK; > gfp_t gfp = vma_thp_gfp_mask(vma); > struct folio *folio; > vm_fault_t ret = 0; > ... > >> + >> + folio = pmd_thp_fault_alloc(gfp, vma, haddr, vmf->address); >> + if (unlikely(!folio)) { >> + ret = VM_FAULT_FALLBACK; >> + goto release; >> + } >> + >> + pgtable = pte_alloc_one(vma->vm_mm); >> + if (unlikely(!pgtable)) { >> + ret = VM_FAULT_OOM; >> + goto release; >> + } >> vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); >> + > > Nit Unrelated change. > >> if (unlikely(!pmd_none(*vmf->pmd))) { >> goto unlock_release; > >
On 9/11/24 18:05, David Hildenbrand wrote: > [...] > >>> >>>> + >>>> +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf, >>>> + struct vm_area_struct *vma, unsigned long haddr) >>>> +{ >>>> + pmd_t entry; >>>> + >>>> + entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); >>>> + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); >>>> + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); >>>> + folio_add_lru_vma(folio, vma); >>>> + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); >>>> + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); >>> >>> It's quite weird to see a mixture of haddr and vmf->address, and >>> likely this mixture is wrong or not not required. >>> >>> Looking at arc's update_mmu_cache_pmd() implementation, I cannot see >>> how passing in the unaligned address would do the right thing. But >>> maybe arc also doesn't trigger that code path ... who knows :) >>> >>> >>> Staring at some other update_mmu_cache_pmd() users, it's quite >>> inconsistent. Primarily only do_huge_pmd_numa_page() and >>> __do_huge_pmd_anonymous_page() use the unaligned address. The others >>> seem to use the aligned address ... as one would expect when modifying >>> a PMD. >>> >>> >>> I suggest to change this function to *not* pass in the vmf, and rename >>> it to something like: >>> >>> static void folio_map_anon_pmd(struct folio *folio, struct >>> vm_area_struct *vma, pmd_t *pmd, unsigned long haddr) >>> >>> Then use haddr also to do the update_mmu_cache_pmd(). >> >> The code I changed, already was passing vmf->address to >> update_mmu_cache_pmd(). >> I did not change any of the haddr and vmf->address semantics, so really >> can't comment >> on this. > > I'm not saying that you changed it. I say, "we touch it we fix it" :). > We could do that in a separate cleanup patch upfront. Sure. > >> I agree with the name change; vmf will be required for >> set_pmd_at(vmf->pmd), so I should >> just pass pmd? >>> >>>> + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); >>>> + mm_inc_nr_ptes(vma->vm_mm); >>>> +} >>>> + >>>> +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) >>>> +{ >>>> + struct vm_area_struct *vma = vmf->vma; >>>> + struct folio *folio; >>>> + pgtable_t pgtable; >>>> + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; >>>> + vm_fault_t ret = 0; >>>> + gfp_t gfp = vma_thp_gfp_mask(vma); >>> >>> Nit: While at it, try to use reverse christmas-tree where possible, >>> makes things more reasible. You could make haddr const. >>> >>> struct vm_area_struct *vma = vmf->vma; >>> unsigned long haddr = vmf->address & HPAGE_PMD_MASK; >>> gfp_t gfp = vma_thp_gfp_mask(vma); >>> struct folio *folio; >>> vm_fault_t ret = 0; >> >> Okay. >>> ... >>> >>>> + >>>> + folio = pmd_thp_fault_alloc(gfp, vma, haddr, vmf->address); >>>> + if (unlikely(!folio)) { >>>> + ret = VM_FAULT_FALLBACK; >>>> + goto release; >>>> + } >>>> + >>>> + pgtable = pte_alloc_one(vma->vm_mm); >>>> + if (unlikely(!pgtable)) { >>>> + ret = VM_FAULT_OOM; >>>> + goto release; >>>> + } >>>> vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); >>>> + >>> >>> Nit Unrelated change. >> >> Are you asking me to align this line with the below line? > > No, just pointing out that you add a newline in a code section you > don't really touch. Sometimes that's deliberate, sometimes not (e.g., > going back and forth while reworking the code and accidentally leaving > that in). We try to avoid such unrelated changes because it adds noise > to the patches. > > If it was deliberate, feel free to leave it in. 
Ah, I accidentally inserted that new line. Will revert that.
On 11.09.24 14:53, Dev Jain wrote: > > On 9/11/24 14:57, David Hildenbrand wrote: >> On 11.09.24 08:55, Dev Jain wrote: >>> In preparation for the second patch, abstract away the THP allocation >>> logic present in the create_huge_pmd() path, which corresponds to the >>> faulting case when no page is present. >>> >>> There should be no functional change as a result of applying >>> this patch. >>> >> >> [...] >> >>> + >>> +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf, >>> + struct vm_area_struct *vma, unsigned long haddr) >>> +{ >>> + pmd_t entry; >>> + >>> + entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); >>> + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); >>> + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); >>> + folio_add_lru_vma(folio, vma); >>> + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); >>> + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); >> >> It's quite weird to see a mixture of haddr and vmf->address, and >> likely this mixture is wrong or not not required. >> >> Looking at arc's update_mmu_cache_pmd() implementation, I cannot see >> how passing in the unaligned address would do the right thing. But >> maybe arc also doesn't trigger that code path ... who knows :) > > If I am reading correctly, arch/arc/mm/tlb.c: update_mmu_cache_pmd() > calls update_mmu_cache_range() which is already expecting an unaligned > address? But... So update_mmu_cache_pmd() calls update_mmu_cache_range(NULL, vma, addr, &pte, HPAGE_PMD_NR); But update_mmu_cache_range() only aligns it *to page boundary*: unsigned long vaddr = vaddr_unaligned & PAGE_MASK; We obtain the correct hugepage-aligned physical address from the PTE phys_addr_t paddr = pte_val(*ptep) & PAGE_MASK_PHYS; Then, we look at the offset in our folio unsigned long offset = offset_in_folio(folio, paddr); And adjust both vaddr and paddr paddr -= offset; vaddr -= offset; To then use that combination with __inv_icache_pages(paddr, vaddr, nr); If I am not wrong, getting a non-hugepage aligned vaddr messes up things here. But only regarding the icache I think.
On 9/11/24 18:30, David Hildenbrand wrote: > On 11.09.24 14:53, Dev Jain wrote: >> >> On 9/11/24 14:57, David Hildenbrand wrote: >>> On 11.09.24 08:55, Dev Jain wrote: >>>> In preparation for the second patch, abstract away the THP allocation >>>> logic present in the create_huge_pmd() path, which corresponds to the >>>> faulting case when no page is present. >>>> >>>> There should be no functional change as a result of applying >>>> this patch. >>>> >>> >>> [...] >>> >>>> + >>>> +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf, >>>> + struct vm_area_struct *vma, unsigned long haddr) >>>> +{ >>>> + pmd_t entry; >>>> + >>>> + entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); >>>> + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); >>>> + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); >>>> + folio_add_lru_vma(folio, vma); >>>> + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); >>>> + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); >>> >>> It's quite weird to see a mixture of haddr and vmf->address, and >>> likely this mixture is wrong or not not required. >>> >>> Looking at arc's update_mmu_cache_pmd() implementation, I cannot see >>> how passing in the unaligned address would do the right thing. But >>> maybe arc also doesn't trigger that code path ... who knows :) >> >> If I am reading correctly, arch/arc/mm/tlb.c: update_mmu_cache_pmd() >> calls update_mmu_cache_range() which is already expecting an unaligned >> address? But... > > So update_mmu_cache_pmd() calls > > update_mmu_cache_range(NULL, vma, addr, &pte, HPAGE_PMD_NR); > > But update_mmu_cache_range() only aligns it *to page boundary*: > > unsigned long vaddr = vaddr_unaligned & PAGE_MASK; Ah, totally missed that it was PAGE_MASK. Thanks. > > We obtain the correct hugepage-aligned physical address from the PTE > > phys_addr_t paddr = pte_val(*ptep) & PAGE_MASK_PHYS; > > Then, we look at the offset in our folio > > unsigned long offset = offset_in_folio(folio, paddr); > > And adjust both vaddr and paddr > > paddr -= offset; > vaddr -= offset; > > To then use that combination with > > __inv_icache_pages(paddr, vaddr, nr); > > If I am not wrong, getting a non-hugepage aligned vaddr messes up > things here. But only regarding the icache I think. Looks like it... > >
On 11.09.24 15:05, Dev Jain wrote: > > On 9/11/24 18:30, David Hildenbrand wrote: >> On 11.09.24 14:53, Dev Jain wrote: >>> >>> On 9/11/24 14:57, David Hildenbrand wrote: >>>> On 11.09.24 08:55, Dev Jain wrote: >>>>> In preparation for the second patch, abstract away the THP allocation >>>>> logic present in the create_huge_pmd() path, which corresponds to the >>>>> faulting case when no page is present. >>>>> >>>>> There should be no functional change as a result of applying >>>>> this patch. >>>>> >>>> >>>> [...] >>>> >>>>> + >>>>> +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf, >>>>> + struct vm_area_struct *vma, unsigned long haddr) >>>>> +{ >>>>> + pmd_t entry; >>>>> + >>>>> + entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); >>>>> + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); >>>>> + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); >>>>> + folio_add_lru_vma(folio, vma); >>>>> + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); >>>>> + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); >>>> >>>> It's quite weird to see a mixture of haddr and vmf->address, and >>>> likely this mixture is wrong or not not required. >>>> >>>> Looking at arc's update_mmu_cache_pmd() implementation, I cannot see >>>> how passing in the unaligned address would do the right thing. But >>>> maybe arc also doesn't trigger that code path ... who knows :) >>> >>> If I am reading correctly, arch/arc/mm/tlb.c: update_mmu_cache_pmd() >>> calls update_mmu_cache_range() which is already expecting an unaligned >>> address? But... >> >> So update_mmu_cache_pmd() calls >> >> update_mmu_cache_range(NULL, vma, addr, &pte, HPAGE_PMD_NR); >> >> But update_mmu_cache_range() only aligns it *to page boundary*: >> >> unsigned long vaddr = vaddr_unaligned & PAGE_MASK; > > Ah, totally missed that it was PAGE_MASK. Thanks. >> >> We obtain the correct hugepage-aligned physical address from the PTE >> >> phys_addr_t paddr = pte_val(*ptep) & PAGE_MASK_PHYS; >> >> Then, we look at the offset in our folio >> >> unsigned long offset = offset_in_folio(folio, paddr); >> >> And adjust both vaddr and paddr >> >> paddr -= offset; >> vaddr -= offset; >> >> To then use that combination with >> >> __inv_icache_pages(paddr, vaddr, nr); >> >> If I am not wrong, getting a non-hugepage aligned vaddr messes up >> things here. But only regarding the icache I think. > > Looks like it... As we are adding a fresh page where there previously wasn't anything mapped (no icache invaldiation required?), and because most anon mappings are not executable, maybe that's why nobody notices so far.
On 9/11/24 18:44, David Hildenbrand wrote: > On 11.09.24 15:05, Dev Jain wrote: >> >> On 9/11/24 18:30, David Hildenbrand wrote: >>> On 11.09.24 14:53, Dev Jain wrote: >>>> >>>> On 9/11/24 14:57, David Hildenbrand wrote: >>>>> On 11.09.24 08:55, Dev Jain wrote: >>>>>> In preparation for the second patch, abstract away the THP >>>>>> allocation >>>>>> logic present in the create_huge_pmd() path, which corresponds to >>>>>> the >>>>>> faulting case when no page is present. >>>>>> >>>>>> There should be no functional change as a result of applying >>>>>> this patch. >>>>>> >>>>> >>>>> [...] >>>>> >>>>>> + >>>>>> +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf, >>>>>> + struct vm_area_struct *vma, unsigned long haddr) >>>>>> +{ >>>>>> + pmd_t entry; >>>>>> + >>>>>> + entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); >>>>>> + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); >>>>>> + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); >>>>>> + folio_add_lru_vma(folio, vma); >>>>>> + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); >>>>>> + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); >>>>> >>>>> It's quite weird to see a mixture of haddr and vmf->address, and >>>>> likely this mixture is wrong or not not required. >>>>> >>>>> Looking at arc's update_mmu_cache_pmd() implementation, I cannot see >>>>> how passing in the unaligned address would do the right thing. But >>>>> maybe arc also doesn't trigger that code path ... who knows :) >>>> >>>> If I am reading correctly, arch/arc/mm/tlb.c: update_mmu_cache_pmd() >>>> calls update_mmu_cache_range() which is already expecting an unaligned >>>> address? But... >>> >>> So update_mmu_cache_pmd() calls >>> >>> update_mmu_cache_range(NULL, vma, addr, &pte, HPAGE_PMD_NR); >>> >>> But update_mmu_cache_range() only aligns it *to page boundary*: >>> >>> unsigned long vaddr = vaddr_unaligned & PAGE_MASK; >> >> Ah, totally missed that it was PAGE_MASK. Thanks. >>> >>> We obtain the correct hugepage-aligned physical address from the PTE >>> >>> phys_addr_t paddr = pte_val(*ptep) & PAGE_MASK_PHYS; >>> >>> Then, we look at the offset in our folio >>> >>> unsigned long offset = offset_in_folio(folio, paddr); >>> >>> And adjust both vaddr and paddr >>> >>> paddr -= offset; >>> vaddr -= offset; >>> >>> To then use that combination with >>> >>> __inv_icache_pages(paddr, vaddr, nr); >>> >>> If I am not wrong, getting a non-hugepage aligned vaddr messes up >>> things here. But only regarding the icache I think. >> >> Looks like it... > > As we are adding a fresh page where there previously wasn't anything > mapped (no icache invaldiation required?), and because most anon > mappings are not executable, maybe that's why nobody notices so far. Thanks for the observation!
Hi Dev,

kernel test robot noticed the following build warnings:

[auto build test WARNING on v6.11-rc7]
[also build test WARNING on linus/master]
[cannot apply to akpm-mm/mm-everything next-20240912]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:            https://github.com/intel-lab-lkp/linux/commits/Dev-Jain/mm-Abstract-THP-allocation/20240911-145809
base:           v6.11-rc7
patch link:     https://lore.kernel.org/r/20240911065600.1002644-2-dev.jain%40arm.com
patch subject:  [PATCH v3 1/2] mm: Abstract THP allocation
config:         x86_64-rhel-8.3-rust (https://download.01.org/0day-ci/archive/20240912/202409122144.jqe4JROY-lkp@intel.com/config)
compiler:       clang version 18.1.8 (https://github.com/llvm/llvm-project 3b5b5c1ec4a3095ab096dd780e84d7ab81f3d7ff)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240912/202409122144.jqe4JROY-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202409122144.jqe4JROY-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> mm/huge_memory.c:1012:6: warning: variable 'pgtable' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized]
    1012 |         if (unlikely(!folio)) {
         |             ^~~~~~~~~~~~~~~~
   include/linux/compiler.h:77:22: note: expanded from macro 'unlikely'
      77 | # define unlikely(x)    __builtin_expect(!!(x), 0)
         |                         ^~~~~~~~~~~~~~~~~~~~~~~~~~
   mm/huge_memory.c:1051:6: note: uninitialized use occurs here
    1051 |         if (pgtable)
         |             ^~~~~~~
   mm/huge_memory.c:1012:2: note: remove the 'if' if its condition is always false
    1012 |         if (unlikely(!folio)) {
         |         ^~~~~~~~~~~~~~~~~~~~~~~
    1013 |                 ret = VM_FAULT_FALLBACK;
         |                 ~~~~~~~~~~~~~~~~~~~~~~~~
    1014 |                 goto release;
         |                 ~~~~~~~~~~~~~
    1015 |         }
         |         ~
   mm/huge_memory.c:1006:19: note: initialize the variable 'pgtable' to silence this warning
    1006 |         pgtable_t pgtable;
         |                          ^
         |                           = NULL
   1 warning generated.
vim +1012 mm/huge_memory.c

  1001	
  1002	static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
  1003	{
  1004		struct vm_area_struct *vma = vmf->vma;
  1005		struct folio *folio;
  1006		pgtable_t pgtable;
  1007		unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
  1008		vm_fault_t ret = 0;
  1009		gfp_t gfp = vma_thp_gfp_mask(vma);
  1010	
  1011		folio = pmd_thp_fault_alloc(gfp, vma, haddr, vmf->address);
> 1012		if (unlikely(!folio)) {
  1013			ret = VM_FAULT_FALLBACK;
  1014			goto release;
  1015		}
  1016	
  1017		pgtable = pte_alloc_one(vma->vm_mm);
  1018		if (unlikely(!pgtable)) {
  1019			ret = VM_FAULT_OOM;
  1020			goto release;
  1021		}
  1022	
  1023		vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
  1024	
  1025		if (unlikely(!pmd_none(*vmf->pmd))) {
  1026			goto unlock_release;
  1027		} else {
  1028			ret = check_stable_address_space(vma->vm_mm);
  1029			if (ret)
  1030				goto unlock_release;
  1031	
  1032			/* Deliver the page fault to userland */
  1033			if (userfaultfd_missing(vma)) {
  1034				spin_unlock(vmf->ptl);
  1035				folio_put(folio);
  1036				pte_free(vma->vm_mm, pgtable);
  1037				ret = handle_userfault(vmf, VM_UFFD_MISSING);
  1038				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
  1039				return ret;
  1040			}
  1041			pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
  1042			map_pmd_thp(folio, vmf, vma, haddr);
  1043			spin_unlock(vmf->ptl);
  1044			__pmd_thp_fault_success_stats(vma);
  1045		}
  1046	
  1047		return 0;
  1048	unlock_release:
  1049		spin_unlock(vmf->ptl);
  1050	release:
  1051		if (pgtable)
  1052			pte_free(vma->vm_mm, pgtable);
  1053		if (folio)
  1054			folio_put(folio);
  1055		return ret;
  1056	
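One possible way to address this report (an illustrative sketch against the code as posted, not the actual follow-up fix) is to bail out before pgtable exists when the folio allocation fails, so the shared cleanup labels only ever see fully initialized state; alternatively, initializing pgtable to NULL, as the compiler note itself suggests, would also silence the warning.

static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
	gfp_t gfp = vma_thp_gfp_mask(vma);
	struct folio *folio;
	pgtable_t pgtable;
	vm_fault_t ret = 0;

	folio = pmd_thp_fault_alloc(gfp, vma, haddr, vmf->address);
	if (unlikely(!folio))
		return VM_FAULT_FALLBACK;	/* nothing else allocated yet */

	pgtable = pte_alloc_one(vma->vm_mm);
	if (unlikely(!pgtable)) {
		folio_put(folio);
		return VM_FAULT_OOM;
	}

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_none(*vmf->pmd))) {
		goto unlock_release;
	} else {
		ret = check_stable_address_space(vma->vm_mm);
		if (ret)
			goto unlock_release;

		/* Deliver the page fault to userland */
		if (userfaultfd_missing(vma)) {
			spin_unlock(vmf->ptl);
			folio_put(folio);
			pte_free(vma->vm_mm, pgtable);
			ret = handle_userfault(vmf, VM_UFFD_MISSING);
			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			return ret;
		}
		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
		map_pmd_thp(folio, vmf, vma, haddr);
		spin_unlock(vmf->ptl);
		__pmd_thp_fault_success_stats(vma);
	}

	return 0;
unlock_release:
	spin_unlock(vmf->ptl);
	/* Both folio and pgtable are known to be valid here. */
	pte_free(vma->vm_mm, pgtable);
	folio_put(folio);
	return ret;
}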
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 67c86a5d64a6..b96a1ff2bf40 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -943,47 +943,88 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, } EXPORT_SYMBOL_GPL(thp_get_unmapped_area); -static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, - struct page *page, gfp_t gfp) +static struct folio *pmd_thp_fault_alloc(gfp_t gfp, struct vm_area_struct *vma, + unsigned long haddr, unsigned long addr) { - struct vm_area_struct *vma = vmf->vma; - struct folio *folio = page_folio(page); - pgtable_t pgtable; - unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - vm_fault_t ret = 0; + const int order = HPAGE_PMD_ORDER; + struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true); - VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + if (unlikely(!folio)) { + count_vm_event(THP_FAULT_FALLBACK); + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); + goto out; + } + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { folio_put(folio); count_vm_event(THP_FAULT_FALLBACK); count_vm_event(THP_FAULT_FALLBACK_CHARGE); - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); - return VM_FAULT_FALLBACK; + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); + goto out; } folio_throttle_swaprate(folio, gfp); - pgtable = pte_alloc_one(vma->vm_mm); - if (unlikely(!pgtable)) { - ret = VM_FAULT_OOM; - goto release; - } - - folio_zero_user(folio, vmf->address); + folio_zero_user(folio, addr); /* * The memory barrier inside __folio_mark_uptodate makes sure that * folio_zero_user writes become visible before the set_pmd_at() * write. 
*/ __folio_mark_uptodate(folio); +out: + return folio; +} + +static void __pmd_thp_fault_success_stats(struct vm_area_struct *vma) +{ + count_vm_event(THP_FAULT_ALLOC); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); +} + +static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long haddr) +{ + pmd_t entry; + + entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); + folio_add_lru_vma(folio, vma); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); + mm_inc_nr_ptes(vma->vm_mm); +} + +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct folio *folio; + pgtable_t pgtable; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + vm_fault_t ret = 0; + gfp_t gfp = vma_thp_gfp_mask(vma); + + folio = pmd_thp_fault_alloc(gfp, vma, haddr, vmf->address); + if (unlikely(!folio)) { + ret = VM_FAULT_FALLBACK; + goto release; + } + + pgtable = pte_alloc_one(vma->vm_mm); + if (unlikely(!pgtable)) { + ret = VM_FAULT_OOM; + goto release; + } vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_none(*vmf->pmd))) { goto unlock_release; } else { - pmd_t entry; - ret = check_stable_address_space(vma->vm_mm); if (ret) goto unlock_release; @@ -997,20 +1038,10 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, VM_BUG_ON(ret & VM_FAULT_FALLBACK); return ret; } - - entry = mk_huge_pmd(page, vma->vm_page_prot); - entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); - folio_add_lru_vma(folio, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); - set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); - update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); - add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); - mm_inc_nr_ptes(vma->vm_mm); + map_pmd_thp(folio, vmf, vma, haddr); spin_unlock(vmf->ptl); - count_vm_event(THP_FAULT_ALLOC); - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); - count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); + __pmd_thp_fault_success_stats(vma); } return 0; @@ -1019,7 +1050,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, release: if (pgtable) pte_free(vma->vm_mm, pgtable); - folio_put(folio); + if (folio) + folio_put(folio); return ret; } @@ -1077,8 +1109,6 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm, vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - gfp_t gfp; - struct folio *folio; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; vm_fault_t ret; @@ -1129,14 +1159,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) } return ret; } - gfp = vma_thp_gfp_mask(vma); - folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); - if (unlikely(!folio)) { - count_vm_event(THP_FAULT_FALLBACK); - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); - return VM_FAULT_FALLBACK; - } - return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp); + + return __do_huge_pmd_anonymous_page(vmf); } static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
In preparation for the second patch, abstract away the THP allocation
logic present in the create_huge_pmd() path, which corresponds to the
faulting case when no page is present.

There should be no functional change as a result of applying
this patch.

Signed-off-by: Dev Jain <dev.jain@arm.com>
---
 mm/huge_memory.c | 110 +++++++++++++++++++++++++++++------------------
 1 file changed, 67 insertions(+), 43 deletions(-)