
[v2,2/2] mm: Allocate THP on hugezeropage wp-fault

Message ID 20240904100923.290042-3-dev.jain@arm.com (mailing list archive)
State New, archived
Series Do not shatter hugezeropage on wp-fault

Commit Message

Dev Jain Sept. 4, 2024, 10:09 a.m. UTC
Introduce do_huge_zero_wp_pmd() to handle wp-fault on a hugezeropage and
replace it with a PMD-mapped THP. Change the helpers introduced in the
previous patch to flush TLB entry corresponding to the hugezeropage,
and preserve PMD uffd-wp marker. In case of failure, fall back to
splitting the PMD.

Signed-off-by: Dev Jain <dev.jain@arm.com>
---
 include/linux/huge_mm.h |  6 ++++
 mm/huge_memory.c        | 79 +++++++++++++++++++++++++++++++++++------
 mm/memory.c             |  5 +--
 3 files changed, 78 insertions(+), 12 deletions(-)

Comments

Kirill A. Shutemov Sept. 5, 2024, 8:26 a.m. UTC | #1
On Wed, Sep 04, 2024 at 03:39:23PM +0530, Dev Jain wrote:
> Introduce do_huge_zero_wp_pmd() to handle wp-fault on a hugezeropage and
> replace it with a PMD-mapped THP. Change the helpers introduced in the
> previous patch to flush TLB entry corresponding to the hugezeropage,
> and preserve PMD uffd-wp marker. In case of failure, fallback to
> splitting the PMD.
> 
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> ---
>  include/linux/huge_mm.h |  6 ++++
>  mm/huge_memory.c        | 79 +++++++++++++++++++++++++++++++++++------
>  mm/memory.c             |  5 +--
>  3 files changed, 78 insertions(+), 12 deletions(-)
> 
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index e25d9ebfdf89..fdd2cf473a3c 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -9,6 +9,12 @@
>  #include <linux/kobject.h>
>  
>  vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
> +			   unsigned long haddr, struct folio **foliop,
> +			   unsigned long addr);
> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
> +		 struct vm_area_struct *vma, unsigned long haddr,
> +		 pgtable_t pgtable);

Why? I don't see users outside huge_memory.c.
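
For illustration, a minimal sketch (mine, not part of the posted patch) of what
dropping the huge_mm.h hunk would mean: the helpers simply keep their original
file-local linkage in mm/huge_memory.c, e.g.

	static vm_fault_t thp_fault_alloc(gfp_t gfp, int order,
					  struct vm_area_struct *vma,
					  unsigned long haddr, struct folio **foliop,
					  unsigned long addr)
	{
		/* body unchanged */
	}

	static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
				struct vm_area_struct *vma, unsigned long haddr,
				pgtable_t pgtable)
	{
		/* body unchanged */
	}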

>  int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>  		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
>  		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 58125fbcc532..150163ad77d3 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -943,9 +943,9 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
>  }
>  EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>  
> -static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
> -				  unsigned long haddr, struct folio **foliop,
> -				  unsigned long addr)
> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
> +			   unsigned long haddr, struct folio **foliop,
> +			   unsigned long addr)
>  {
>  	struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
>  
> @@ -984,21 +984,29 @@ static void __thp_fault_success_stats(struct vm_area_struct *vma, int order)
>  	count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>  }
>  
> -static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
> -			struct vm_area_struct *vma, unsigned long haddr,
> -			pgtable_t pgtable)
> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
> +		 struct vm_area_struct *vma, unsigned long haddr,
> +		 pgtable_t pgtable)
>  {
> -	pmd_t entry;
> +	pmd_t entry, old_pmd;
> +	bool is_pmd_none = pmd_none(*vmf->pmd);
>  
>  	entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
>  	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>  	folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>  	folio_add_lru_vma(folio, vma);
> -	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
> +	if (!is_pmd_none) {
> +		old_pmd = pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
> +		if (pmd_uffd_wp(old_pmd))
> +			entry = pmd_mkuffd_wp(entry);
> +	}
> +	if (pgtable)
> +		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>  	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>  	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>  	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
> -	mm_inc_nr_ptes(vma->vm_mm);
> +	if (is_pmd_none)
> +		mm_inc_nr_ptes(vma->vm_mm);
>  }
>  
>  static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
> @@ -1576,6 +1584,50 @@ void huge_pmd_set_accessed(struct vm_fault *vmf)
>  	spin_unlock(vmf->ptl);
>  }
>  
> +static vm_fault_t do_huge_zero_wp_pmd_locked(struct vm_fault *vmf,
> +					     unsigned long haddr,
> +					     struct folio *folio)

Why is the helper needed? Can't it just be open-coded in
do_huge_zero_wp_pmd()?
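
For reference, a rough sketch (mine, not from the posted patch) of the
open-coded form inside do_huge_zero_wp_pmd(), using only calls that already
appear in the patch:

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
		goto unlock;
	ret = check_stable_address_space(vma->vm_mm);
	if (ret)
		goto unlock;
	map_pmd_thp(folio, vmf, vma, haddr, NULL);
	__thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
unlock:
	spin_unlock(vmf->ptl);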

> +{
> +	struct vm_area_struct *vma = vmf->vma;
> +	vm_fault_t ret = 0;
> +
> +	ret = check_stable_address_space(vma->vm_mm);
> +	if (ret)
> +		goto out;
> +	map_pmd_thp(folio, vmf, vma, haddr, NULL);
> +out:
> +	return ret;
> +}
> +
> +static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf, unsigned long haddr)
> +{
> +	struct vm_area_struct *vma = vmf->vma;
> +	gfp_t gfp = vma_thp_gfp_mask(vma);
> +	struct mmu_notifier_range range;
> +	struct folio *folio = NULL;
> +	vm_fault_t ret = 0;
> +
> +	ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
> +			      vmf->address);
> +	if (ret)
> +		goto out;
> +
> +	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
> +				haddr + HPAGE_PMD_SIZE);
> +	mmu_notifier_invalidate_range_start(&range);
> +	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
> +	if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
> +		goto unlock;
> +	ret = do_huge_zero_wp_pmd_locked(vmf, haddr, folio);
> +	if (!ret)
> +		__thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
> +unlock:
> +	spin_unlock(vmf->ptl);
> +	mmu_notifier_invalidate_range_end(&range);
> +out:
> +	return ret;
> +}
> +
>  vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>  {
>  	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
> @@ -1588,8 +1640,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>  	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
>  	VM_BUG_ON_VMA(!vma->anon_vma, vma);
>  
> -	if (is_huge_zero_pmd(orig_pmd))
> +	if (is_huge_zero_pmd(orig_pmd)) {
> +		vm_fault_t ret = do_huge_zero_wp_pmd(vmf, haddr);
> +
> +		if (!(ret & VM_FAULT_FALLBACK))
> +			return ret;
> +
> +		/* Fallback to splitting PMD if THP cannot be allocated */
>  		goto fallback;
> +	}
>  
>  	spin_lock(vmf->ptl);
>  
> diff --git a/mm/memory.c b/mm/memory.c
> index 3c01d68065be..c081a25f5173 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -5409,9 +5409,10 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
>  	if (vma_is_anonymous(vma)) {
>  		if (likely(!unshare) &&
>  		    userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) {
> -			if (userfaultfd_wp_async(vmf->vma))
> +			if (!userfaultfd_wp_async(vmf->vma))
> +				return handle_userfault(vmf, VM_UFFD_WP);
> +			if (!is_huge_zero_pmd(vmf->orig_pmd))
>  				goto split;
> -			return handle_userfault(vmf, VM_UFFD_WP);
>  		}
>  		return do_huge_pmd_wp_page(vmf);
>  	}
> -- 
> 2.30.2
>
Dev Jain Sept. 5, 2024, 8:52 a.m. UTC | #2
On 9/5/24 13:56, Kirill A. Shutemov wrote:
> On Wed, Sep 04, 2024 at 03:39:23PM +0530, Dev Jain wrote:
>> Introduce do_huge_zero_wp_pmd() to handle wp-fault on a hugezeropage and
>> replace it with a PMD-mapped THP. Change the helpers introduced in the
>> previous patch to flush TLB entry corresponding to the hugezeropage,
>> and preserve PMD uffd-wp marker. In case of failure, fallback to
>> splitting the PMD.
>>
>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>> ---
>>   include/linux/huge_mm.h |  6 ++++
>>   mm/huge_memory.c        | 79 +++++++++++++++++++++++++++++++++++------
>>   mm/memory.c             |  5 +--
>>   3 files changed, 78 insertions(+), 12 deletions(-)
>>
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index e25d9ebfdf89..fdd2cf473a3c 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -9,6 +9,12 @@
>>   #include <linux/kobject.h>
>>   
>>   vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
>> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>> +			   unsigned long haddr, struct folio **foliop,
>> +			   unsigned long addr);
>> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>> +		 struct vm_area_struct *vma, unsigned long haddr,
>> +		 pgtable_t pgtable);
> Why? I don't see users outside huge_memory.c.


Ah sorry! When I started out on this I may have planned to use
it in mm/memory.c, but then completely forgot to drop this.

>
>>   int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>>   		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
>>   		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 58125fbcc532..150163ad77d3 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -943,9 +943,9 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
>>   }
>>   EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>>   
>> -static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>> -				  unsigned long haddr, struct folio **foliop,
>> -				  unsigned long addr)
>> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>> +			   unsigned long haddr, struct folio **foliop,
>> +			   unsigned long addr)
>>   {
>>   	struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
>>   
>> @@ -984,21 +984,29 @@ static void __thp_fault_success_stats(struct vm_area_struct *vma, int order)
>>   	count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>>   }
>>   
>> -static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>> -			struct vm_area_struct *vma, unsigned long haddr,
>> -			pgtable_t pgtable)
>> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>> +		 struct vm_area_struct *vma, unsigned long haddr,
>> +		 pgtable_t pgtable)
>>   {
>> -	pmd_t entry;
>> +	pmd_t entry, old_pmd;
>> +	bool is_pmd_none = pmd_none(*vmf->pmd);
>>   
>>   	entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
>>   	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>>   	folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>>   	folio_add_lru_vma(folio, vma);
>> -	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>> +	if (!is_pmd_none) {
>> +		old_pmd = pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
>> +		if (pmd_uffd_wp(old_pmd))
>> +			entry = pmd_mkuffd_wp(entry);
>> +	}
>> +	if (pgtable)
>> +		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>>   	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>>   	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>>   	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>> -	mm_inc_nr_ptes(vma->vm_mm);
>> +	if (is_pmd_none)
>> +		mm_inc_nr_ptes(vma->vm_mm);
>>   }
>>   
>>   static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>> @@ -1576,6 +1584,50 @@ void huge_pmd_set_accessed(struct vm_fault *vmf)
>>   	spin_unlock(vmf->ptl);
>>   }
>>   
>> +static vm_fault_t do_huge_zero_wp_pmd_locked(struct vm_fault *vmf,
>> +					     unsigned long haddr,
>> +					     struct folio *folio)
> Why is the helper needed? Can't it just be open-coded in
> do_huge_zero_wp_pmd()?

I was taking inspiration from split_huge_pud_locked() and the like,
but I guess I should open-code it, since do_huge_zero_wp_pmd_locked()
is small anyway.

>
>> +{
>> +	struct vm_area_struct *vma = vmf->vma;
>> +	vm_fault_t ret = 0;
>> +
>> +	ret = check_stable_address_space(vma->vm_mm);
>> +	if (ret)
>> +		goto out;
>> +	map_pmd_thp(folio, vmf, vma, haddr, NULL);
>> +out:
>> +	return ret;
>> +}
>> +
>> +static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf, unsigned long haddr)
>> +{
>> +	struct vm_area_struct *vma = vmf->vma;
>> +	gfp_t gfp = vma_thp_gfp_mask(vma);
>> +	struct mmu_notifier_range range;
>> +	struct folio *folio = NULL;
>> +	vm_fault_t ret = 0;
>> +
>> +	ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
>> +			      vmf->address);
>> +	if (ret)
>> +		goto out;
>> +
>> +	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
>> +				haddr + HPAGE_PMD_SIZE);
>> +	mmu_notifier_invalidate_range_start(&range);
>> +	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>> +	if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
>> +		goto unlock;
>> +	ret = do_huge_zero_wp_pmd_locked(vmf, haddr, folio);
>> +	if (!ret)
>> +		__thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
>> +unlock:
>> +	spin_unlock(vmf->ptl);
>> +	mmu_notifier_invalidate_range_end(&range);
>> +out:
>> +	return ret;
>> +}
>> +
>>   vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>>   {
>>   	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
>> @@ -1588,8 +1640,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>>   	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
>>   	VM_BUG_ON_VMA(!vma->anon_vma, vma);
>>   
>> -	if (is_huge_zero_pmd(orig_pmd))
>> +	if (is_huge_zero_pmd(orig_pmd)) {
>> +		vm_fault_t ret = do_huge_zero_wp_pmd(vmf, haddr);
>> +
>> +		if (!(ret & VM_FAULT_FALLBACK))
>> +			return ret;
>> +
>> +		/* Fallback to splitting PMD if THP cannot be allocated */
>>   		goto fallback;
>> +	}
>>   
>>   	spin_lock(vmf->ptl);
>>   
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 3c01d68065be..c081a25f5173 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -5409,9 +5409,10 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
>>   	if (vma_is_anonymous(vma)) {
>>   		if (likely(!unshare) &&
>>   		    userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) {
>> -			if (userfaultfd_wp_async(vmf->vma))
>> +			if (!userfaultfd_wp_async(vmf->vma))
>> +				return handle_userfault(vmf, VM_UFFD_WP);
>> +			if (!is_huge_zero_pmd(vmf->orig_pmd))
>>   				goto split;
>> -			return handle_userfault(vmf, VM_UFFD_WP);
>>   		}
>>   		return do_huge_pmd_wp_page(vmf);
>>   	}
>> -- 
>> 2.30.2
>>
Kefeng Wang Sept. 5, 2024, 9:41 a.m. UTC | #3
On 2024/9/5 16:26, Kirill A. Shutemov wrote:
> On Wed, Sep 04, 2024 at 03:39:23PM +0530, Dev Jain wrote:
>> Introduce do_huge_zero_wp_pmd() to handle wp-fault on a hugezeropage and
>> replace it with a PMD-mapped THP. Change the helpers introduced in the
>> previous patch to flush TLB entry corresponding to the hugezeropage,
>> and preserve PMD uffd-wp marker. In case of failure, fallback to
>> splitting the PMD.

We hit the same issue, and have a very similar function in our kernel.
>>
>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>> ---
>>   include/linux/huge_mm.h |  6 ++++
>>   mm/huge_memory.c        | 79 +++++++++++++++++++++++++++++++++++------
>>   mm/memory.c             |  5 +--
>>   3 files changed, 78 insertions(+), 12 deletions(-)
>>
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index e25d9ebfdf89..fdd2cf473a3c 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -9,6 +9,12 @@
>>   #include <linux/kobject.h>
>>   
>>   vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
>> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>> +			   unsigned long haddr, struct folio **foliop,
>> +			   unsigned long addr);
>> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>> +		 struct vm_area_struct *vma, unsigned long haddr,
>> +		 pgtable_t pgtable);
> 
> Why? I don't see users outside huge_memory.c.
> 
>>   int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>>   		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
>>   		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 58125fbcc532..150163ad77d3 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -943,9 +943,9 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
>>   }
>>   EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>>   
>> -static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>> -				  unsigned long haddr, struct folio **foliop,
>> -				  unsigned long addr)
>> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>> +			   unsigned long haddr, struct folio **foliop,
>> +			   unsigned long addr)
>>   {
>>   	struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
>>   
>> @@ -984,21 +984,29 @@ static void __thp_fault_success_stats(struct vm_area_struct *vma, int order)
>>   	count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>>   }
>>   
>> -static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>> -			struct vm_area_struct *vma, unsigned long haddr,
>> -			pgtable_t pgtable)
>> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>> +		 struct vm_area_struct *vma, unsigned long haddr,
>> +		 pgtable_t pgtable)
>>   {
>> -	pmd_t entry;
>> +	pmd_t entry, old_pmd;
>> +	bool is_pmd_none = pmd_none(*vmf->pmd);
>>   
>>   	entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
>>   	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>>   	folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>>   	folio_add_lru_vma(folio, vma);
>> -	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>> +	if (!is_pmd_none) {
>> +		old_pmd = pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
>> +		if (pmd_uffd_wp(old_pmd))
>> +			entry = pmd_mkuffd_wp(entry);
>> +	}
>> +	if (pgtable)
>> +		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>>   	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>>   	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>>   	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>> -	mm_inc_nr_ptes(vma->vm_mm);
>> +	if (is_pmd_none)
>> +		mm_inc_nr_ptes(vma->vm_mm);
>>   }
>>   
>>   static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>> @@ -1576,6 +1584,50 @@ void huge_pmd_set_accessed(struct vm_fault *vmf)
>>   	spin_unlock(vmf->ptl);
>>   }
>>   
>> +static vm_fault_t do_huge_zero_wp_pmd_locked(struct vm_fault *vmf,
>> +					     unsigned long haddr,
>> +					     struct folio *folio)
> 
> Why is the helper needed? Can't it just be open-coded in
> do_huge_zero_wp_pmd()?
> 
>> +{
>> +	struct vm_area_struct *vma = vmf->vma;
>> +	vm_fault_t ret = 0;
>> +
>> +	ret = check_stable_address_space(vma->vm_mm);
>> +	if (ret)
>> +		goto out;
>> +	map_pmd_thp(folio, vmf, vma, haddr, NULL);
>> +out:
>> +	return ret;
>> +}
>> +
>> +static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf, unsigned long haddr)
>> +{
>> +	struct vm_area_struct *vma = vmf->vma;
>> +	gfp_t gfp = vma_thp_gfp_mask(vma);
>> +	struct mmu_notifier_range range;
>> +	struct folio *folio = NULL;
>> +	vm_fault_t ret = 0;
>> +
>> +	ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
>> +			      vmf->address);
>> +	if (ret)
>> +		goto out;
>> +
>> +	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
>> +				haddr + HPAGE_PMD_SIZE);
>> +	mmu_notifier_invalidate_range_start(&range);
>> +	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>> +	if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
>> +		goto unlock;
>> +	ret = do_huge_zero_wp_pmd_locked(vmf, haddr, folio);
>> +	if (!ret)
>> +		__thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
>> +unlock:
>> +	spin_unlock(vmf->ptl);
>> +	mmu_notifier_invalidate_range_end(&range);

The folio needs to be released when !pmd_same() fails, and when
check_stable_address_space() returns VM_FAULT_SIGBUS.
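
For illustration only, one way that could look on top of the posted code (a
sketch, not the final fix):

	if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd))) {
		folio_put(folio);	/* never mapped, drop it */
		goto unlock;
	}
	ret = do_huge_zero_wp_pmd_locked(vmf, haddr, folio);
	if (ret)
		folio_put(folio);	/* VM_FAULT_SIGBUS path */
	else
		__thp_fault_success_stats(vma, HPAGE_PMD_ORDER);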

>> +out:
>> +	return ret;
>> +}
>> +
>>   vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>>   {
>>   	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
>> @@ -1588,8 +1640,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>>   	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
>>   	VM_BUG_ON_VMA(!vma->anon_vma, vma);
>>   
>> -	if (is_huge_zero_pmd(orig_pmd))
>> +	if (is_huge_zero_pmd(orig_pmd)) {
>> +		vm_fault_t ret = do_huge_zero_wp_pmd(vmf, haddr);
>> +
>> +		if (!(ret & VM_FAULT_FALLBACK))
>> +			return ret;
>> +
>> +		/* Fallback to splitting PMD if THP cannot be allocated */
>>   		goto fallback;
>> +	}
>>   
>>   	spin_lock(vmf->ptl);
>>   
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 3c01d68065be..c081a25f5173 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -5409,9 +5409,10 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
>>   	if (vma_is_anonymous(vma)) {
>>   		if (likely(!unshare) &&
>>   		    userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) {
>> -			if (userfaultfd_wp_async(vmf->vma))
>> +			if (!userfaultfd_wp_async(vmf->vma))
>> +				return handle_userfault(vmf, VM_UFFD_WP);
>> +			if (!is_huge_zero_pmd(vmf->orig_pmd))
>>   				goto split;
>> -			return handle_userfault(vmf, VM_UFFD_WP);
>>   		}
>>   		return do_huge_pmd_wp_page(vmf);
>>   	}
>> -- 
>> 2.30.2
>>
>
Dev Jain Sept. 5, 2024, 9:53 a.m. UTC | #4
On 9/5/24 15:11, Kefeng Wang wrote:
>
>
> On 2024/9/5 16:26, Kirill A. Shutemov wrote:
>> On Wed, Sep 04, 2024 at 03:39:23PM +0530, Dev Jain wrote:
>>> Introduce do_huge_zero_wp_pmd() to handle wp-fault on a hugezeropage 
>>> and
>>> replace it with a PMD-mapped THP. Change the helpers introduced in the
>>> previous patch to flush TLB entry corresponding to the hugezeropage,
>>> and preserve PMD uffd-wp marker. In case of failure, fallback to
>>> splitting the PMD.
>
> We hit the same issue, and have a very similar function in our kernel.
>>>
>>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>>> ---
>>>   include/linux/huge_mm.h |  6 ++++
>>>   mm/huge_memory.c        | 79 
>>> +++++++++++++++++++++++++++++++++++------
>>>   mm/memory.c             |  5 +--
>>>   3 files changed, 78 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>> index e25d9ebfdf89..fdd2cf473a3c 100644
>>> --- a/include/linux/huge_mm.h
>>> +++ b/include/linux/huge_mm.h
>>> @@ -9,6 +9,12 @@
>>>   #include <linux/kobject.h>
>>>     vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
>>> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct 
>>> vm_area_struct *vma,
>>> +               unsigned long haddr, struct folio **foliop,
>>> +               unsigned long addr);
>>> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>>> +         struct vm_area_struct *vma, unsigned long haddr,
>>> +         pgtable_t pgtable);
>>
>> Why? I don't see users outside huge_memory.c.
>>
>>>   int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>>>             pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
>>>             struct vm_area_struct *dst_vma, struct vm_area_struct 
>>> *src_vma);
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index 58125fbcc532..150163ad77d3 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -943,9 +943,9 @@ unsigned long thp_get_unmapped_area(struct file 
>>> *filp, unsigned long addr,
>>>   }
>>>   EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>>>   -static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct 
>>> vm_area_struct *vma,
>>> -                  unsigned long haddr, struct folio **foliop,
>>> -                  unsigned long addr)
>>> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct 
>>> vm_area_struct *vma,
>>> +               unsigned long haddr, struct folio **foliop,
>>> +               unsigned long addr)
>>>   {
>>>       struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, 
>>> true);
>>>   @@ -984,21 +984,29 @@ static void __thp_fault_success_stats(struct 
>>> vm_area_struct *vma, int order)
>>>       count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>>>   }
>>>   -static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>>> -            struct vm_area_struct *vma, unsigned long haddr,
>>> -            pgtable_t pgtable)
>>> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>>> +         struct vm_area_struct *vma, unsigned long haddr,
>>> +         pgtable_t pgtable)
>>>   {
>>> -    pmd_t entry;
>>> +    pmd_t entry, old_pmd;
>>> +    bool is_pmd_none = pmd_none(*vmf->pmd);
>>>         entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
>>>       entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>>>       folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>>>       folio_add_lru_vma(folio, vma);
>>> -    pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>>> +    if (!is_pmd_none) {
>>> +        old_pmd = pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
>>> +        if (pmd_uffd_wp(old_pmd))
>>> +            entry = pmd_mkuffd_wp(entry);
>>> +    }
>>> +    if (pgtable)
>>> +        pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>>>       set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>>>       update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>>>       add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>>> -    mm_inc_nr_ptes(vma->vm_mm);
>>> +    if (is_pmd_none)
>>> +        mm_inc_nr_ptes(vma->vm_mm);
>>>   }
>>>     static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault 
>>> *vmf)
>>> @@ -1576,6 +1584,50 @@ void huge_pmd_set_accessed(struct vm_fault *vmf)
>>>       spin_unlock(vmf->ptl);
>>>   }
>>>   +static vm_fault_t do_huge_zero_wp_pmd_locked(struct vm_fault *vmf,
>>> +                         unsigned long haddr,
>>> +                         struct folio *folio)
>>
>> Why is the helper needed? Can't it just be open-coded in
>> do_huge_zero_wp_pmd()?
>>
>>> +{
>>> +    struct vm_area_struct *vma = vmf->vma;
>>> +    vm_fault_t ret = 0;
>>> +
>>> +    ret = check_stable_address_space(vma->vm_mm);
>>> +    if (ret)
>>> +        goto out;
>>> +    map_pmd_thp(folio, vmf, vma, haddr, NULL);
>>> +out:
>>> +    return ret;
>>> +}
>>> +
>>> +static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf, 
>>> unsigned long haddr)
>>> +{
>>> +    struct vm_area_struct *vma = vmf->vma;
>>> +    gfp_t gfp = vma_thp_gfp_mask(vma);
>>> +    struct mmu_notifier_range range;
>>> +    struct folio *folio = NULL;
>>> +    vm_fault_t ret = 0;
>>> +
>>> +    ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
>>> +                  vmf->address);
>>> +    if (ret)
>>> +        goto out;
>>> +
>>> +    mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, 
>>> vma->vm_mm, haddr,
>>> +                haddr + HPAGE_PMD_SIZE);
>>> +    mmu_notifier_invalidate_range_start(&range);
>>> +    vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>>> +    if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
>>> +        goto unlock;
>>> +    ret = do_huge_zero_wp_pmd_locked(vmf, haddr, folio);
>>> +    if (!ret)
>>> +        __thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
>>> +unlock:
>>> +    spin_unlock(vmf->ptl);
>>> +    mmu_notifier_invalidate_range_end(&range);
>
> The folio needs to be released when !pmd_same() fails, and when
> check_stable_address_space() returns VM_FAULT_SIGBUS.

Yes, thanks.
>
>>> +out:
>>> +    return ret;
>>> +}
>>> +
>>>
>>
Ryan Roberts Sept. 5, 2024, 1:14 p.m. UTC | #5
On 04/09/2024 11:09, Dev Jain wrote:
> Introduce do_huge_zero_wp_pmd() to handle wp-fault on a hugezeropage and
> replace it with a PMD-mapped THP. Change the helpers introduced in the
> previous patch to flush TLB entry corresponding to the hugezeropage,
> and preserve PMD uffd-wp marker. In case of failure, fallback to
> splitting the PMD.
> 
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> ---
>  include/linux/huge_mm.h |  6 ++++
>  mm/huge_memory.c        | 79 +++++++++++++++++++++++++++++++++++------
>  mm/memory.c             |  5 +--
>  3 files changed, 78 insertions(+), 12 deletions(-)
> 
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index e25d9ebfdf89..fdd2cf473a3c 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -9,6 +9,12 @@
>  #include <linux/kobject.h>
>  
>  vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
> +			   unsigned long haddr, struct folio **foliop,
> +			   unsigned long addr);
> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
> +		 struct vm_area_struct *vma, unsigned long haddr,
> +		 pgtable_t pgtable);

I don't think you are using either of these outside of huge_memory.c, so not
sure you need to declare them here or make them non-static?

>  int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>  		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
>  		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 58125fbcc532..150163ad77d3 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -943,9 +943,9 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
>  }
>  EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>  
> -static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
> -				  unsigned long haddr, struct folio **foliop,
> -				  unsigned long addr)
> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
> +			   unsigned long haddr, struct folio **foliop,
> +			   unsigned long addr)
>  {
>  	struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
>  
> @@ -984,21 +984,29 @@ static void __thp_fault_success_stats(struct vm_area_struct *vma, int order)
>  	count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>  }
>  
> -static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
> -			struct vm_area_struct *vma, unsigned long haddr,
> -			pgtable_t pgtable)
> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
> +		 struct vm_area_struct *vma, unsigned long haddr,
> +		 pgtable_t pgtable)
>  {
> -	pmd_t entry;
> +	pmd_t entry, old_pmd;
> +	bool is_pmd_none = pmd_none(*vmf->pmd);
>  
>  	entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
>  	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>  	folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>  	folio_add_lru_vma(folio, vma);
> -	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
> +	if (!is_pmd_none) {
> +		old_pmd = pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
> +		if (pmd_uffd_wp(old_pmd))
> +			entry = pmd_mkuffd_wp(entry);

I don't really get this; entry is writable, so I wouldn't expect to also be
setting uffd-wp here? That combination is not allowed and is checked for in
page_table_check_pte_flags().

It looks like you expect to get here in the uffd-wp-async case, which used to
cause the pmd to be split to ptes. I'm guessing (but don't know for sure) that
would cause the uffd-wp bit to be set in each of the new ptes, then during
fallback to handling the wp fault on the pte, uffd would handle it?

> +	}
> +	if (pgtable)
> +		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);

Should this call be moved outside of here? It doesn't really feel like it
belongs. Could it be called before calling map_pmd_thp() for the site that has a
pgtable?
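
A sketch of that alternative (the dropped pgtable parameter is my assumption,
not something the patch does): the one caller that owns a pgtable deposits it
itself, under the same PTL, just before mapping:

	/* in __do_huge_pmd_anonymous_page(), with vmf->ptl held */
	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
	map_pmd_thp(folio, vmf, vma, haddr);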

>  	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>  	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>  	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
> -	mm_inc_nr_ptes(vma->vm_mm);
> +	if (is_pmd_none)
> +		mm_inc_nr_ptes(vma->vm_mm);
>  }
>  
>  static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
> @@ -1576,6 +1584,50 @@ void huge_pmd_set_accessed(struct vm_fault *vmf)
>  	spin_unlock(vmf->ptl);
>  }
>  
> +static vm_fault_t do_huge_zero_wp_pmd_locked(struct vm_fault *vmf,
> +					     unsigned long haddr,
> +					     struct folio *folio)
> +{
> +	struct vm_area_struct *vma = vmf->vma;
> +	vm_fault_t ret = 0;
> +
> +	ret = check_stable_address_space(vma->vm_mm);
> +	if (ret)
> +		goto out;
> +	map_pmd_thp(folio, vmf, vma, haddr, NULL);
> +out:
> +	return ret;
> +}
> +
> +static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf, unsigned long haddr)
> +{
> +	struct vm_area_struct *vma = vmf->vma;
> +	gfp_t gfp = vma_thp_gfp_mask(vma);
> +	struct mmu_notifier_range range;
> +	struct folio *folio = NULL;
> +	vm_fault_t ret = 0;
> +
> +	ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
> +			      vmf->address);

Just checking: the PTE table was already allocated during the read fault, right?
So we don't have to allocate it here.

> +	if (ret)
> +		goto out;
> +
> +	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
> +				haddr + HPAGE_PMD_SIZE);
> +	mmu_notifier_invalidate_range_start(&range);
> +	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
> +	if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
> +		goto unlock;
> +	ret = do_huge_zero_wp_pmd_locked(vmf, haddr, folio);
> +	if (!ret)
> +		__thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
> +unlock:
> +	spin_unlock(vmf->ptl);
> +	mmu_notifier_invalidate_range_end(&range);

I'll confess I don't understand all the mmu notifier rules. But the doc at
Documentation/mm/mmu_notifier.rst implies that the notification must be done
while holding the PTL, although that's not how wp_page_copy() does it. Are you
confident that what you have done is correct?

Thanks,
Ryan

> +out:
> +	return ret;
> +}
> +
>  vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>  {
>  	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
> @@ -1588,8 +1640,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>  	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
>  	VM_BUG_ON_VMA(!vma->anon_vma, vma);
>  
> -	if (is_huge_zero_pmd(orig_pmd))
> +	if (is_huge_zero_pmd(orig_pmd)) {
> +		vm_fault_t ret = do_huge_zero_wp_pmd(vmf, haddr);
> +
> +		if (!(ret & VM_FAULT_FALLBACK))
> +			return ret;
> +
> +		/* Fallback to splitting PMD if THP cannot be allocated */
>  		goto fallback;
> +	}
>  
>  	spin_lock(vmf->ptl);
>  
> diff --git a/mm/memory.c b/mm/memory.c
> index 3c01d68065be..c081a25f5173 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -5409,9 +5409,10 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
>  	if (vma_is_anonymous(vma)) {
>  		if (likely(!unshare) &&
>  		    userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) {
> -			if (userfaultfd_wp_async(vmf->vma))
> +			if (!userfaultfd_wp_async(vmf->vma))
> +				return handle_userfault(vmf, VM_UFFD_WP);
> +			if (!is_huge_zero_pmd(vmf->orig_pmd))
>  				goto split;
> -			return handle_userfault(vmf, VM_UFFD_WP);
>  		}
>  		return do_huge_pmd_wp_page(vmf);
>  	}
Dev Jain Sept. 6, 2024, 7:05 a.m. UTC | #6
On 9/5/24 18:44, Ryan Roberts wrote:
> On 04/09/2024 11:09, Dev Jain wrote:
>> Introduce do_huge_zero_wp_pmd() to handle wp-fault on a hugezeropage and
>> replace it with a PMD-mapped THP. Change the helpers introduced in the
>> previous patch to flush TLB entry corresponding to the hugezeropage,
>> and preserve PMD uffd-wp marker. In case of failure, fallback to
>> splitting the PMD.
>>
>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>> ---
>>   include/linux/huge_mm.h |  6 ++++
>>   mm/huge_memory.c        | 79 +++++++++++++++++++++++++++++++++++------
>>   mm/memory.c             |  5 +--
>>   3 files changed, 78 insertions(+), 12 deletions(-)
>>
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index e25d9ebfdf89..fdd2cf473a3c 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -9,6 +9,12 @@
>>   #include <linux/kobject.h>
>>   
>>   vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
>> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>> +			   unsigned long haddr, struct folio **foliop,
>> +			   unsigned long addr);
>> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>> +		 struct vm_area_struct *vma, unsigned long haddr,
>> +		 pgtable_t pgtable);
> I don't think you are using either of these outside of huge_memory.c, so not
> sure you need to declare them here or make them non-static?

As pointed out by Kirill, you are right, I forgot to drop these from my previous
approach.

>
>>   int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>>   		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
>>   		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 58125fbcc532..150163ad77d3 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -943,9 +943,9 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
>>   }
>>   EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>>   
>> -static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>> -				  unsigned long haddr, struct folio **foliop,
>> -				  unsigned long addr)
>> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>> +			   unsigned long haddr, struct folio **foliop,
>> +			   unsigned long addr)
>>   {
>>   	struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
>>   
>> @@ -984,21 +984,29 @@ static void __thp_fault_success_stats(struct vm_area_struct *vma, int order)
>>   	count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>>   }
>>   
>> -static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>> -			struct vm_area_struct *vma, unsigned long haddr,
>> -			pgtable_t pgtable)
>> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>> +		 struct vm_area_struct *vma, unsigned long haddr,
>> +		 pgtable_t pgtable)
>>   {
>> -	pmd_t entry;
>> +	pmd_t entry, old_pmd;
>> +	bool is_pmd_none = pmd_none(*vmf->pmd);
>>   
>>   	entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
>>   	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>>   	folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>>   	folio_add_lru_vma(folio, vma);
>> -	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>> +	if (!is_pmd_none) {
>> +		old_pmd = pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
>> +		if (pmd_uffd_wp(old_pmd))
>> +			entry = pmd_mkuffd_wp(entry);
> I don't really get this; entry is writable, so I wouldn't expect to also be
> setting uffd-wp here? That combination is not allowed and is checked for in
> page_table_check_pte_flags().
>
> It looks like you expect to get here in the uffd-wp-async case, which used to
> cause the pmd to be split to ptes. I'm guessing (but don't know for sure) that
> would cause the uffd-wp bit to be set in each of the new ptes, then during
> fallback to handling the wp fault on the pte, uffd would handle it?

I guess you are correct; I missed the WARN_ON() in page_table_check_pmd_flags(),
but I did see, if I read the uffd code correctly, that mfill_atomic() will just
return in case of pmd_trans_huge(*dst_pmd) while doing a uffd_copy to the destination
location. So preserving pmd_uffd_wp is useless in case a THP is mapped, but I did not
know that in fact it is supposed to be an invalid combination. So, I will drop it,
unless someone has some other objection.
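
In other words, a sketch of what would remain of the !is_pmd_none branch (mine,
untested): only the flush of the old zero-page entry is kept.

	if (!is_pmd_none)
		pmdp_huge_clear_flush(vma, haddr, vmf->pmd);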

>
>> +	}
>> +	if (pgtable)
>> +		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
> Should this call be moved outside of here? It doesn't really feel like it
> belongs. Could it be called before calling map_pmd_thp() for the site that has a
> pgtable?

Every other place I checked, they are doing this: make the entry -> deposit pgtable ->
set_pmd_at(). I guess the general flow is to do the deposit based on the old pmd, before
setting the new one. Which implies: I should at least move this check before I call
pmdp_huge_clear_flush(). And, since vmf->pmd and creating the new entry have no relation,
I am inclined to do what you are saying.

>
>>   	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>>   	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>>   	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>> -	mm_inc_nr_ptes(vma->vm_mm);
>> +	if (is_pmd_none)
>> +		mm_inc_nr_ptes(vma->vm_mm);
>>   }
>>   
>>   static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>> @@ -1576,6 +1584,50 @@ void huge_pmd_set_accessed(struct vm_fault *vmf)
>>   	spin_unlock(vmf->ptl);
>>   }
>>   
>> +static vm_fault_t do_huge_zero_wp_pmd_locked(struct vm_fault *vmf,
>> +					     unsigned long haddr,
>> +					     struct folio *folio)
>> +{
>> +	struct vm_area_struct *vma = vmf->vma;
>> +	vm_fault_t ret = 0;
>> +
>> +	ret = check_stable_address_space(vma->vm_mm);
>> +	if (ret)
>> +		goto out;
>> +	map_pmd_thp(folio, vmf, vma, haddr, NULL);
>> +out:
>> +	return ret;
>> +}
>> +
>> +static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf, unsigned long haddr)
>> +{
>> +	struct vm_area_struct *vma = vmf->vma;
>> +	gfp_t gfp = vma_thp_gfp_mask(vma);
>> +	struct mmu_notifier_range range;
>> +	struct folio *folio = NULL;
>> +	vm_fault_t ret = 0;
>> +
>> +	ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
>> +			      vmf->address);
> Just checking: the PTE table was already allocated during the read fault, right?
> So we don't have to allocate it here.

Correct, that happens in set_huge_zero_folio(). Thanks for checking.

>
>> +	if (ret)
>> +		goto out;
>> +
>> +	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
>> +				haddr + HPAGE_PMD_SIZE);
>> +	mmu_notifier_invalidate_range_start(&range);
>> +	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>> +	if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
>> +		goto unlock;
>> +	ret = do_huge_zero_wp_pmd_locked(vmf, haddr, folio);
>> +	if (!ret)
>> +		__thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
>> +unlock:
>> +	spin_unlock(vmf->ptl);
>> +	mmu_notifier_invalidate_range_end(&range);
> I'll confess I don't understand all the mmu notifier rules.

I confess the same :)

> But the doc at
> Documentation/mm/mmu_notifier.rst implies that the notification must be done
> while holding the PTL, although that's not how wp_page_copy() does it. Are you
> confident that what you have done is correct?

Everywhere else, invalidate_range_end() is getting called after dropping the lock;
one reason is that it has a might_sleep(), and therefore we cannot call it while
holding a spinlock. I still don't know what exactly these calls mean...but I think
what I have done is correct.
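
For reference, the ordering the patch follows (just a sketch of the pattern
already in the posted code, not new logic):

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
				haddr + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);	/* may sleep */

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	/* ... install the new PMD ... */
	spin_unlock(vmf->ptl);

	mmu_notifier_invalidate_range_end(&range);	/* may sleep, hence after PTL */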

>
> Thanks,
> Ryan
>
>> +out:
>> +	return ret;
>> +}
>> +
>>   vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>>   {
>>   	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
>> @@ -1588,8 +1640,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>>   	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
>>   	VM_BUG_ON_VMA(!vma->anon_vma, vma);
>>   
>> -	if (is_huge_zero_pmd(orig_pmd))
>> +	if (is_huge_zero_pmd(orig_pmd)) {
>> +		vm_fault_t ret = do_huge_zero_wp_pmd(vmf, haddr);
>> +
>> +		if (!(ret & VM_FAULT_FALLBACK))
>> +			return ret;
>> +
>> +		/* Fallback to splitting PMD if THP cannot be allocated */
>>   		goto fallback;
>> +	}
>>   
>>   	spin_lock(vmf->ptl);
>>   
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 3c01d68065be..c081a25f5173 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -5409,9 +5409,10 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
>>   	if (vma_is_anonymous(vma)) {
>>   		if (likely(!unshare) &&
>>   		    userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) {
>> -			if (userfaultfd_wp_async(vmf->vma))
>> +			if (!userfaultfd_wp_async(vmf->vma))
>> +				return handle_userfault(vmf, VM_UFFD_WP);
>> +			if (!is_huge_zero_pmd(vmf->orig_pmd))
>>   				goto split;
>> -			return handle_userfault(vmf, VM_UFFD_WP);
>>   		}
>>   		return do_huge_pmd_wp_page(vmf);
>>   	}
Ryan Roberts Sept. 6, 2024, 8:43 a.m. UTC | #7
On 06/09/2024 08:05, Dev Jain wrote:
> 
> On 9/5/24 18:44, Ryan Roberts wrote:
>> On 04/09/2024 11:09, Dev Jain wrote:
>>> Introduce do_huge_zero_wp_pmd() to handle wp-fault on a hugezeropage and
>>> replace it with a PMD-mapped THP. Change the helpers introduced in the
>>> previous patch to flush TLB entry corresponding to the hugezeropage,
>>> and preserve PMD uffd-wp marker. In case of failure, fallback to
>>> splitting the PMD.
>>>
>>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>>> ---
>>>   include/linux/huge_mm.h |  6 ++++
>>>   mm/huge_memory.c        | 79 +++++++++++++++++++++++++++++++++++------
>>>   mm/memory.c             |  5 +--
>>>   3 files changed, 78 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>> index e25d9ebfdf89..fdd2cf473a3c 100644
>>> --- a/include/linux/huge_mm.h
>>> +++ b/include/linux/huge_mm.h
>>> @@ -9,6 +9,12 @@
>>>   #include <linux/kobject.h>
>>>     vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
>>> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>>> +               unsigned long haddr, struct folio **foliop,
>>> +               unsigned long addr);
>>> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>>> +         struct vm_area_struct *vma, unsigned long haddr,
>>> +         pgtable_t pgtable);
>> I don't think you are using either of these outside of huge_memory.c, so not
>> sure you need to declare them here or make them non-static?
> 
> As pointed out by Kirill, you are right, I forgot to drop these from my previous
> approach.
> 
>>
>>>   int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>>>             pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
>>>             struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index 58125fbcc532..150163ad77d3 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -943,9 +943,9 @@ unsigned long thp_get_unmapped_area(struct file *filp,
>>> unsigned long addr,
>>>   }
>>>   EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>>>   -static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct
>>> vm_area_struct *vma,
>>> -                  unsigned long haddr, struct folio **foliop,
>>> -                  unsigned long addr)
>>> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>>> +               unsigned long haddr, struct folio **foliop,
>>> +               unsigned long addr)
>>>   {
>>>       struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
>>>   @@ -984,21 +984,29 @@ static void __thp_fault_success_stats(struct
>>> vm_area_struct *vma, int order)
>>>       count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>>>   }
>>>   -static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>>> -            struct vm_area_struct *vma, unsigned long haddr,
>>> -            pgtable_t pgtable)
>>> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>>> +         struct vm_area_struct *vma, unsigned long haddr,
>>> +         pgtable_t pgtable)
>>>   {
>>> -    pmd_t entry;
>>> +    pmd_t entry, old_pmd;
>>> +    bool is_pmd_none = pmd_none(*vmf->pmd);
>>>         entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
>>>       entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>>>       folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>>>       folio_add_lru_vma(folio, vma);
>>> -    pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>>> +    if (!is_pmd_none) {
>>> +        old_pmd = pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
>>> +        if (pmd_uffd_wp(old_pmd))
>>> +            entry = pmd_mkuffd_wp(entry);
>> I don't really get this; entry is writable, so I wouldn't expect to also be
>> setting uffd-wp here? That combination is not allowed and is checked for in
>> page_table_check_pte_flags().
>>
>> It looks like you expect to get here in the uffd-wp-async case, which used to
>> cause the pmd to be split to ptes. I'm guessing (but don't know for sure) that
>> would cause the uffd-wp bit to be set in each of the new ptes, then during
>> fallback to handling the wp fault on the pte, uffd would handle it?
> 
> I guess you are correct; I missed the WARN_ON() in page_table_check_pmd_flags(),
> but I did see, if I read the uffd code correctly, that mfill_atomic() will just
> return in case of pmd_trans_huge(*dst_pmd) while doing a uffd_copy to the
> destination
> location. So preserving pmd_uffd_wp is useless in case a THP is mapped, but I
> did not
> know that in fact it is supposed to be an invalid combination. So, I will drop it,
> unless someone has some other objection.

So what's the correct way to handle uffd-wp-async in wp_huge_pmd()? Just split
it? If so, you can revert your changes in memory.c.
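
For reference, the pre-patch shape of that check in wp_huge_pmd(), i.e. what
reverting the mm/memory.c hunk restores (taken from the '-' lines of the diff):

		if (likely(!unshare) &&
		    userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) {
			if (userfaultfd_wp_async(vmf->vma))
				goto split;
			return handle_userfault(vmf, VM_UFFD_WP);
		}
		return do_huge_pmd_wp_page(vmf);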

> 
>>
>>> +    }
>>> +    if (pgtable)
>>> +        pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>> Should this call be moved outside of here? It doesn't really feel like it
>> belongs. Could it be called before calling map_pmd_thp() for the site that has a
>> pgtable?
> 
> Every other place I checked, they are doing this: make the entry -> deposit
> pgtable ->
> set_pmd_at(). I guess the general flow is to do the deposit based on the old
> pmd, before
> setting the new one. Which implies: I should at least move this check before I call
> pmdp_huge_clear_flush(). And, since vmf->pmd and creating the new entry has no
> relation,
> I am inclined to do what you are saying.

pgtable_trans_huge_deposit() is just adding the pgtable to a list so that if the
THP needs to be split in future, then we have preallocated the pte pgtable so
the operation can't fail. And enqueueing it is just under the protection of the
PTL as far as I can tell. So I think the only ordering requirement is that you
both set the pmd and deposit the pgtable under the lock (without dropping it in
between). So you can deposit either before or after map_pmd_thp(). And
pmdp_huge_clear_flush() is irrelevant, I think?

> 
>>
>>>       set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>>>       update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>>>       add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>>> -    mm_inc_nr_ptes(vma->vm_mm);
>>> +    if (is_pmd_none)
>>> +        mm_inc_nr_ptes(vma->vm_mm);
>>>   }
>>>     static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>>> @@ -1576,6 +1584,50 @@ void huge_pmd_set_accessed(struct vm_fault *vmf)
>>>       spin_unlock(vmf->ptl);
>>>   }
>>>   +static vm_fault_t do_huge_zero_wp_pmd_locked(struct vm_fault *vmf,
>>> +                         unsigned long haddr,
>>> +                         struct folio *folio)
>>> +{
>>> +    struct vm_area_struct *vma = vmf->vma;
>>> +    vm_fault_t ret = 0;
>>> +
>>> +    ret = check_stable_address_space(vma->vm_mm);
>>> +    if (ret)
>>> +        goto out;
>>> +    map_pmd_thp(folio, vmf, vma, haddr, NULL);
>>> +out:
>>> +    return ret;
>>> +}
>>> +
>>> +static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf, unsigned long
>>> haddr)
>>> +{
>>> +    struct vm_area_struct *vma = vmf->vma;
>>> +    gfp_t gfp = vma_thp_gfp_mask(vma);
>>> +    struct mmu_notifier_range range;
>>> +    struct folio *folio = NULL;
>>> +    vm_fault_t ret = 0;
>>> +
>>> +    ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
>>> +                  vmf->address);
>> Just checking: the PTE table was already allocated during the read fault, right?
>> So we don't have to allocate it here.
> 
> Correct, that happens in set_huge_zero_folio(). Thanks for checking.
> 
>>
>>> +    if (ret)
>>> +        goto out;
>>> +
>>> +    mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
>>> +                haddr + HPAGE_PMD_SIZE);
>>> +    mmu_notifier_invalidate_range_start(&range);
>>> +    vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>>> +    if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
>>> +        goto unlock;
>>> +    ret = do_huge_zero_wp_pmd_locked(vmf, haddr, folio);
>>> +    if (!ret)
>>> +        __thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
>>> +unlock:
>>> +    spin_unlock(vmf->ptl);
>>> +    mmu_notifier_invalidate_range_end(&range);
>> I'll confess I don't understand all the mmu notifier rules.
> 
> I confess the same :)
> 
>> But the doc at
>> Documentation/mm/mmu_notifier.rst implies that the notification must be done
>> while holding the PTL, although that's not how wp_page_copy() does it. Are you
>> confident that what you have done is correct?
> 
> Everywhere else, invalidate_range_end() is getting called after dropping the lock,
> one reason is that it has a might_sleep(), and therefore we cannot call it while
> holding a spinlock. I still don't know what exactly these calls mean...but I think
> what I have done is correct.

At a high level, they notify secondary MMUs (e.g. an IOMMU) of a change so that
the tables of those secondary MMUs can be kept in sync. I don't understand all
the ordering requirement details, though.

I think what you have is probably correct; it would be good for someone who
knows what they are talking about to confirm, though :)

> 
>>
>> Thanks,
>> Ryan
>>
>>> +out:
>>> +    return ret;
>>> +}
>>> +
>>>   vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>>>   {
>>>       const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
>>> @@ -1588,8 +1640,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>>>       vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
>>>       VM_BUG_ON_VMA(!vma->anon_vma, vma);
>>>   -    if (is_huge_zero_pmd(orig_pmd))
>>> +    if (is_huge_zero_pmd(orig_pmd)) {
>>> +        vm_fault_t ret = do_huge_zero_wp_pmd(vmf, haddr);
>>> +
>>> +        if (!(ret & VM_FAULT_FALLBACK))
>>> +            return ret;
>>> +
>>> +        /* Fallback to splitting PMD if THP cannot be allocated */
>>>           goto fallback;
>>> +    }
>>>         spin_lock(vmf->ptl);
>>>   diff --git a/mm/memory.c b/mm/memory.c
>>> index 3c01d68065be..c081a25f5173 100644
>>> --- a/mm/memory.c
>>> +++ b/mm/memory.c
>>> @@ -5409,9 +5409,10 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault
>>> *vmf)
>>>       if (vma_is_anonymous(vma)) {
>>>           if (likely(!unshare) &&
>>>               userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) {
>>> -            if (userfaultfd_wp_async(vmf->vma))
>>> +            if (!userfaultfd_wp_async(vmf->vma))
>>> +                return handle_userfault(vmf, VM_UFFD_WP);
>>> +            if (!is_huge_zero_pmd(vmf->orig_pmd))
>>>                   goto split;
>>> -            return handle_userfault(vmf, VM_UFFD_WP);
>>>           }
>>>           return do_huge_pmd_wp_page(vmf);
>>>       }
Dev Jain Sept. 6, 2024, 9 a.m. UTC | #8
On 9/6/24 14:13, Ryan Roberts wrote:
> On 06/09/2024 08:05, Dev Jain wrote:
>> On 9/5/24 18:44, Ryan Roberts wrote:
>>> On 04/09/2024 11:09, Dev Jain wrote:
>>>> Introduce do_huge_zero_wp_pmd() to handle wp-fault on a hugezeropage and
>>>> replace it with a PMD-mapped THP. Change the helpers introduced in the
>>>> previous patch to flush TLB entry corresponding to the hugezeropage,
>>>> and preserve PMD uffd-wp marker. In case of failure, fallback to
>>>> splitting the PMD.
>>>>
>>>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>>>> ---
>>>>    include/linux/huge_mm.h |  6 ++++
>>>>    mm/huge_memory.c        | 79 +++++++++++++++++++++++++++++++++++------
>>>>    mm/memory.c             |  5 +--
>>>>    3 files changed, 78 insertions(+), 12 deletions(-)
>>>>
>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>> index e25d9ebfdf89..fdd2cf473a3c 100644
>>>> --- a/include/linux/huge_mm.h
>>>> +++ b/include/linux/huge_mm.h
>>>> @@ -9,6 +9,12 @@
>>>>    #include <linux/kobject.h>
>>>>      vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
>>>> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>>>> +               unsigned long haddr, struct folio **foliop,
>>>> +               unsigned long addr);
>>>> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>>>> +         struct vm_area_struct *vma, unsigned long haddr,
>>>> +         pgtable_t pgtable);
>>> I don't think you are using either of these outside of huge_memory.c, so not
>>> sure you need to declare them here or make them non-static?
>> As pointed out by Kirill, you are right, I forgot to drop these from my previous
>> approach.
>>
>>>>    int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>>>>              pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
>>>>              struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index 58125fbcc532..150163ad77d3 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -943,9 +943,9 @@ unsigned long thp_get_unmapped_area(struct file *filp,
>>>> unsigned long addr,
>>>>    }
>>>>    EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>>>>    -static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct
>>>> vm_area_struct *vma,
>>>> -                  unsigned long haddr, struct folio **foliop,
>>>> -                  unsigned long addr)
>>>> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>>>> +               unsigned long haddr, struct folio **foliop,
>>>> +               unsigned long addr)
>>>>    {
>>>>        struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
>>>>    @@ -984,21 +984,29 @@ static void __thp_fault_success_stats(struct
>>>> vm_area_struct *vma, int order)
>>>>        count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>>>>    }
>>>>    -static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>>>> -            struct vm_area_struct *vma, unsigned long haddr,
>>>> -            pgtable_t pgtable)
>>>> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>>>> +         struct vm_area_struct *vma, unsigned long haddr,
>>>> +         pgtable_t pgtable)
>>>>    {
>>>> -    pmd_t entry;
>>>> +    pmd_t entry, old_pmd;
>>>> +    bool is_pmd_none = pmd_none(*vmf->pmd);
>>>>          entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
>>>>        entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>>>>        folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>>>>        folio_add_lru_vma(folio, vma);
>>>> -    pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>>>> +    if (!is_pmd_none) {
>>>> +        old_pmd = pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
>>>> +        if (pmd_uffd_wp(old_pmd))
>>>> +            entry = pmd_mkuffd_wp(entry);
>>> I don't really get this; entry is writable, so I wouldn't expect to also be
>>> setting uffd-wp here? That combination is not allowed and is checked for in
>>> page_table_check_pte_flags().
>>>
>>> It looks like you expect to get here in the uffd-wp-async case, which used to
>>> cause the pmd to be split to ptes. I'm guessing (but don't know for sure) that
>>> would cause the uffd-wp bit to be set in each of the new ptes, then during
>>> fallback to handling the wp fault on the pte, uffd would handle it?
>> I guess you are correct; I missed the WARN_ON() in page_table_check_pmd_flags(),
>> but I did see, if I read the uffd code correctly, that mfill_atomic() will just
>> return in case of pmd_trans_huge(*dst_pmd) while doing a uffd_copy to the
>> destination location. So preserving pmd_uffd_wp is useless in case a THP is
>> mapped, but I did not know that in fact it is supposed to be an invalid
>> combination. So, I will drop it, unless someone has some other objection.
> So what's the correct way to handle uffd-wp-async in wp_huge_pmd()? Just split
> it? If so, you can revert your changes in memory.c.

I think so.
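
For concreteness, a minimal sketch of the wp_huge_pmd() branch with the memory.c
hunk of this patch reverted, i.e. the pre-patch behaviour where the wp-async case
is always split (this mirrors the '-' lines in the hunk below and assumes the
surrounding code is otherwise unchanged):

	/* mm/memory.c:wp_huge_pmd(), with this patch's memory.c change reverted;
	 * in async mode the PMD is simply split and the PTE-level uffd-wp
	 * markers drive the subsequent fault handling. */
	if (vma_is_anonymous(vma)) {
		if (likely(!unshare) &&
		    userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) {
			if (userfaultfd_wp_async(vmf->vma))
				goto split;
			return handle_userfault(vmf, VM_UFFD_WP);
		}
		return do_huge_pmd_wp_page(vmf);
	}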

>
>>>> +    }
>>>> +    if (pgtable)
>>>> +        pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>>> Should this call be moved outside of here? It doesn't really feel like it
>>> belongs. Could it be called before calling map_pmd_thp() for the site that has a
>>> pgtable?
>> Every other place I checked, they are doing this: make the entry -> deposit
>> pgtable -> set_pmd_at(). I guess the general flow is to do the deposit based
>> on the old pmd, before setting the new one. Which implies: I should at least
>> move this check before I call pmdp_huge_clear_flush(). And, since vmf->pmd and
>> creating the new entry have no relation, I am inclined to do what you are saying.
> pgtable_trans_huge_deposit() is just adding the pgtable to a list so that if the
> THP needs to be split in future, then we have preallocated the pte pgtable so
> the operation can't fail.

Yes.

> And enqueuing it is just under the protection of the
> PTL as far as I can tell. So I think the only ordering requirement is that you
> both set the pmd and deposit the pgtable under the lock (without dropping it in
> between). So you can deposit either before or after map_pmd_thp().

Yes I'll do that before.

> And
> pmdp_huge_clear_flush() is irrelevant, I think?

You mean, in this context? Everywhere else, the pgtable deposit is done while
the pmd still holds the old value that is about to be replaced, that is, it is
called before set_pmd_at(). So calling the pgtable deposit after clear_flush()
would violate this ordering. I do not think this ordering is really required,
but I'd rather be safe :)
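
A rough sketch of the ordering being agreed on, with the deposit hoisted to the
caller that owns the pgtable (how the next version restructures map_pmd_thp() is
an assumption; the point is only that the deposit and set_pmd_at() both happen
inside the same PTL critical section):

	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	/* ... pmd_same()/stability checks under the lock ... */
	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
	map_pmd_thp(folio, vmf, vma, haddr, NULL);	/* clear/flush old pmd, set_pmd_at() */
	spin_unlock(vmf->ptl);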

>
>>>>        set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>>>>        update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>>>>        add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>>>> -    mm_inc_nr_ptes(vma->vm_mm);
>>>> +    if (is_pmd_none)
>>>> +        mm_inc_nr_ptes(vma->vm_mm);
>>>>    }
>>>>      static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>>>> @@ -1576,6 +1584,50 @@ void huge_pmd_set_accessed(struct vm_fault *vmf)
>>>>        spin_unlock(vmf->ptl);
>>>>    }
>>>>    +static vm_fault_t do_huge_zero_wp_pmd_locked(struct vm_fault *vmf,
>>>> +                         unsigned long haddr,
>>>> +                         struct folio *folio)
>>>> +{
>>>> +    struct vm_area_struct *vma = vmf->vma;
>>>> +    vm_fault_t ret = 0;
>>>> +
>>>> +    ret = check_stable_address_space(vma->vm_mm);
>>>> +    if (ret)
>>>> +        goto out;
>>>> +    map_pmd_thp(folio, vmf, vma, haddr, NULL);
>>>> +out:
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf, unsigned long
>>>> haddr)
>>>> +{
>>>> +    struct vm_area_struct *vma = vmf->vma;
>>>> +    gfp_t gfp = vma_thp_gfp_mask(vma);
>>>> +    struct mmu_notifier_range range;
>>>> +    struct folio *folio = NULL;
>>>> +    vm_fault_t ret = 0;
>>>> +
>>>> +    ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
>>>> +                  vmf->address);
>>> Just checking: the PTE table was already allocated during the read fault, right?
>>> So we don't have to allocate it here.
>> Correct, that happens in set_huge_zero_folio(). Thanks for checking.
>>
>>>> +    if (ret)
>>>> +        goto out;
>>>> +
>>>> +    mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
>>>> +                haddr + HPAGE_PMD_SIZE);
>>>> +    mmu_notifier_invalidate_range_start(&range);
>>>> +    vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>>>> +    if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
>>>> +        goto unlock;
>>>> +    ret = do_huge_zero_wp_pmd_locked(vmf, haddr, folio);
>>>> +    if (!ret)
>>>> +        __thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
>>>> +unlock:
>>>> +    spin_unlock(vmf->ptl);
>>>> +    mmu_notifier_invalidate_range_end(&range);
>>> I'll confess I don't understand all the mmu notifier rules.
>> I confess the same :)
>>
>>> But the doc at
>>> Documentation/mm/mmu_notifier.rst implies that the notification must be done
>>> while holding the PTL. Although that's not how wp_page_copy() does it. Are you confident
>>> what you have done is correct?
>> Everywhere else, invalidate_range_end() is getting called after dropping the lock,
>> one reason is that it has a might_sleep(), and therefore we cannot call it while
>> holding a spinlock. I still don't know what exactly these calls mean...but I think
>> what I have done is correct.
> High level; they are notifying secondary MMUs (e.g. IOMMU) of a change so the
> tables of those secondary MMUs can be kept in sync. I don't understand all the
> ordering requirement details though.
>
> I think what you have is probably correct, would be good for someone that knows
> what they are talking about to confirm though :)

Exactly.
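
For reference, the notifier/PTL ordering being discussed, as used by
do_huge_zero_wp_pmd() in this patch (both notifier calls may sleep, so only the
PTL critical section sits between them):

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
				haddr + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);	/* may sleep */
	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	/* ... replace the huge zero pmd with the new THP mapping ... */
	spin_unlock(vmf->ptl);
	mmu_notifier_invalidate_range_end(&range);	/* may sleep, hence after unlock */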

>
>>> Thanks,
>>> Ryan
>>>
>>>> +out:
>>>> +    return ret;
>>>> +}
>>>> +
>>>>    vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>>>>    {
>>>>        const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
>>>> @@ -1588,8 +1640,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>>>>        vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
>>>>        VM_BUG_ON_VMA(!vma->anon_vma, vma);
>>>>    -    if (is_huge_zero_pmd(orig_pmd))
>>>> +    if (is_huge_zero_pmd(orig_pmd)) {
>>>> +        vm_fault_t ret = do_huge_zero_wp_pmd(vmf, haddr);
>>>> +
>>>> +        if (!(ret & VM_FAULT_FALLBACK))
>>>> +            return ret;
>>>> +
>>>> +        /* Fallback to splitting PMD if THP cannot be allocated */
>>>>            goto fallback;
>>>> +    }
>>>>          spin_lock(vmf->ptl);
>>>>    diff --git a/mm/memory.c b/mm/memory.c
>>>> index 3c01d68065be..c081a25f5173 100644
>>>> --- a/mm/memory.c
>>>> +++ b/mm/memory.c
>>>> @@ -5409,9 +5409,10 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault
>>>> *vmf)
>>>>        if (vma_is_anonymous(vma)) {
>>>>            if (likely(!unshare) &&
>>>>                userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) {
>>>> -            if (userfaultfd_wp_async(vmf->vma))
>>>> +            if (!userfaultfd_wp_async(vmf->vma))
>>>> +                return handle_userfault(vmf, VM_UFFD_WP);
>>>> +            if (!is_huge_zero_pmd(vmf->orig_pmd))
>>>>                    goto split;
>>>> -            return handle_userfault(vmf, VM_UFFD_WP);
>>>>            }
>>>>            return do_huge_pmd_wp_page(vmf);
>>>>        }
Ryan Roberts Sept. 6, 2024, 9:31 a.m. UTC | #9
On 06/09/2024 10:00, Dev Jain wrote:
> 
> On 9/6/24 14:13, Ryan Roberts wrote:
>> On 06/09/2024 08:05, Dev Jain wrote:
>>> On 9/5/24 18:44, Ryan Roberts wrote:
>>>> On 04/09/2024 11:09, Dev Jain wrote:
>>>>> Introduce do_huge_zero_wp_pmd() to handle wp-fault on a hugezeropage and
>>>>> replace it with a PMD-mapped THP. Change the helpers introduced in the
>>>>> previous patch to flush TLB entry corresponding to the hugezeropage,
>>>>> and preserve PMD uffd-wp marker. In case of failure, fallback to
>>>>> splitting the PMD.
>>>>>
>>>>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>>>>> ---
>>>>>    include/linux/huge_mm.h |  6 ++++
>>>>>    mm/huge_memory.c        | 79 +++++++++++++++++++++++++++++++++++------
>>>>>    mm/memory.c             |  5 +--
>>>>>    3 files changed, 78 insertions(+), 12 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>>> index e25d9ebfdf89..fdd2cf473a3c 100644
>>>>> --- a/include/linux/huge_mm.h
>>>>> +++ b/include/linux/huge_mm.h
>>>>> @@ -9,6 +9,12 @@
>>>>>    #include <linux/kobject.h>
>>>>>      vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
>>>>> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>>>>> +               unsigned long haddr, struct folio **foliop,
>>>>> +               unsigned long addr);
>>>>> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>>>>> +         struct vm_area_struct *vma, unsigned long haddr,
>>>>> +         pgtable_t pgtable);
>>>> I don't think you are using either of these outside of huge_memory.c, so not
>>>> sure you need to declare them here or make them non-static?
>>> As pointed out by Kirill, you are right, I forgot to drop these from my previous
>>> approach.
>>>
>>>>>    int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
>>>>>              pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
>>>>>              struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>>> index 58125fbcc532..150163ad77d3 100644
>>>>> --- a/mm/huge_memory.c
>>>>> +++ b/mm/huge_memory.c
>>>>> @@ -943,9 +943,9 @@ unsigned long thp_get_unmapped_area(struct file *filp,
>>>>> unsigned long addr,
>>>>>    }
>>>>>    EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>>>>>    -static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct
>>>>> vm_area_struct *vma,
>>>>> -                  unsigned long haddr, struct folio **foliop,
>>>>> -                  unsigned long addr)
>>>>> +vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
>>>>> +               unsigned long haddr, struct folio **foliop,
>>>>> +               unsigned long addr)
>>>>>    {
>>>>>        struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
>>>>>    @@ -984,21 +984,29 @@ static void __thp_fault_success_stats(struct
>>>>> vm_area_struct *vma, int order)
>>>>>        count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>>>>>    }
>>>>>    -static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>>>>> -            struct vm_area_struct *vma, unsigned long haddr,
>>>>> -            pgtable_t pgtable)
>>>>> +void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>>>>> +         struct vm_area_struct *vma, unsigned long haddr,
>>>>> +         pgtable_t pgtable)
>>>>>    {
>>>>> -    pmd_t entry;
>>>>> +    pmd_t entry, old_pmd;
>>>>> +    bool is_pmd_none = pmd_none(*vmf->pmd);
>>>>>          entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
>>>>>        entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>>>>>        folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>>>>>        folio_add_lru_vma(folio, vma);
>>>>> -    pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>>>>> +    if (!is_pmd_none) {
>>>>> +        old_pmd = pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
>>>>> +        if (pmd_uffd_wp(old_pmd))
>>>>> +            entry = pmd_mkuffd_wp(entry);
>>>> I don't really get this; entry is writable, so I wouldn't expect to also be
>>>> setting uffd-wp here? That combination is not allowed and is checked for in
>>>> page_table_check_pte_flags().
>>>>
>>>> It looks like you expect to get here in the uffd-wp-async case, which used to
>>>> cause the pmd to be split to ptes. I'm guessing (but don't know for sure) that
>>>> would cause the uffd-wp bit to be set in each of the new ptes, then during
>>>> fallback to handling the wp fault on the pte, uffd would handle it?
>>> I guess you are correct; I missed the WARN_ON() in page_table_check_pmd_flags(),
>>> but I did see, if I read the uffd code correctly, that mfill_atomic() will just
>>> return in case of pmd_trans_huge(*dst_pmd) while doing a uffd_copy to the
>>> destination location. So preserving pmd_uffd_wp is useless in case a THP is
>>> mapped, but I did not know that in fact it is supposed to be an invalid
>>> combination. So, I will drop it, unless someone has some other objection.
>> So what's the correct way to handle uffd-wp-async in wp_huge_pmd()? Just split
>> it? If so, you can revert your changes in memory.c.
> 
> I think so.
> 
>>
>>>>> +    }
>>>>> +    if (pgtable)
>>>>> +        pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>>>> Should this call be moved outside of here? It doesn't really feel like it
>>>> belongs. Could it be called before calling map_pmd_thp() for the site that
>>>> has a pgtable?
>>> Every other place I checked, they are doing this: make the entry -> deposit
>>> pgtable -> set_pmd_at(). I guess the general flow is to do the deposit based
>>> on the old pmd, before setting the new one. Which implies: I should at least
>>> move this check before I call pmdp_huge_clear_flush(). And, since vmf->pmd and
>>> creating the new entry have no relation, I am inclined to do what you are saying.
>> pgtable_trans_huge_deposit() is just adding the pgtable to a list so that if the
>> THP needs to be split in future, then we have preallocated the pte pgtable so
>> the operation can't fail.
> 
> Yes.
> 
>> And enqueuing it is just under the protection of the
>> PTL as far as I can tell. So I think the only ordering requirement is that you
>> both set the pmd and deposit the pgtable under the lock (without dropping it in
>> between). So you can deposit either before or after map_pmd_thp().
> 
> Yes I'll do that before.
> 
>> And
>> pmdp_huge_clear_flush() is irrelevant, I think?
> 
> You mean, in this context? Everywhere else, the pgtable deposit is done while
> the pmd still holds the old value that is about to be replaced, that is, it is
> called before set_pmd_at(). So calling the pgtable deposit after clear_flush()
> would violate this ordering. I do not think this ordering is really required,
> but I'd rather be safe :)

The pmd pointer is just used to get the pmd table (the pointer points to an
entry inside the table, so it's just a case of aligning the pointer back down
to the start of the table). The pointer is never dereferenced, so the value of
the entry is irrelevant.
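
To illustrate the "aligning back down", a hedged sketch of how an entry pointer
can be turned back into its containing PMD table page (roughly what the
pmd_pgtable_page()/pmd_ptdesc() helpers do, if I'm reading them right;
pmd_table_page() here is just an illustrative name, not an existing helper):

	/* pmdp points at one pmd_t inside a naturally aligned PMD table, so
	 * masking off the low bits of the pointer recovers the table start. */
	static inline struct page *pmd_table_page(pmd_t *pmdp)
	{
		unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);

		return virt_to_page((void *)((unsigned long)pmdp & mask));
	}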

> 
>>
>>>>>        set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>>>>>        update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>>>>>        add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>>>>> -    mm_inc_nr_ptes(vma->vm_mm);
>>>>> +    if (is_pmd_none)
>>>>> +        mm_inc_nr_ptes(vma->vm_mm);
>>>>>    }
>>>>>      static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>>>>> @@ -1576,6 +1584,50 @@ void huge_pmd_set_accessed(struct vm_fault *vmf)
>>>>>        spin_unlock(vmf->ptl);
>>>>>    }
>>>>>    +static vm_fault_t do_huge_zero_wp_pmd_locked(struct vm_fault *vmf,
>>>>> +                         unsigned long haddr,
>>>>> +                         struct folio *folio)
>>>>> +{
>>>>> +    struct vm_area_struct *vma = vmf->vma;
>>>>> +    vm_fault_t ret = 0;
>>>>> +
>>>>> +    ret = check_stable_address_space(vma->vm_mm);
>>>>> +    if (ret)
>>>>> +        goto out;
>>>>> +    map_pmd_thp(folio, vmf, vma, haddr, NULL);
>>>>> +out:
>>>>> +    return ret;
>>>>> +}
>>>>> +
>>>>> +static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf, unsigned long
>>>>> haddr)
>>>>> +{
>>>>> +    struct vm_area_struct *vma = vmf->vma;
>>>>> +    gfp_t gfp = vma_thp_gfp_mask(vma);
>>>>> +    struct mmu_notifier_range range;
>>>>> +    struct folio *folio = NULL;
>>>>> +    vm_fault_t ret = 0;
>>>>> +
>>>>> +    ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
>>>>> +                  vmf->address);
>>>> Just checking: the PTE table was already allocated during the read fault,
>>>> right?
>>>> So we don't have to allocate it here.
>>> Correct, that happens in set_huge_zero_folio(). Thanks for checking.
>>>
>>>>> +    if (ret)
>>>>> +        goto out;
>>>>> +
>>>>> +    mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
>>>>> +                haddr + HPAGE_PMD_SIZE);
>>>>> +    mmu_notifier_invalidate_range_start(&range);
>>>>> +    vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>>>>> +    if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
>>>>> +        goto unlock;
>>>>> +    ret = do_huge_zero_wp_pmd_locked(vmf, haddr, folio);
>>>>> +    if (!ret)
>>>>> +        __thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
>>>>> +unlock:
>>>>> +    spin_unlock(vmf->ptl);
>>>>> +    mmu_notifier_invalidate_range_end(&range);
>>>> I'll confess I don't understand all the mmu notifier rules.
>>> I confess the same :)
>>>
>>>> But the doc at
>>>> Documentation/mm/mmu_notifier.rst implies that the notification must be done
>>>> while holding the PTL. Although that's not how wp_page_copy() does it. Are
>>>> you confident what you have done is correct?
>>> Everywhere else, invalidate_range_end() is getting called after dropping the
>>> lock, one reason is that it has a might_sleep(), and therefore we cannot call
>>> it while holding a spinlock. I still don't know what exactly these calls
>>> mean...but I think what I have done is correct.
>> High level; they are notifying secondary MMUs (e.g. IOMMU) of a change so the
>> tables of those secondary MMUs can be kept in sync. I don't understand all the
>> ordering requirement details though.
>>
>> I think what you have is probably correct, would be good for someone that knows
>> what they are talking about to confirm though :)
> 
> Exactly.
> 
>>
>>>> Thanks,
>>>> Ryan
>>>>
>>>>> +out:
>>>>> +    return ret;
>>>>> +}
>>>>> +
>>>>>    vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>>>>>    {
>>>>>        const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
>>>>> @@ -1588,8 +1640,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
>>>>>        vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
>>>>>        VM_BUG_ON_VMA(!vma->anon_vma, vma);
>>>>>    -    if (is_huge_zero_pmd(orig_pmd))
>>>>> +    if (is_huge_zero_pmd(orig_pmd)) {
>>>>> +        vm_fault_t ret = do_huge_zero_wp_pmd(vmf, haddr);
>>>>> +
>>>>> +        if (!(ret & VM_FAULT_FALLBACK))
>>>>> +            return ret;
>>>>> +
>>>>> +        /* Fallback to splitting PMD if THP cannot be allocated */
>>>>>            goto fallback;
>>>>> +    }
>>>>>          spin_lock(vmf->ptl);
>>>>>    diff --git a/mm/memory.c b/mm/memory.c
>>>>> index 3c01d68065be..c081a25f5173 100644
>>>>> --- a/mm/memory.c
>>>>> +++ b/mm/memory.c
>>>>> @@ -5409,9 +5409,10 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault
>>>>> *vmf)
>>>>>        if (vma_is_anonymous(vma)) {
>>>>>            if (likely(!unshare) &&
>>>>>                userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) {
>>>>> -            if (userfaultfd_wp_async(vmf->vma))
>>>>> +            if (!userfaultfd_wp_async(vmf->vma))
>>>>> +                return handle_userfault(vmf, VM_UFFD_WP);
>>>>> +            if (!is_huge_zero_pmd(vmf->orig_pmd))
>>>>>                    goto split;
>>>>> -            return handle_userfault(vmf, VM_UFFD_WP);
>>>>>            }
>>>>>            return do_huge_pmd_wp_page(vmf);
>>>>>        }
diff mbox series

Patch

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e25d9ebfdf89..fdd2cf473a3c 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -9,6 +9,12 @@ 
 #include <linux/kobject.h>
 
 vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
+vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
+			   unsigned long haddr, struct folio **foliop,
+			   unsigned long addr);
+void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
+		 struct vm_area_struct *vma, unsigned long haddr,
+		 pgtable_t pgtable);
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
 		  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 58125fbcc532..150163ad77d3 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -943,9 +943,9 @@  unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
 }
 EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
 
-static vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
-				  unsigned long haddr, struct folio **foliop,
-				  unsigned long addr)
+vm_fault_t thp_fault_alloc(gfp_t gfp, int order, struct vm_area_struct *vma,
+			   unsigned long haddr, struct folio **foliop,
+			   unsigned long addr)
 {
 	struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
 
@@ -984,21 +984,29 @@  static void __thp_fault_success_stats(struct vm_area_struct *vma, int order)
 	count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
 }
 
-static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
-			struct vm_area_struct *vma, unsigned long haddr,
-			pgtable_t pgtable)
+void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
+		 struct vm_area_struct *vma, unsigned long haddr,
+		 pgtable_t pgtable)
 {
-	pmd_t entry;
+	pmd_t entry, old_pmd;
+	bool is_pmd_none = pmd_none(*vmf->pmd);
 
 	entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
 	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 	folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
 	folio_add_lru_vma(folio, vma);
-	pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
+	if (!is_pmd_none) {
+		old_pmd = pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
+		if (pmd_uffd_wp(old_pmd))
+			entry = pmd_mkuffd_wp(entry);
+	}
+	if (pgtable)
+		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
 	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
 	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
 	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-	mm_inc_nr_ptes(vma->vm_mm);
+	if (is_pmd_none)
+		mm_inc_nr_ptes(vma->vm_mm);
 }
 
 static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
@@ -1576,6 +1584,50 @@  void huge_pmd_set_accessed(struct vm_fault *vmf)
 	spin_unlock(vmf->ptl);
 }
 
+static vm_fault_t do_huge_zero_wp_pmd_locked(struct vm_fault *vmf,
+					     unsigned long haddr,
+					     struct folio *folio)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	vm_fault_t ret = 0;
+
+	ret = check_stable_address_space(vma->vm_mm);
+	if (ret)
+		goto out;
+	map_pmd_thp(folio, vmf, vma, haddr, NULL);
+out:
+	return ret;
+}
+
+static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf, unsigned long haddr)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	gfp_t gfp = vma_thp_gfp_mask(vma);
+	struct mmu_notifier_range range;
+	struct folio *folio = NULL;
+	vm_fault_t ret = 0;
+
+	ret = thp_fault_alloc(gfp, HPAGE_PMD_ORDER, vma, haddr, &folio,
+			      vmf->address);
+	if (ret)
+		goto out;
+
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
+				haddr + HPAGE_PMD_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
+	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+	if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
+		goto unlock;
+	ret = do_huge_zero_wp_pmd_locked(vmf, haddr, folio);
+	if (!ret)
+		__thp_fault_success_stats(vma, HPAGE_PMD_ORDER);
+unlock:
+	spin_unlock(vmf->ptl);
+	mmu_notifier_invalidate_range_end(&range);
+out:
+	return ret;
+}
+
 vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
 {
 	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
@@ -1588,8 +1640,15 @@  vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
 	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
 	VM_BUG_ON_VMA(!vma->anon_vma, vma);
 
-	if (is_huge_zero_pmd(orig_pmd))
+	if (is_huge_zero_pmd(orig_pmd)) {
+		vm_fault_t ret = do_huge_zero_wp_pmd(vmf, haddr);
+
+		if (!(ret & VM_FAULT_FALLBACK))
+			return ret;
+
+		/* Fallback to splitting PMD if THP cannot be allocated */
 		goto fallback;
+	}
 
 	spin_lock(vmf->ptl);
 
diff --git a/mm/memory.c b/mm/memory.c
index 3c01d68065be..c081a25f5173 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5409,9 +5409,10 @@  static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
 	if (vma_is_anonymous(vma)) {
 		if (likely(!unshare) &&
 		    userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) {
-			if (userfaultfd_wp_async(vmf->vma))
+			if (!userfaultfd_wp_async(vmf->vma))
+				return handle_userfault(vmf, VM_UFFD_WP);
+			if (!is_huge_zero_pmd(vmf->orig_pmd))
 				goto split;
-			return handle_userfault(vmf, VM_UFFD_WP);
 		}
 		return do_huge_pmd_wp_page(vmf);
 	}