diff mbox series

[v3,2/2] mm: Allocate THP on hugezeropage wp-fault

Message ID 20240911065600.1002644-3-dev.jain@arm.com (mailing list archive)
State New, archived
Headers show
Series Do not shatter hugezeropage on wp-fault | expand

Commit Message

Dev Jain Sept. 11, 2024, 6:56 a.m. UTC
Introduce do_huge_zero_wp_pmd() to handle a wp-fault on a hugezeropage and
replace it with a PMD-mapped THP. Change the helper introduced in the
previous patch to flush the TLB entry corresponding to the hugezeropage.
In case of failure, fall back to splitting the PMD.

Signed-off-by: Dev Jain <dev.jain@arm.com>
---
 mm/huge_memory.c | 52 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 49 insertions(+), 3 deletions(-)

Comments

David Hildenbrand Sept. 11, 2024, 9:36 a.m. UTC | #1
On 11.09.24 08:56, Dev Jain wrote:
> Introduce do_huge_zero_wp_pmd() to handle wp-fault on a hugezeropage and
> replace it with a PMD-mapped THP. Change the helper introduced in the
> previous patch to flush TLB entry corresponding to the hugezeropage.
> In case of failure, fallback to splitting the PMD.
> 
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> ---
>   mm/huge_memory.c | 52 +++++++++++++++++++++++++++++++++++++++++++++---
>   1 file changed, 49 insertions(+), 3 deletions(-)
> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index b96a1ff2bf40..3e28946a805f 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -987,16 +987,20 @@ static void __pmd_thp_fault_success_stats(struct vm_area_struct *vma)
>   static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>   			struct vm_area_struct *vma, unsigned long haddr)
>   {
> -	pmd_t entry;
> +	pmd_t entry, old_pmd;
> +	bool is_pmd_none = pmd_none(*vmf->pmd);
>   
>   	entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
>   	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>   	folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>   	folio_add_lru_vma(folio, vma);
> +	if (!is_pmd_none)
> +		old_pmd = pmdp_huge_clear_flush(vma, haddr, vmf->pmd);

This should likely be done in the caller.

>   	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>   	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>   	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
> -	mm_inc_nr_ptes(vma->vm_mm);
> +	if (is_pmd_none)
> +		mm_inc_nr_ptes(vma->vm_mm);

And this as well.

No need to make this function deal with this if the callers exactly know 
what they are doing.

>   }
>   
>   static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
> @@ -1576,6 +1580,41 @@ void huge_pmd_set_accessed(struct vm_fault *vmf)
>   	spin_unlock(vmf->ptl);
>   }
>   
> +static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf, unsigned long haddr)

Is there a need to pass in "haddr" if we have the vmf?

> +{
> +	struct vm_area_struct *vma = vmf->vma;
> +	gfp_t gfp = vma_thp_gfp_mask(vma);
> +	struct mmu_notifier_range range;
> +	struct folio *folio;
> +	vm_fault_t ret = 0;
> +
> +	folio = pmd_thp_fault_alloc(gfp, vma, haddr, vmf->address);
> +	if (unlikely(!folio)) {
> +		ret = VM_FAULT_FALLBACK;
> +		goto out;
> +	}
> +
> +	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
> +				haddr + HPAGE_PMD_SIZE);
> +	mmu_notifier_invalidate_range_start(&range);
> +	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
> +	if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
> +		goto release;
> +	ret = check_stable_address_space(vma->vm_mm);
> +	if (ret)
> +		goto release;

The clear+flush really belongs here.

> +	map_pmd_thp(folio, vmf, vma, haddr);
> +	__pmd_thp_fault_success_stats(vma);
> +	goto unlock;
> +release:
> +	folio_put(folio);
> +unlock:
> +	spin_unlock(vmf->ptl);
> +	mmu_notifier_invalidate_range_end(&range);
> +out:
> +	return ret;
> +}
> +
Dev Jain Sept. 11, 2024, 12:10 p.m. UTC | #2
On 9/11/24 15:06, David Hildenbrand wrote:
> On 11.09.24 08:56, Dev Jain wrote:
>> Introduce do_huge_zero_wp_pmd() to handle wp-fault on a hugezeropage and
>> replace it with a PMD-mapped THP. Change the helper introduced in the
>> previous patch to flush TLB entry corresponding to the hugezeropage.
>> In case of failure, fallback to splitting the PMD.
>>
>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>> ---
>>   mm/huge_memory.c | 52 +++++++++++++++++++++++++++++++++++++++++++++---
>>   1 file changed, 49 insertions(+), 3 deletions(-)
>>
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index b96a1ff2bf40..3e28946a805f 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -987,16 +987,20 @@ static void 
>> __pmd_thp_fault_success_stats(struct vm_area_struct *vma)
>>   static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
>>               struct vm_area_struct *vma, unsigned long haddr)
>>   {
>> -    pmd_t entry;
>> +    pmd_t entry, old_pmd;
>> +    bool is_pmd_none = pmd_none(*vmf->pmd);
>>         entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
>>       entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>>       folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>>       folio_add_lru_vma(folio, vma);
>> +    if (!is_pmd_none)
>> +        old_pmd = pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
>
> This should likely be done in the caller.
>
>>       set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>>       update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>>       add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>> -    mm_inc_nr_ptes(vma->vm_mm);
>> +    if (is_pmd_none)
>> +        mm_inc_nr_ptes(vma->vm_mm);
>
> And this as well.
>
> No need to make this function deal with this if the callers exactly 
> know what they are doing.

Sure, thanks.


>
>>   }
>>     static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>> @@ -1576,6 +1580,41 @@ void huge_pmd_set_accessed(struct vm_fault *vmf)
>>       spin_unlock(vmf->ptl);
>>   }
>>   +static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf, 
>> unsigned long haddr)
>
> Is there a need to pass in "haddr" if we have the vmf?

I was passing it because it is used many times. But nowhere in the codebase
are both vmf and haddr passed together, so I'll drop it for cleanliness and
consistency.

>
>> +{
>> +    struct vm_area_struct *vma = vmf->vma;
>> +    gfp_t gfp = vma_thp_gfp_mask(vma);
>> +    struct mmu_notifier_range range;
>> +    struct folio *folio;
>> +    vm_fault_t ret = 0;
>> +
>> +    folio = pmd_thp_fault_alloc(gfp, vma, haddr, vmf->address);
>> +    if (unlikely(!folio)) {
>> +        ret = VM_FAULT_FALLBACK;
>> +        goto out;
>> +    }
>> +
>> +    mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, 
>> haddr,
>> +                haddr + HPAGE_PMD_SIZE);
>> +    mmu_notifier_invalidate_range_start(&range);
>> +    vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>> +    if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
>> +        goto release;
>> +    ret = check_stable_address_space(vma->vm_mm);
>> +    if (ret)
>> +        goto release;
>
> The clear+flush really belongs here.
>
>> +    map_pmd_thp(folio, vmf, vma, haddr);
>> +    __pmd_thp_fault_success_stats(vma);
>> +    goto unlock;
>> +release:
>> +    folio_put(folio);
>> +unlock:
>> +    spin_unlock(vmf->ptl);
>> +    mmu_notifier_invalidate_range_end(&range);
>> +out:
>> +    return ret;
>> +}
>> +
>
David Hildenbrand Sept. 11, 2024, 12:36 p.m. UTC | #3
>>
>>>    }
>>>      static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>>> @@ -1576,6 +1580,41 @@ void huge_pmd_set_accessed(struct vm_fault *vmf)
>>>        spin_unlock(vmf->ptl);
>>>    }
>>>    +static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf,
>>> unsigned long haddr)
>>
>> Is there a need to pass in "haddr" if we have the vmf?
> 
> Was passing it because it was getting used many times. But nowhere do vmf
> and haddr get both passed in the codebase, so I'll drop it for
> cleanliness and
> consistency.

Yes, the masking is very cheap.
kernel test robot Sept. 12, 2024, 3:44 p.m. UTC | #4
Hi Dev,

kernel test robot noticed the following build warnings:

[auto build test WARNING on v6.11-rc7]
[also build test WARNING on linus/master]
[cannot apply to akpm-mm/mm-everything next-20240912]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting a patch, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Dev-Jain/mm-Abstract-THP-allocation/20240911-145809
base:   v6.11-rc7
patch link:    https://lore.kernel.org/r/20240911065600.1002644-3-dev.jain%40arm.com
patch subject: [PATCH v3 2/2] mm: Allocate THP on hugezeropage wp-fault
config: x86_64-rhel-8.3-rust (https://download.01.org/0day-ci/archive/20240912/202409122349.PQp7sq2x-lkp@intel.com/config)
compiler: clang version 18.1.8 (https://github.com/llvm/llvm-project 3b5b5c1ec4a3095ab096dd780e84d7ab81f3d7ff)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240912/202409122349.PQp7sq2x-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202409122349.PQp7sq2x-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> mm/huge_memory.c:990:15: warning: variable 'old_pmd' set but not used [-Wunused-but-set-variable]
     990 |         pmd_t entry, old_pmd;
         |                      ^
   mm/huge_memory.c:1016:6: warning: variable 'pgtable' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized]
    1016 |         if (unlikely(!folio)) {
         |             ^~~~~~~~~~~~~~~~
   include/linux/compiler.h:77:22: note: expanded from macro 'unlikely'
      77 | # define unlikely(x)    __builtin_expect(!!(x), 0)
         |                         ^~~~~~~~~~~~~~~~~~~~~~~~~~
   mm/huge_memory.c:1055:6: note: uninitialized use occurs here
    1055 |         if (pgtable)
         |             ^~~~~~~
   mm/huge_memory.c:1016:2: note: remove the 'if' if its condition is always false
    1016 |         if (unlikely(!folio)) {
         |         ^~~~~~~~~~~~~~~~~~~~~~~
    1017 |                 ret = VM_FAULT_FALLBACK;
         |                 ~~~~~~~~~~~~~~~~~~~~~~~~
    1018 |                 goto release;
         |                 ~~~~~~~~~~~~~
    1019 |         }
         |         ~
   mm/huge_memory.c:1010:19: note: initialize the variable 'pgtable' to silence this warning
    1010 |         pgtable_t pgtable;
         |                          ^
         |                           = NULL
   2 warnings generated.


vim +/old_pmd +990 mm/huge_memory.c

   986	
   987	static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
   988				struct vm_area_struct *vma, unsigned long haddr)
   989	{
 > 990		pmd_t entry, old_pmd;
   991		bool is_pmd_none = pmd_none(*vmf->pmd);
   992	
   993		entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
   994		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
   995		folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
   996		folio_add_lru_vma(folio, vma);
   997		if (!is_pmd_none)
   998			old_pmd = pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
   999		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
  1000		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
  1001		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
  1002		if (is_pmd_none)
  1003			mm_inc_nr_ptes(vma->vm_mm);
  1004	}
  1005
diff mbox series

Patch

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b96a1ff2bf40..3e28946a805f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -987,16 +987,20 @@  static void __pmd_thp_fault_success_stats(struct vm_area_struct *vma)
 static void map_pmd_thp(struct folio *folio, struct vm_fault *vmf,
 			struct vm_area_struct *vma, unsigned long haddr)
 {
-	pmd_t entry;
+	pmd_t entry, old_pmd;
+	bool is_pmd_none = pmd_none(*vmf->pmd);
 
 	entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
 	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 	folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
 	folio_add_lru_vma(folio, vma);
+	if (!is_pmd_none)
+		old_pmd = pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
 	set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
 	update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
 	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-	mm_inc_nr_ptes(vma->vm_mm);
+	if (is_pmd_none)
+		mm_inc_nr_ptes(vma->vm_mm);
 }
 
 static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
@@ -1576,6 +1580,41 @@  void huge_pmd_set_accessed(struct vm_fault *vmf)
 	spin_unlock(vmf->ptl);
 }
 
+static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf, unsigned long haddr)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	gfp_t gfp = vma_thp_gfp_mask(vma);
+	struct mmu_notifier_range range;
+	struct folio *folio;
+	vm_fault_t ret = 0;
+
+	folio = pmd_thp_fault_alloc(gfp, vma, haddr, vmf->address);
+	if (unlikely(!folio)) {
+		ret = VM_FAULT_FALLBACK;
+		goto out;
+	}
+
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
+				haddr + HPAGE_PMD_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
+	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+	if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
+		goto release;
+	ret = check_stable_address_space(vma->vm_mm);
+	if (ret)
+		goto release;
+	map_pmd_thp(folio, vmf, vma, haddr);
+	__pmd_thp_fault_success_stats(vma);
+	goto unlock;
+release:
+	folio_put(folio);
+unlock:
+	spin_unlock(vmf->ptl);
+	mmu_notifier_invalidate_range_end(&range);
+out:
+	return ret;
+}
+
 vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
 {
 	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
@@ -1588,8 +1627,15 @@  vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
 	vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
 	VM_BUG_ON_VMA(!vma->anon_vma, vma);
 
-	if (is_huge_zero_pmd(orig_pmd))
+	if (is_huge_zero_pmd(orig_pmd)) {
+		vm_fault_t ret = do_huge_zero_wp_pmd(vmf, haddr);
+
+		if (!(ret & VM_FAULT_FALLBACK))
+			return ret;
+
+		/* Fallback to splitting PMD if THP cannot be allocated */
 		goto fallback;
+	}
 
 	spin_lock(vmf->ptl);