[2/8] mm: memory: extend finish_fault() to support large folio

Message ID e3f4ae78ef2d565a65fadaa843e53a24bf5b57e4.1714978902.git.baolin.wang@linux.alibaba.com (mailing list archive)
State New
Series add mTHP support for anonymous shmem

Commit Message

Baolin Wang May 6, 2024, 8:46 a.m. UTC
Add large folio mapping establishment support to finish_fault(), as a
preparation for supporting multi-size THP allocation of anonymous shmem pages
in the following patches.

Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
---
 mm/memory.c | 43 +++++++++++++++++++++++++++++++++----------
 1 file changed, 33 insertions(+), 10 deletions(-)

Comments

Ryan Roberts May 7, 2024, 10:37 a.m. UTC | #1
On 06/05/2024 09:46, Baolin Wang wrote:
> Add large folio mapping establishment support for finish_fault() as a preparation,
> to support multi-size THP allocation of anonymous shmem pages in the following
> patches.
> 
> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> ---
>  mm/memory.c | 43 +++++++++++++++++++++++++++++++++----------
>  1 file changed, 33 insertions(+), 10 deletions(-)
> 
> diff --git a/mm/memory.c b/mm/memory.c
> index eea6e4984eae..936377220b77 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4747,9 +4747,12 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>  {
>  	struct vm_area_struct *vma = vmf->vma;
>  	struct page *page;
> +	struct folio *folio;
>  	vm_fault_t ret;
>  	bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
>  		      !(vma->vm_flags & VM_SHARED);
> +	int type, nr_pages, i;
> +	unsigned long addr = vmf->address;
>  
>  	/* Did we COW the page? */
>  	if (is_cow)
> @@ -4780,24 +4783,44 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>  			return VM_FAULT_OOM;
>  	}
>  
> +	folio = page_folio(page);
> +	nr_pages = folio_nr_pages(folio);
> +
> +	if (unlikely(userfaultfd_armed(vma))) {
> +		nr_pages = 1;
> +	} else if (nr_pages > 1) {
> +		unsigned long start = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
> +		unsigned long end = start + nr_pages * PAGE_SIZE;
> +
> +		/* In case the folio size in page cache beyond the VMA limits. */
> +		addr = max(start, vma->vm_start);
> +		nr_pages = (min(end, vma->vm_end) - addr) >> PAGE_SHIFT;
> +
> +		page = folio_page(folio, (addr - start) >> PAGE_SHIFT);

I still don't really follow the logic in this else if block. Isn't it possible
that finish_fault() gets called with a page from a folio that isn't aligned with
vmf->address?

For example, let's say we have a file whose size is 64K and which is cached in a
single large folio in the page cache. But the file is mapped into a process at
VA 16K to 80K. Let's say we fault on the first page (VA=16K). You will calculate
start=0 and end=64K I think?
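
Working that scenario through the hunk above (a quick user-space sketch of the
arithmetic only; the 4K page size and the addresses are just the illustrative
numbers from my example, not taken from a real run):

/*
 * User-space model of the start/end/addr/nr_pages calculation above for a
 * 64K folio, a VMA spanning 16K..80K, and a fault at VA 16K.
 */
#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define ALIGN_DOWN(x, a)        ((x) & ~((a) - 1))
#define min(a, b)       ((a) < (b) ? (a) : (b))
#define max(a, b)       ((a) > (b) ? (a) : (b))

int main(void)
{
        unsigned long vm_start = 16UL << 10, vm_end = 80UL << 10;
        unsigned long address = 16UL << 10;     /* fault on the first page */
        unsigned long nr_pages = 16;            /* 64K folio */
        unsigned long start, end, addr;

        start = ALIGN_DOWN(address, nr_pages * PAGE_SIZE);      /* 0   */
        end = start + nr_pages * PAGE_SIZE;                     /* 64K */
        addr = max(start, vm_start);                            /* 16K */
        nr_pages = (min(end, vm_end) - addr) >> PAGE_SHIFT;     /* 12  */

        /*
         * folio_page(folio, (addr - start) >> PAGE_SHIFT) would then pick
         * page 4 of the folio, although VA 16K backs file offset 0.
         */
        printf("start=%luK end=%luK addr=%luK nr_pages=%lu folio index=%lu\n",
               start >> 10, end >> 10, addr >> 10, nr_pages,
               (addr - start) >> PAGE_SHIFT);
        return 0;
}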

Additionally, I think this path will end up mapping the entire folio (as long as
it fits in the VMA). But this bypasses the fault-around configuration. As I
think I mentioned against the RFC, this will inflate the RSS of the process and
can cause behavioural changes as a result. I believe the current advice is to
disable fault-around to prevent this kind of bloat when needed.

It might be that you need a special variant of finish_fault() for shmem?


> +	}
>  	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
> -				      vmf->address, &vmf->ptl);
> +				       addr, &vmf->ptl);
>  	if (!vmf->pte)
>  		return VM_FAULT_NOPAGE;
>  
>  	/* Re-check under ptl */
> -	if (likely(!vmf_pte_changed(vmf))) {
> -		struct folio *folio = page_folio(page);
> -		int type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
> -
> -		set_pte_range(vmf, folio, page, 1, vmf->address);
> -		add_mm_counter(vma->vm_mm, type, 1);
> -		ret = 0;
> -	} else {
> -		update_mmu_tlb(vma, vmf->address, vmf->pte);
> +	if (nr_pages == 1 && unlikely(vmf_pte_changed(vmf))) {
> +		update_mmu_tlb(vma, addr, vmf->pte);
> +		ret = VM_FAULT_NOPAGE;
> +		goto unlock;
> +	} else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
> +		for (i = 0; i < nr_pages; i++)
> +			update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i);
>  		ret = VM_FAULT_NOPAGE;
> +		goto unlock;
>  	}
>  
> +	set_pte_range(vmf, folio, page, nr_pages, addr);
> +	type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
> +	add_mm_counter(vma->vm_mm, type, nr_pages);
> +	ret = 0;
> +
> +unlock:
>  	pte_unmap_unlock(vmf->pte, vmf->ptl);
>  	return ret;
>  }
Baolin Wang May 8, 2024, 3:44 a.m. UTC | #2
On 2024/5/7 18:37, Ryan Roberts wrote:
> On 06/05/2024 09:46, Baolin Wang wrote:
>> Add large folio mapping establishment support for finish_fault() as a preparation,
>> to support multi-size THP allocation of anonymous shmem pages in the following
>> patches.
>>
>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>> ---
>>   mm/memory.c | 43 +++++++++++++++++++++++++++++++++----------
>>   1 file changed, 33 insertions(+), 10 deletions(-)
>>
>> diff --git a/mm/memory.c b/mm/memory.c
>> index eea6e4984eae..936377220b77 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -4747,9 +4747,12 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>   {
>>   	struct vm_area_struct *vma = vmf->vma;
>>   	struct page *page;
>> +	struct folio *folio;
>>   	vm_fault_t ret;
>>   	bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
>>   		      !(vma->vm_flags & VM_SHARED);
>> +	int type, nr_pages, i;
>> +	unsigned long addr = vmf->address;
>>   
>>   	/* Did we COW the page? */
>>   	if (is_cow)
>> @@ -4780,24 +4783,44 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>   			return VM_FAULT_OOM;
>>   	}
>>   
>> +	folio = page_folio(page);
>> +	nr_pages = folio_nr_pages(folio);
>> +
>> +	if (unlikely(userfaultfd_armed(vma))) {
>> +		nr_pages = 1;
>> +	} else if (nr_pages > 1) {
>> +		unsigned long start = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
>> +		unsigned long end = start + nr_pages * PAGE_SIZE;
>> +
>> +		/* In case the folio size in page cache beyond the VMA limits. */
>> +		addr = max(start, vma->vm_start);
>> +		nr_pages = (min(end, vma->vm_end) - addr) >> PAGE_SHIFT;
>> +
>> +		page = folio_page(folio, (addr - start) >> PAGE_SHIFT);
> 
> I still don't really follow the logic in this else if block. Isn't it possible
> that finish_fault() gets called with a page from a folio that isn't aligned with
> vmf->address?
> 
> For example, let's say we have a file who's size is 64K and which is cached in a
> single large folio in the page cache. But the file is mapped into a process at
> VA 16K to 80K. Let's say we fault on the first page (VA=16K). You will calculate

For shmem, this doesn't happen because the VA is aligned with the 
hugepage size in the shmem_get_unmapped_area() function. See patch 7.

> start=0 and end=64K I think?

Yes. Unfortunately, some file systems that support large mappings do not 
perform alignment for multi-size THP (non-PMD sized, for example: 64K). 
I think this requires modification to 
__get_unmapped_area--->thp_get_unmapped_area_vmflags() or 
file->f_op->get_unmapped_area() to align VA for multi-size THP in future.

So before adding those VA alignment changes, only allow building the 
large folio mapping for anonymous shmem:

diff --git a/mm/memory.c b/mm/memory.c
index 936377220b77..9e4d51826d23 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4786,7 +4786,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
         folio = page_folio(page);
         nr_pages = folio_nr_pages(folio);

-       if (unlikely(userfaultfd_armed(vma))) {
+       if (unlikely(userfaultfd_armed(vma)) || !vma_is_anon_shmem(vma)) {
                 nr_pages = 1;
         } else if (nr_pages > 1) {
                unsigned long start = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);

> Additionally, I think this path will end up mapping the entire folio (as long as
> it fits in the VMA). But this bypasses the fault-around configuration. As I
> think I mentioned against the RFC, this will inflate the RSS of the process and
> can cause behavioural changes as a result. I believe the current advice is to
> disable fault-around to prevent this kind of bloat when needed.

With the above change, I do not think this is a problem, since users already 
want to use mTHP for anonymous shmem.

> It might be that you need a special variant of finish_fault() for shmem?
> 
> 
>> +	}
>>   	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
>> -				      vmf->address, &vmf->ptl);
>> +				       addr, &vmf->ptl);
>>   	if (!vmf->pte)
>>   		return VM_FAULT_NOPAGE;
>>   
>>   	/* Re-check under ptl */
>> -	if (likely(!vmf_pte_changed(vmf))) {
>> -		struct folio *folio = page_folio(page);
>> -		int type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
>> -
>> -		set_pte_range(vmf, folio, page, 1, vmf->address);
>> -		add_mm_counter(vma->vm_mm, type, 1);
>> -		ret = 0;
>> -	} else {
>> -		update_mmu_tlb(vma, vmf->address, vmf->pte);
>> +	if (nr_pages == 1 && unlikely(vmf_pte_changed(vmf))) {
>> +		update_mmu_tlb(vma, addr, vmf->pte);
>> +		ret = VM_FAULT_NOPAGE;
>> +		goto unlock;
>> +	} else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
>> +		for (i = 0; i < nr_pages; i++)
>> +			update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i);
>>   		ret = VM_FAULT_NOPAGE;
>> +		goto unlock;
>>   	}
>>   
>> +	set_pte_range(vmf, folio, page, nr_pages, addr);
>> +	type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
>> +	add_mm_counter(vma->vm_mm, type, nr_pages);
>> +	ret = 0;
>> +
>> +unlock:
>>   	pte_unmap_unlock(vmf->pte, vmf->ptl);
>>   	return ret;
>>   }
David Hildenbrand May 8, 2024, 7:15 a.m. UTC | #3
On 08.05.24 05:44, Baolin Wang wrote:
> 
> 
> On 2024/5/7 18:37, Ryan Roberts wrote:
>> On 06/05/2024 09:46, Baolin Wang wrote:
>>> Add large folio mapping establishment support for finish_fault() as a preparation,
>>> to support multi-size THP allocation of anonymous shmem pages in the following
>>> patches.
>>>
>>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>>> ---
>>>    mm/memory.c | 43 +++++++++++++++++++++++++++++++++----------
>>>    1 file changed, 33 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/mm/memory.c b/mm/memory.c
>>> index eea6e4984eae..936377220b77 100644
>>> --- a/mm/memory.c
>>> +++ b/mm/memory.c
>>> @@ -4747,9 +4747,12 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>>    {
>>>    	struct vm_area_struct *vma = vmf->vma;
>>>    	struct page *page;
>>> +	struct folio *folio;
>>>    	vm_fault_t ret;
>>>    	bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
>>>    		      !(vma->vm_flags & VM_SHARED);
>>> +	int type, nr_pages, i;
>>> +	unsigned long addr = vmf->address;
>>>    
>>>    	/* Did we COW the page? */
>>>    	if (is_cow)
>>> @@ -4780,24 +4783,44 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>>    			return VM_FAULT_OOM;
>>>    	}
>>>    
>>> +	folio = page_folio(page);
>>> +	nr_pages = folio_nr_pages(folio);
>>> +
>>> +	if (unlikely(userfaultfd_armed(vma))) {
>>> +		nr_pages = 1;
>>> +	} else if (nr_pages > 1) {
>>> +		unsigned long start = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
>>> +		unsigned long end = start + nr_pages * PAGE_SIZE;
>>> +
>>> +		/* In case the folio size in page cache beyond the VMA limits. */
>>> +		addr = max(start, vma->vm_start);
>>> +		nr_pages = (min(end, vma->vm_end) - addr) >> PAGE_SHIFT;
>>> +
>>> +		page = folio_page(folio, (addr - start) >> PAGE_SHIFT);
>>
>> I still don't really follow the logic in this else if block. Isn't it possible
>> that finish_fault() gets called with a page from a folio that isn't aligned with
>> vmf->address?
>>
>> For example, let's say we have a file who's size is 64K and which is cached in a
>> single large folio in the page cache. But the file is mapped into a process at
>> VA 16K to 80K. Let's say we fault on the first page (VA=16K). You will calculate
> 
> For shmem, this doesn't happen because the VA is aligned with the
> hugepage size in the shmem_get_unmapped_area() function. See patch 7.

Does that cover mremap() and MAP_FIXED as well?

We should try doing this as cleanly as possible, to prepare for the 
future / corner cases.
Ryan Roberts May 8, 2024, 8:53 a.m. UTC | #4
On 08/05/2024 04:44, Baolin Wang wrote:
> 
> 
> On 2024/5/7 18:37, Ryan Roberts wrote:
>> On 06/05/2024 09:46, Baolin Wang wrote:
>>> Add large folio mapping establishment support for finish_fault() as a
>>> preparation,
>>> to support multi-size THP allocation of anonymous shmem pages in the following
>>> patches.
>>>
>>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>>> ---
>>>   mm/memory.c | 43 +++++++++++++++++++++++++++++++++----------
>>>   1 file changed, 33 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/mm/memory.c b/mm/memory.c
>>> index eea6e4984eae..936377220b77 100644
>>> --- a/mm/memory.c
>>> +++ b/mm/memory.c
>>> @@ -4747,9 +4747,12 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>>   {
>>>       struct vm_area_struct *vma = vmf->vma;
>>>       struct page *page;
>>> +    struct folio *folio;
>>>       vm_fault_t ret;
>>>       bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
>>>                 !(vma->vm_flags & VM_SHARED);
>>> +    int type, nr_pages, i;
>>> +    unsigned long addr = vmf->address;
>>>         /* Did we COW the page? */
>>>       if (is_cow)
>>> @@ -4780,24 +4783,44 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>>               return VM_FAULT_OOM;
>>>       }
>>>   +    folio = page_folio(page);
>>> +    nr_pages = folio_nr_pages(folio);
>>> +
>>> +    if (unlikely(userfaultfd_armed(vma))) {
>>> +        nr_pages = 1;
>>> +    } else if (nr_pages > 1) {
>>> +        unsigned long start = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
>>> +        unsigned long end = start + nr_pages * PAGE_SIZE;
>>> +
>>> +        /* In case the folio size in page cache beyond the VMA limits. */
>>> +        addr = max(start, vma->vm_start);
>>> +        nr_pages = (min(end, vma->vm_end) - addr) >> PAGE_SHIFT;
>>> +
>>> +        page = folio_page(folio, (addr - start) >> PAGE_SHIFT);
>>
>> I still don't really follow the logic in this else if block. Isn't it possible
>> that finish_fault() gets called with a page from a folio that isn't aligned with
>> vmf->address?
>>
>> For example, let's say we have a file who's size is 64K and which is cached in a
>> single large folio in the page cache. But the file is mapped into a process at
>> VA 16K to 80K. Let's say we fault on the first page (VA=16K). You will calculate
> 
> For shmem, this doesn't happen because the VA is aligned with the hugepage size
> in the shmem_get_unmapped_area() function. See patch 7.

Certainly agree that shmem can always make sure that it packs a vma in a way
such that its folios are naturally aligned in VA when faulting in memory. If you
mremap it, that alignment will be lost; I don't think that would be a problem
for a single process; mremap will take care of moving the ptes correctly and
this path is not involved.

But what about the case when a process mmaps a shmem region, then forks, then
the child mremaps the shmem region. Then the parent faults in a THP into the
region (nicely aligned). Then the child faults in the same offset in the region
and gets the THP that the parent allocated; that THP will be aligned in the
parent's VM space but not in the child's.

> 
>> start=0 and end=64K I think?
> 
> Yes. Unfortunately, some file systems that support large mappings do not perform
> alignment for multi-size THP (non-PMD sized, for example: 64K). I think this
> requires modification to __get_unmapped_area--->thp_get_unmapped_area_vmflags()
> or file->f_op->get_unmapped_area() to align VA for multi-size THP in future.

By nature of the fact that a file mapping is shared between multiple processes
and each process can map it wherever it wants down to 1 page granularity, it's
impossible for any THP containing a part of that file to be VA-aligned in every
process it is mapped in.

> 
> So before adding that VA alignment changes, only allow building the large folio
> mapping for anonymous shmem:
> 
> diff --git a/mm/memory.c b/mm/memory.c
> index 936377220b77..9e4d51826d23 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4786,7 +4786,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>         folio = page_folio(page);
>         nr_pages = folio_nr_pages(folio);
> 
> -       if (unlikely(userfaultfd_armed(vma))) {
> +       if (unlikely(userfaultfd_armed(vma)) || !vma_is_anon_shmem(vma)) {

If the above theoretical flow for fork & mremap is valid, then I don't think
this is sufficient.

>                 nr_pages = 1;
>         } else if (nr_pages > 1) {
>                 unsigned long start = ALIGN_DOWN(vmf->address, nr_pages *
> PAGE_SIZE);
> 
>> Additionally, I think this path will end up mapping the entire folio (as long as
>> it fits in the VMA). But this bypasses the fault-around configuration. As I
>> think I mentioned against the RFC, this will inflate the RSS of the process and
>> can cause behavioural changes as a result. I believe the current advice is to
>> disable fault-around to prevent this kind of bloat when needed.
> 
> With above change, I do not think this is a problem? since users already want to
> use mTHP for anonymous shmem.
> 
>> It might be that you need a special variant of finish_fault() for shmem?
>>
>>
>>> +    }
>>>       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
>>> -                      vmf->address, &vmf->ptl);
>>> +                       addr, &vmf->ptl);
>>>       if (!vmf->pte)
>>>           return VM_FAULT_NOPAGE;
>>>         /* Re-check under ptl */
>>> -    if (likely(!vmf_pte_changed(vmf))) {
>>> -        struct folio *folio = page_folio(page);
>>> -        int type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
>>> -
>>> -        set_pte_range(vmf, folio, page, 1, vmf->address);
>>> -        add_mm_counter(vma->vm_mm, type, 1);
>>> -        ret = 0;
>>> -    } else {
>>> -        update_mmu_tlb(vma, vmf->address, vmf->pte);
>>> +    if (nr_pages == 1 && unlikely(vmf_pte_changed(vmf))) {
>>> +        update_mmu_tlb(vma, addr, vmf->pte);
>>> +        ret = VM_FAULT_NOPAGE;
>>> +        goto unlock;
>>> +    } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
>>> +        for (i = 0; i < nr_pages; i++)
>>> +            update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i);
>>>           ret = VM_FAULT_NOPAGE;
>>> +        goto unlock;
>>>       }
>>>   +    set_pte_range(vmf, folio, page, nr_pages, addr);
>>> +    type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
>>> +    add_mm_counter(vma->vm_mm, type, nr_pages);
>>> +    ret = 0;
>>> +
>>> +unlock:
>>>       pte_unmap_unlock(vmf->pte, vmf->ptl);
>>>       return ret;
>>>   }
Baolin Wang May 8, 2024, 9:06 a.m. UTC | #5
On 2024/5/8 15:15, David Hildenbrand wrote:
> On 08.05.24 05:44, Baolin Wang wrote:
>>
>>
>> On 2024/5/7 18:37, Ryan Roberts wrote:
>>> On 06/05/2024 09:46, Baolin Wang wrote:
>>>> Add large folio mapping establishment support for finish_fault() as 
>>>> a preparation,
>>>> to support multi-size THP allocation of anonymous shmem pages in the 
>>>> following
>>>> patches.
>>>>
>>>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>>>> ---
>>>>    mm/memory.c | 43 +++++++++++++++++++++++++++++++++----------
>>>>    1 file changed, 33 insertions(+), 10 deletions(-)
>>>>
>>>> diff --git a/mm/memory.c b/mm/memory.c
>>>> index eea6e4984eae..936377220b77 100644
>>>> --- a/mm/memory.c
>>>> +++ b/mm/memory.c
>>>> @@ -4747,9 +4747,12 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>>>    {
>>>>        struct vm_area_struct *vma = vmf->vma;
>>>>        struct page *page;
>>>> +    struct folio *folio;
>>>>        vm_fault_t ret;
>>>>        bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
>>>>                  !(vma->vm_flags & VM_SHARED);
>>>> +    int type, nr_pages, i;
>>>> +    unsigned long addr = vmf->address;
>>>>        /* Did we COW the page? */
>>>>        if (is_cow)
>>>> @@ -4780,24 +4783,44 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>>>                return VM_FAULT_OOM;
>>>>        }
>>>> +    folio = page_folio(page);
>>>> +    nr_pages = folio_nr_pages(folio);
>>>> +
>>>> +    if (unlikely(userfaultfd_armed(vma))) {
>>>> +        nr_pages = 1;
>>>> +    } else if (nr_pages > 1) {
>>>> +        unsigned long start = ALIGN_DOWN(vmf->address, nr_pages * 
>>>> PAGE_SIZE);
>>>> +        unsigned long end = start + nr_pages * PAGE_SIZE;
>>>> +
>>>> +        /* In case the folio size in page cache beyond the VMA 
>>>> limits. */
>>>> +        addr = max(start, vma->vm_start);
>>>> +        nr_pages = (min(end, vma->vm_end) - addr) >> PAGE_SHIFT;
>>>> +
>>>> +        page = folio_page(folio, (addr - start) >> PAGE_SHIFT);
>>>
>>> I still don't really follow the logic in this else if block. Isn't it 
>>> possible
>>> that finish_fault() gets called with a page from a folio that isn't 
>>> aligned with
>>> vmf->address?
>>>
>>> For example, let's say we have a file who's size is 64K and which is 
>>> cached in a
>>> single large folio in the page cache. But the file is mapped into a 
>>> process at
>>> VA 16K to 80K. Let's say we fault on the first page (VA=16K). You 
>>> will calculate
>>
>> For shmem, this doesn't happen because the VA is aligned with the
>> hugepage size in the shmem_get_unmapped_area() function. See patch 7.
> 
> Does that cover mremap() and MAP_FIXED as well.

Good point. Thanks for pointing this out.

> We should try doing this as cleanly as possible, to prepare for the 
> future / corner cases.

Sure. Let me rethink the algorithm.
Baolin Wang May 8, 2024, 9:31 a.m. UTC | #6
On 2024/5/8 16:53, Ryan Roberts wrote:
> On 08/05/2024 04:44, Baolin Wang wrote:
>>
>>
>> On 2024/5/7 18:37, Ryan Roberts wrote:
>>> On 06/05/2024 09:46, Baolin Wang wrote:
>>>> Add large folio mapping establishment support for finish_fault() as a
>>>> preparation,
>>>> to support multi-size THP allocation of anonymous shmem pages in the following
>>>> patches.
>>>>
>>>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>>>> ---
>>>>    mm/memory.c | 43 +++++++++++++++++++++++++++++++++----------
>>>>    1 file changed, 33 insertions(+), 10 deletions(-)
>>>>
>>>> diff --git a/mm/memory.c b/mm/memory.c
>>>> index eea6e4984eae..936377220b77 100644
>>>> --- a/mm/memory.c
>>>> +++ b/mm/memory.c
>>>> @@ -4747,9 +4747,12 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>>>    {
>>>>        struct vm_area_struct *vma = vmf->vma;
>>>>        struct page *page;
>>>> +    struct folio *folio;
>>>>        vm_fault_t ret;
>>>>        bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
>>>>                  !(vma->vm_flags & VM_SHARED);
>>>> +    int type, nr_pages, i;
>>>> +    unsigned long addr = vmf->address;
>>>>          /* Did we COW the page? */
>>>>        if (is_cow)
>>>> @@ -4780,24 +4783,44 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>>>                return VM_FAULT_OOM;
>>>>        }
>>>>    +    folio = page_folio(page);
>>>> +    nr_pages = folio_nr_pages(folio);
>>>> +
>>>> +    if (unlikely(userfaultfd_armed(vma))) {
>>>> +        nr_pages = 1;
>>>> +    } else if (nr_pages > 1) {
>>>> +        unsigned long start = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
>>>> +        unsigned long end = start + nr_pages * PAGE_SIZE;
>>>> +
>>>> +        /* In case the folio size in page cache beyond the VMA limits. */
>>>> +        addr = max(start, vma->vm_start);
>>>> +        nr_pages = (min(end, vma->vm_end) - addr) >> PAGE_SHIFT;
>>>> +
>>>> +        page = folio_page(folio, (addr - start) >> PAGE_SHIFT);
>>>
>>> I still don't really follow the logic in this else if block. Isn't it possible
>>> that finish_fault() gets called with a page from a folio that isn't aligned with
>>> vmf->address?
>>>
>>> For example, let's say we have a file who's size is 64K and which is cached in a
>>> single large folio in the page cache. But the file is mapped into a process at
>>> VA 16K to 80K. Let's say we fault on the first page (VA=16K). You will calculate
>>
>> For shmem, this doesn't happen because the VA is aligned with the hugepage size
>> in the shmem_get_unmapped_area() function. See patch 7.
> 
> Certainly agree that shmem can always make sure that it packs a vma in a way
> such that its folios are naturally aligned in VA when faulting in memory. If you
> mremap it, that alignment will be lost; I don't think that would be a problem

When mremapping it, mremap() will also call shmem_get_unmapped_area() to 
align the VA, but for mremap() with the MAP_FIXED flag, as David pointed 
out, yes, this patch may not work perfectly.

> for a single process; mremap will take care of moving the ptes correctly and
> this path is not involved.
> 
> But what about the case when a process mmaps a shmem region, then forks, then
> the child mremaps the shmem region. Then the parent faults in a THP into the
> region (nicely aligned). Then the child faults in the same offset in the region
> and gets the THP that the parent allocated; that THP will be aligned in the
> parent's VM space but not in the child's.

Sorry, I did not get your point here. IIUC, the child's VA will also be 
aligned if the child's mremap does not set MAP_FIXED, since the child's 
mremap will still call shmem_get_unmapped_area() to find an aligned new 
VA. Please correct me if I missed your point.

>>> start=0 and end=64K I think?
>>
>> Yes. Unfortunately, some file systems that support large mappings do not perform
>> alignment for multi-size THP (non-PMD sized, for example: 64K). I think this
>> requires modification to __get_unmapped_area--->thp_get_unmapped_area_vmflags()
>> or file->f_op->get_unmapped_area() to align VA for multi-size THP in future.
> 
> By nature of the fact that a file mapping is shared between multiple processes
> and each process can map it where ever it wants down to 1 page granularity, its
> impossible for any THP containing a part of that file to be VA-aligned in every
> process it is mapped in.

Yes, so let me re-polish this patch. Thanks.

>> So before adding that VA alignment changes, only allow building the large folio
>> mapping for anonymous shmem:
>>
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 936377220b77..9e4d51826d23 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -4786,7 +4786,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>          folio = page_folio(page);
>>          nr_pages = folio_nr_pages(folio);
>>
>> -       if (unlikely(userfaultfd_armed(vma))) {
>> +       if (unlikely(userfaultfd_armed(vma)) || !vma_is_anon_shmem(vma)) {
> 
> If the above theoretical flow for fork & mremap is valid, then I don't think
> this is sufficient.
> 
>>                  nr_pages = 1;
>>          } else if (nr_pages > 1) {
>>                  unsigned long start = ALIGN_DOWN(vmf->address, nr_pages *
>> PAGE_SIZE);
>>
>>> Additionally, I think this path will end up mapping the entire folio (as long as
>>> it fits in the VMA). But this bypasses the fault-around configuration. As I
>>> think I mentioned against the RFC, this will inflate the RSS of the process and
>>> can cause behavioural changes as a result. I believe the current advice is to
>>> disable fault-around to prevent this kind of bloat when needed.
>>
>> With above change, I do not think this is a problem? since users already want to
>> use mTHP for anonymous shmem.
>>
>>> It might be that you need a special variant of finish_fault() for shmem?
>>>
>>>
>>>> +    }
>>>>        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
>>>> -                      vmf->address, &vmf->ptl);
>>>> +                       addr, &vmf->ptl);
>>>>        if (!vmf->pte)
>>>>            return VM_FAULT_NOPAGE;
>>>>          /* Re-check under ptl */
>>>> -    if (likely(!vmf_pte_changed(vmf))) {
>>>> -        struct folio *folio = page_folio(page);
>>>> -        int type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
>>>> -
>>>> -        set_pte_range(vmf, folio, page, 1, vmf->address);
>>>> -        add_mm_counter(vma->vm_mm, type, 1);
>>>> -        ret = 0;
>>>> -    } else {
>>>> -        update_mmu_tlb(vma, vmf->address, vmf->pte);
>>>> +    if (nr_pages == 1 && unlikely(vmf_pte_changed(vmf))) {
>>>> +        update_mmu_tlb(vma, addr, vmf->pte);
>>>> +        ret = VM_FAULT_NOPAGE;
>>>> +        goto unlock;
>>>> +    } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
>>>> +        for (i = 0; i < nr_pages; i++)
>>>> +            update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i);
>>>>            ret = VM_FAULT_NOPAGE;
>>>> +        goto unlock;
>>>>        }
>>>>    +    set_pte_range(vmf, folio, page, nr_pages, addr);
>>>> +    type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
>>>> +    add_mm_counter(vma->vm_mm, type, nr_pages);
>>>> +    ret = 0;
>>>> +
>>>> +unlock:
>>>>        pte_unmap_unlock(vmf->pte, vmf->ptl);
>>>>        return ret;
>>>>    }
Ryan Roberts May 8, 2024, 10:47 a.m. UTC | #7
On 08/05/2024 10:31, Baolin Wang wrote:
> 
> 
> On 2024/5/8 16:53, Ryan Roberts wrote:
>> On 08/05/2024 04:44, Baolin Wang wrote:
>>>
>>>
>>> On 2024/5/7 18:37, Ryan Roberts wrote:
>>>> On 06/05/2024 09:46, Baolin Wang wrote:
>>>>> Add large folio mapping establishment support for finish_fault() as a
>>>>> preparation,
>>>>> to support multi-size THP allocation of anonymous shmem pages in the following
>>>>> patches.
>>>>>
>>>>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>>>>> ---
>>>>>    mm/memory.c | 43 +++++++++++++++++++++++++++++++++----------
>>>>>    1 file changed, 33 insertions(+), 10 deletions(-)
>>>>>
>>>>> diff --git a/mm/memory.c b/mm/memory.c
>>>>> index eea6e4984eae..936377220b77 100644
>>>>> --- a/mm/memory.c
>>>>> +++ b/mm/memory.c
>>>>> @@ -4747,9 +4747,12 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>>>>    {
>>>>>        struct vm_area_struct *vma = vmf->vma;
>>>>>        struct page *page;
>>>>> +    struct folio *folio;
>>>>>        vm_fault_t ret;
>>>>>        bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
>>>>>                  !(vma->vm_flags & VM_SHARED);
>>>>> +    int type, nr_pages, i;
>>>>> +    unsigned long addr = vmf->address;
>>>>>          /* Did we COW the page? */
>>>>>        if (is_cow)
>>>>> @@ -4780,24 +4783,44 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>>>>                return VM_FAULT_OOM;
>>>>>        }
>>>>>    +    folio = page_folio(page);
>>>>> +    nr_pages = folio_nr_pages(folio);
>>>>> +
>>>>> +    if (unlikely(userfaultfd_armed(vma))) {
>>>>> +        nr_pages = 1;
>>>>> +    } else if (nr_pages > 1) {
>>>>> +        unsigned long start = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
>>>>> +        unsigned long end = start + nr_pages * PAGE_SIZE;
>>>>> +
>>>>> +        /* In case the folio size in page cache beyond the VMA limits. */
>>>>> +        addr = max(start, vma->vm_start);
>>>>> +        nr_pages = (min(end, vma->vm_end) - addr) >> PAGE_SHIFT;
>>>>> +
>>>>> +        page = folio_page(folio, (addr - start) >> PAGE_SHIFT);
>>>>
>>>> I still don't really follow the logic in this else if block. Isn't it possible
>>>> that finish_fault() gets called with a page from a folio that isn't aligned
>>>> with
>>>> vmf->address?
>>>>
>>>> For example, let's say we have a file who's size is 64K and which is cached
>>>> in a
>>>> single large folio in the page cache. But the file is mapped into a process at
>>>> VA 16K to 80K. Let's say we fault on the first page (VA=16K). You will
>>>> calculate
>>>
>>> For shmem, this doesn't happen because the VA is aligned with the hugepage size
>>> in the shmem_get_unmapped_area() function. See patch 7.
>>
>> Certainly agree that shmem can always make sure that it packs a vma in a way
>> such that its folios are naturally aligned in VA when faulting in memory. If you
>> mremap it, that alignment will be lost; I don't think that would be a problem
> 
> When mremap it, it will also call shmem_get_unmapped_area() to align the VA, but
> for mremap() with MAP_FIXED flag as David pointed out, yes, this patch may be
> not work perfectly.

Assuming this works similarly to anon mTHP, remapping to an arbitrary address
shouldn't be a problem within a single process; the previously allocated folios
will now be unaligned, but they will be correctly mapped so it doesn't break
anything. And new faults will allocate folios so that they are as large as
allowed by the sysfs interface AND which do not overlap with any non-none pte
AND which are naturally aligned. It's when you start sharing with other
processes that the fun and games start...
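
To make those three constraints concrete, here is a rough user-space model of
the selection policy (purely illustrative: the helper names are made up, the
pte_range_none() check is only mimicked by a stub, and this is not the kernel
implementation):

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)

/* Stand-in for pte_range_none(): pretend all PTEs in the range are none. */
static bool range_is_none(unsigned long start, int nr)
{
        (void)start; (void)nr;
        return true;
}

/* Highest allowed order whose naturally aligned range fits the VMA and
 * covers only none PTEs; 0 means fall back to a single 4K page. */
static int pick_order(unsigned long addr, unsigned long vm_start,
                      unsigned long vm_end, unsigned long allowed_orders)
{
        for (int order = 9; order >= 1; order--) {      /* up to PMD order */
                unsigned long size, start;

                if (!(allowed_orders & (1UL << order)))
                        continue;
                size = PAGE_SIZE << order;
                start = addr & ~(size - 1);             /* natural alignment */
                if (start < vm_start || start + size > vm_end)
                        continue;                       /* must fit the VMA */
                if (!range_is_none(start, 1 << order))
                        continue;                       /* overlaps used PTEs */
                return order;
        }
        return 0;
}

int main(void)
{
        /* VMA 16K..80K, fault at 20K, sysfs allows order-2 (16K) folios. */
        printf("order=%d\n",
               pick_order(20UL << 10, 16UL << 10, 80UL << 10, 1UL << 2));
        return 0;
}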

> 
>> for a single process; mremap will take care of moving the ptes correctly and
>> this path is not involved.
>>
>> But what about the case when a process mmaps a shmem region, then forks, then
>> the child mremaps the shmem region. Then the parent faults in a THP into the
>> region (nicely aligned). Then the child faults in the same offset in the region
>> and gets the THP that the parent allocated; that THP will be aligned in the
>> parent's VM space but not in the child's.
> 
> Sorry, I did not get your point here. IIUC, the child's VA will also be aligned
> if the child mremap is not set MAP_FIXED, since the child's mremap will still
> call shmem_get_unmapped_area() to find an aligned new VA.

In general, you shouldn't be relying on the vma bounds being aligned to a THP
boundary.

> Please correct me if I missed your point.

(I'm not 100% sure this is definitely how it works, but seems the only sane way
to me):

Let's imagine we have a process that maps 4 pages of shared anon memory at VA=64K:

  mmap(64K, 16K, PROT_X, MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, ...)

Then it forks a child, and the child moves the mapping to VA=68K:

  mremap(64K, 16K, 16K, MREMAP_FIXED | MREMAP_MAYMOVE, 68K)

Then the parent writes to address 64K (offset 0 in the shared region); this will
fault and cause a 16K mTHP to be allocated and mapped, covering the whole region
at 64K-80K in the parent.

Then the child reads address 68K (offset 0 in the shared region); this will
fault and cause the previously allocated 16K folio to be looked up and it must
be mapped in the child between 68K-84K. This is not naturally aligned in the child.

For the child, your code will incorrectly calculate start/end as 64K-80K.
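
For reference, a minimal user-space sketch of that sequence (error handling
omitted, 4K pages assumed, a crude sleep stands in for real synchronisation,
and the child's new address is 84K here instead of 68K only so that the old
and new ranges do not overlap; it is still not 16K-aligned, which is the
point):

#define _GNU_SOURCE
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        /* Parent maps 4 pages of shared anon memory at VA 64K. */
        char *p = mmap((void *)0x10000, 0x4000, PROT_READ | PROT_WRITE,
                       MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, -1, 0);

        if (fork() == 0) {
                /* Child moves its mapping to VA 84K (not 16K-aligned). */
                p = mremap(p, 0x4000, 0x4000,
                           MREMAP_MAYMOVE | MREMAP_FIXED, (void *)0x15000);
                sleep(1);       /* crude ordering: let the parent fault first */
                /*
                 * This read faults at offset 0 of the region and finds the
                 * 16K folio the parent allocated; it cannot be mapped
                 * naturally aligned at 84K.
                 */
                return p[0];
        }

        p[0] = 1;       /* parent faults offset 0; a 16K mTHP can map 64K..80K */
        wait(NULL);
        return 0;
}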

> 
>>>> start=0 and end=64K I think?
>>>
>>> Yes. Unfortunately, some file systems that support large mappings do not perform
>>> alignment for multi-size THP (non-PMD sized, for example: 64K). I think this
>>> requires modification to __get_unmapped_area--->thp_get_unmapped_area_vmflags()
>>> or file->f_op->get_unmapped_area() to align VA for multi-size THP in future.
>>
>> By nature of the fact that a file mapping is shared between multiple processes
>> and each process can map it where ever it wants down to 1 page granularity, its
>> impossible for any THP containing a part of that file to be VA-aligned in every
>> process it is mapped in.
> 
> Yes, so let me re-polish this patch. Thanks.
> 
>>> So before adding that VA alignment changes, only allow building the large folio
>>> mapping for anonymous shmem:
>>>
>>> diff --git a/mm/memory.c b/mm/memory.c
>>> index 936377220b77..9e4d51826d23 100644
>>> --- a/mm/memory.c
>>> +++ b/mm/memory.c
>>> @@ -4786,7 +4786,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>>          folio = page_folio(page);
>>>          nr_pages = folio_nr_pages(folio);
>>>
>>> -       if (unlikely(userfaultfd_armed(vma))) {
>>> +       if (unlikely(userfaultfd_armed(vma)) || !vma_is_anon_shmem(vma)) {
>>
>> If the above theoretical flow for fork & mremap is valid, then I don't think
>> this is sufficient.
>>
>>>                  nr_pages = 1;
>>>          } else if (nr_pages > 1) {
>>>                  unsigned long start = ALIGN_DOWN(vmf->address, nr_pages *
>>> PAGE_SIZE);
>>>
>>>> Additionally, I think this path will end up mapping the entire folio (as
>>>> long as
>>>> it fits in the VMA). But this bypasses the fault-around configuration. As I
>>>> think I mentioned against the RFC, this will inflate the RSS of the process and
>>>> can cause behavioural changes as a result. I believe the current advice is to
>>>> disable fault-around to prevent this kind of bloat when needed.
>>>
>>> With above change, I do not think this is a problem? since users already want to
>>> use mTHP for anonymous shmem.
>>>
>>>> It might be that you need a special variant of finish_fault() for shmem?
>>>>
>>>>
>>>>> +    }
>>>>>        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
>>>>> -                      vmf->address, &vmf->ptl);
>>>>> +                       addr, &vmf->ptl);
>>>>>        if (!vmf->pte)
>>>>>            return VM_FAULT_NOPAGE;
>>>>>          /* Re-check under ptl */
>>>>> -    if (likely(!vmf_pte_changed(vmf))) {
>>>>> -        struct folio *folio = page_folio(page);
>>>>> -        int type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
>>>>> -
>>>>> -        set_pte_range(vmf, folio, page, 1, vmf->address);
>>>>> -        add_mm_counter(vma->vm_mm, type, 1);
>>>>> -        ret = 0;
>>>>> -    } else {
>>>>> -        update_mmu_tlb(vma, vmf->address, vmf->pte);
>>>>> +    if (nr_pages == 1 && unlikely(vmf_pte_changed(vmf))) {
>>>>> +        update_mmu_tlb(vma, addr, vmf->pte);
>>>>> +        ret = VM_FAULT_NOPAGE;
>>>>> +        goto unlock;
>>>>> +    } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
>>>>> +        for (i = 0; i < nr_pages; i++)
>>>>> +            update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i);
>>>>>            ret = VM_FAULT_NOPAGE;
>>>>> +        goto unlock;
>>>>>        }
>>>>>    +    set_pte_range(vmf, folio, page, nr_pages, addr);
>>>>> +    type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
>>>>> +    add_mm_counter(vma->vm_mm, type, nr_pages);
>>>>> +    ret = 0;
>>>>> +
>>>>> +unlock:
>>>>>        pte_unmap_unlock(vmf->pte, vmf->ptl);
>>>>>        return ret;
>>>>>    }
Baolin Wang May 9, 2024, 1:10 a.m. UTC | #8
On 2024/5/8 18:47, Ryan Roberts wrote:
> On 08/05/2024 10:31, Baolin Wang wrote:
>>
>>
>> On 2024/5/8 16:53, Ryan Roberts wrote:
>>> On 08/05/2024 04:44, Baolin Wang wrote:
>>>>
>>>>
>>>> On 2024/5/7 18:37, Ryan Roberts wrote:
>>>>> On 06/05/2024 09:46, Baolin Wang wrote:
>>>>>> Add large folio mapping establishment support for finish_fault() as a
>>>>>> preparation,
>>>>>> to support multi-size THP allocation of anonymous shmem pages in the following
>>>>>> patches.
>>>>>>
>>>>>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>>>>>> ---
>>>>>>     mm/memory.c | 43 +++++++++++++++++++++++++++++++++----------
>>>>>>     1 file changed, 33 insertions(+), 10 deletions(-)
>>>>>>
>>>>>> diff --git a/mm/memory.c b/mm/memory.c
>>>>>> index eea6e4984eae..936377220b77 100644
>>>>>> --- a/mm/memory.c
>>>>>> +++ b/mm/memory.c
>>>>>> @@ -4747,9 +4747,12 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>>>>>     {
>>>>>>         struct vm_area_struct *vma = vmf->vma;
>>>>>>         struct page *page;
>>>>>> +    struct folio *folio;
>>>>>>         vm_fault_t ret;
>>>>>>         bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
>>>>>>                   !(vma->vm_flags & VM_SHARED);
>>>>>> +    int type, nr_pages, i;
>>>>>> +    unsigned long addr = vmf->address;
>>>>>>           /* Did we COW the page? */
>>>>>>         if (is_cow)
>>>>>> @@ -4780,24 +4783,44 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>>>>>                 return VM_FAULT_OOM;
>>>>>>         }
>>>>>>     +    folio = page_folio(page);
>>>>>> +    nr_pages = folio_nr_pages(folio);
>>>>>> +
>>>>>> +    if (unlikely(userfaultfd_armed(vma))) {
>>>>>> +        nr_pages = 1;
>>>>>> +    } else if (nr_pages > 1) {
>>>>>> +        unsigned long start = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
>>>>>> +        unsigned long end = start + nr_pages * PAGE_SIZE;
>>>>>> +
>>>>>> +        /* In case the folio size in page cache beyond the VMA limits. */
>>>>>> +        addr = max(start, vma->vm_start);
>>>>>> +        nr_pages = (min(end, vma->vm_end) - addr) >> PAGE_SHIFT;
>>>>>> +
>>>>>> +        page = folio_page(folio, (addr - start) >> PAGE_SHIFT);
>>>>>
>>>>> I still don't really follow the logic in this else if block. Isn't it possible
>>>>> that finish_fault() gets called with a page from a folio that isn't aligned
>>>>> with
>>>>> vmf->address?
>>>>>
>>>>> For example, let's say we have a file who's size is 64K and which is cached
>>>>> in a
>>>>> single large folio in the page cache. But the file is mapped into a process at
>>>>> VA 16K to 80K. Let's say we fault on the first page (VA=16K). You will
>>>>> calculate
>>>>
>>>> For shmem, this doesn't happen because the VA is aligned with the hugepage size
>>>> in the shmem_get_unmapped_area() function. See patch 7.
>>>
>>> Certainly agree that shmem can always make sure that it packs a vma in a way
>>> such that its folios are naturally aligned in VA when faulting in memory. If you
>>> mremap it, that alignment will be lost; I don't think that would be a problem
>>
>> When mremap it, it will also call shmem_get_unmapped_area() to align the VA, but
>> for mremap() with MAP_FIXED flag as David pointed out, yes, this patch may be
>> not work perfectly.
> 
> Assuming this works similarly to anon mTHP, remapping to an arbitrary address
> shouldn't be a problem within a single process; the previously allocated folios
> will now be unaligned, but they will be correctly mapped so it doesn't break
> anything. And new faults will allocate folios so that they are as large as
> allowed by the sysfs interface AND which do not overlap with any non-none pte
> AND which are naturally aligned. It's when you start sharing with other
> processes that the fun and games start...
> 
>>
>>> for a single process; mremap will take care of moving the ptes correctly and
>>> this path is not involved.
>>>
>>> But what about the case when a process mmaps a shmem region, then forks, then
>>> the child mremaps the shmem region. Then the parent faults in a THP into the
>>> region (nicely aligned). Then the child faults in the same offset in the region
>>> and gets the THP that the parent allocated; that THP will be aligned in the
>>> parent's VM space but not in the child's.
>>
>> Sorry, I did not get your point here. IIUC, the child's VA will also be aligned
>> if the child mremap is not set MAP_FIXED, since the child's mremap will still
>> call shmem_get_unmapped_area() to find an aligned new VA.
> 
> In general, you shouldn't be relying on the vma bounds being aligned to a THP
> boundary.
> 
>> Please correct me if I missed your point.
> 
> (I'm not 100% sure this is definitely how it works, but seems the only sane way
> to me):
> 
> Let's imagine we have a process that maps 4 pages of shared anon memory at VA=64K:
> 
>    mmap(64K, 16K, PROT_X, MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, ...)
> 
> Then it forks a child, and the child moves the mapping to VA=68K:
> 
>    mremap(64K, 16K, 16K, MREMAP_FIXED | MREMAP_MAYMOVE, 68K)
> 
> Then the parent writes to address 64K (offset 0 in the shared region); this will
> fault and cause a 16K mTHP to be allocated and mapped, covering the whole region
> at 64K-80K in the parent.
> 
> Then the child reads address 68K (offset 0 in the shared region); this will
> fault and cause the previously allocated 16K folio to be looked up and it must
> be mapped in the child between 68K-84K. This is not naturally aligned in the child.
> 
> For the child, your code will incorrectly calculate start/end as 64K-80K.

OK, so you set the MREMAP_FIXED flag, just as David pointed out. Yes, it 
will not be aligned in the child in this case. Thanks for the explanation.

Patch

diff --git a/mm/memory.c b/mm/memory.c
index eea6e4984eae..936377220b77 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4747,9 +4747,12 @@  vm_fault_t finish_fault(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct page *page;
+	struct folio *folio;
 	vm_fault_t ret;
 	bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
 		      !(vma->vm_flags & VM_SHARED);
+	int type, nr_pages, i;
+	unsigned long addr = vmf->address;
 
 	/* Did we COW the page? */
 	if (is_cow)
@@ -4780,24 +4783,44 @@  vm_fault_t finish_fault(struct vm_fault *vmf)
 			return VM_FAULT_OOM;
 	}
 
+	folio = page_folio(page);
+	nr_pages = folio_nr_pages(folio);
+
+	if (unlikely(userfaultfd_armed(vma))) {
+		nr_pages = 1;
+	} else if (nr_pages > 1) {
+		unsigned long start = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
+		unsigned long end = start + nr_pages * PAGE_SIZE;
+
+		/* In case the folio size in page cache beyond the VMA limits. */
+		addr = max(start, vma->vm_start);
+		nr_pages = (min(end, vma->vm_end) - addr) >> PAGE_SHIFT;
+
+		page = folio_page(folio, (addr - start) >> PAGE_SHIFT);
+	}
 	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
-				      vmf->address, &vmf->ptl);
+				       addr, &vmf->ptl);
 	if (!vmf->pte)
 		return VM_FAULT_NOPAGE;
 
 	/* Re-check under ptl */
-	if (likely(!vmf_pte_changed(vmf))) {
-		struct folio *folio = page_folio(page);
-		int type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
-
-		set_pte_range(vmf, folio, page, 1, vmf->address);
-		add_mm_counter(vma->vm_mm, type, 1);
-		ret = 0;
-	} else {
-		update_mmu_tlb(vma, vmf->address, vmf->pte);
+	if (nr_pages == 1 && unlikely(vmf_pte_changed(vmf))) {
+		update_mmu_tlb(vma, addr, vmf->pte);
+		ret = VM_FAULT_NOPAGE;
+		goto unlock;
+	} else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
+		for (i = 0; i < nr_pages; i++)
+			update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i);
 		ret = VM_FAULT_NOPAGE;
+		goto unlock;
 	}
 
+	set_pte_range(vmf, folio, page, nr_pages, addr);
+	type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
+	add_mm_counter(vma->vm_mm, type, nr_pages);
+	ret = 0;
+
+unlock:
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
 	return ret;
 }