
[RFC,1/5] mm: memory: extend finish_fault() to support large folio

Message ID 358aefb1858b63164894d7d8504f3dae0b495366.1713755580.git.baolin.wang@linux.alibaba.com (mailing list archive)
State New
Series add mTHP support for anonymous share pages

Commit Message

Baolin Wang April 22, 2024, 7:02 a.m. UTC
Add support for establishing large folio mappings in finish_fault(), as a
preparation for supporting multi-size THP allocation of anonymous shared
pages in the following patches.

Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
---
 mm/memory.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

Comments

Lance Yang April 23, 2024, 8:39 a.m. UTC | #1
Hey Baolin,

[...]
@@ -4727,9 +4725,11 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
[...]

The mm counter updates have been moved out of set_pte_range(), so we may
need to rebase against Kefeng's patch[1].

+	int type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
	set_pte_range(vmf, folio, &folio->page, nr_pages, addr);
+	add_mm_counter(vma->vm_mm, type, nr_pages);
[...]

[1] https://lore.kernel.org/all/20240412064751.119015-1-wangkefeng.wang@huawei.com/T/#me2827c26ff453c0fa86f2af9f68f245978b08774

Thanks,
Lance
Ryan Roberts April 23, 2024, 11:03 a.m. UTC | #2
On 22/04/2024 08:02, Baolin Wang wrote:
> Add large folio mapping establishment support for finish_fault() as a preparation,
> to support multi-size THP allocation of anonymous shared pages in the following
> patches.
> 
> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> ---
>  mm/memory.c | 25 ++++++++++++++++++-------
>  1 file changed, 18 insertions(+), 7 deletions(-)
> 
> diff --git a/mm/memory.c b/mm/memory.c
> index b6fa5146b260..094a76730776 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4766,7 +4766,10 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>  {
>  	struct vm_area_struct *vma = vmf->vma;
>  	struct page *page;
> +	struct folio *folio;
>  	vm_fault_t ret;
> +	int nr_pages, i;
> +	unsigned long addr;
>  
>  	/* Did we COW the page? */
>  	if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
> @@ -4797,22 +4800,30 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>  			return VM_FAULT_OOM;
>  	}
>  
> +	folio = page_folio(page);
> +	nr_pages = folio_nr_pages(folio);
> +	addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);

I'm not sure this is safe. IIUC, finish_fault() is called for any file-backed
mapping. So you could have a situation where part of a (regular) file is mapped
in the process, faults and hits in the pagecache. But the folio returned by the
pagecache is bigger than the portion that the process has mapped. So you now end
up mapping beyond the VMA limits? In the pagecache case, you also can't assume
that the folio is naturally aligned in virtual address space.
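
To make that concrete with some invented numbers (just for illustration,
not taken from the patch):

	/*
	 * Suppose the process maps a single page of a regular file:
	 *   vma->vm_start = 0x7f0000003000, vma->vm_end = 0x7f0000004000
	 * and the pagecache returns a 64K (16-page) folio covering that
	 * offset. With the patch:
	 *
	 *   nr_pages = 16;
	 *   addr = ALIGN_DOWN(0x7f0000003000, 16 * PAGE_SIZE);
	 *        = 0x7f0000000000
	 *
	 * addr is now below vma->vm_start, so set_pte_range() would install
	 * 16 ptes starting outside the VMA. The alignment also says nothing
	 * about where &folio->page actually lands relative to vmf->address,
	 * so the wrong subpages could end up at the wrong addresses.
	 */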


>  	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
> -				      vmf->address, &vmf->ptl);
> +				       addr, &vmf->ptl);
>  	if (!vmf->pte)
>  		return VM_FAULT_NOPAGE;
>  
>  	/* Re-check under ptl */
> -	if (likely(!vmf_pte_changed(vmf))) {
> -		struct folio *folio = page_folio(page);
> -
> -		set_pte_range(vmf, folio, page, 1, vmf->address);
> -		ret = 0;
> -	} else {
> +	if (nr_pages == 1 && vmf_pte_changed(vmf)) {
>  		update_mmu_tlb(vma, vmf->address, vmf->pte);
>  		ret = VM_FAULT_NOPAGE;
> +		goto unlock;
> +	} else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {

I think you have grabbed this from do_anonymous_page()? But I'm not sure it
works in the same way here as it does there. For the anon case, if userfaultfd
is armed, alloc_anon_folio() will only ever allocate order-0. So we end up in
the vmf_pte_changed() path, which will allow overwriting a uffd entry. But here,
there is nothing stopping nr_pages being greater than 1 when there could be a
uffd entry present, and you will fail due to the pte_range_none() check. (see
pte_marker_handle_uffd_wp()).
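
For reference, the guard on the anon side lives in alloc_anon_folio() and
looks roughly like this (paraphrased and trimmed to the relevant part):

	static struct folio *alloc_anon_folio(struct vm_fault *vmf)
	{
		struct vm_area_struct *vma = vmf->vma;
		...
		/*
		 * If uffd is active for the vma we need per-page fault
		 * fidelity to maintain the uffd semantics.
		 */
		if (unlikely(userfaultfd_armed(vma)))
			goto fallback;
		...
	}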

Thanks,
Ryan

> +		for (i = 0; i < nr_pages; i++)
> +			update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i);
> +		ret = VM_FAULT_NOPAGE;
> +		goto unlock;
>  	}
>  
> +	set_pte_range(vmf, folio, &folio->page, nr_pages, addr);
> +	ret = 0;
> +
> +unlock:
>  	pte_unmap_unlock(vmf->pte, vmf->ptl);
>  	return ret;
>  }
Baolin Wang April 24, 2024, 3:23 a.m. UTC | #3
On 2024/4/23 19:03, Ryan Roberts wrote:
> On 22/04/2024 08:02, Baolin Wang wrote:
>> Add large folio mapping establishment support for finish_fault() as a preparation,
>> to support multi-size THP allocation of anonymous shared pages in the following
>> patches.
>>
>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>> ---
>>   mm/memory.c | 25 ++++++++++++++++++-------
>>   1 file changed, 18 insertions(+), 7 deletions(-)
>>
>> diff --git a/mm/memory.c b/mm/memory.c
>> index b6fa5146b260..094a76730776 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -4766,7 +4766,10 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>   {
>>   	struct vm_area_struct *vma = vmf->vma;
>>   	struct page *page;
>> +	struct folio *folio;
>>   	vm_fault_t ret;
>> +	int nr_pages, i;
>> +	unsigned long addr;
>>   
>>   	/* Did we COW the page? */
>>   	if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
>> @@ -4797,22 +4800,30 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>   			return VM_FAULT_OOM;
>>   	}
>>   
>> +	folio = page_folio(page);
>> +	nr_pages = folio_nr_pages(folio);
>> +	addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
> 
> I'm not sure this is safe. IIUC, finish_fault() is called for any file-backed
> mapping. So you could have a situation where part of a (regular) file is mapped
> in the process, faults and hits in the pagecache. But the folio returned by the
> pagecache is bigger than the portion that the process has mapped. So you now end
> up mapping beyond the VMA limits? In the pagecache case, you also can't assume
> that the folio is naturally aligned in virtual address space.

Good point. Yes, I think you are right, I need to consider the VMA limits, 
and I should refer to the calculations of the start pte and end pte in 
do_fault_around().
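
Roughly, I am thinking of something along these lines (only a sketch to
show the direction; the exact helpers and bounds may change):

	struct folio *folio = page_folio(page);
	/* Index of the faulting page within the folio. */
	int idx = folio_page_idx(folio, page);
	int nr_pages = folio_nr_pages(folio);
	/* Virtual address of the start of the folio... */
	unsigned long addr = vmf->address - idx * PAGE_SIZE;
	/* ...clamped to the VMA and to the PTE page table covering the fault. */
	unsigned long start = max3(addr, vma->vm_start,
				   ALIGN_DOWN(vmf->address, PMD_SIZE));
	unsigned long end = min3(addr + nr_pages * PAGE_SIZE, vma->vm_end,
				 ALIGN_DOWN(vmf->address, PMD_SIZE) + PMD_SIZE);

	idx += (start - addr) >> PAGE_SHIFT;
	nr_pages = (end - start) >> PAGE_SHIFT;
	page = folio_page(folio, idx);
	addr = start;
	/* ...then map nr_pages ptes at addr, starting from 'page'. */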

>>   	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
>> -				      vmf->address, &vmf->ptl);
>> +				       addr, &vmf->ptl);
>>   	if (!vmf->pte)
>>   		return VM_FAULT_NOPAGE;
>>   
>>   	/* Re-check under ptl */
>> -	if (likely(!vmf_pte_changed(vmf))) {
>> -		struct folio *folio = page_folio(page);
>> -
>> -		set_pte_range(vmf, folio, page, 1, vmf->address);
>> -		ret = 0;
>> -	} else {
>> +	if (nr_pages == 1 && vmf_pte_changed(vmf)) {
>>   		update_mmu_tlb(vma, vmf->address, vmf->pte);
>>   		ret = VM_FAULT_NOPAGE;
>> +		goto unlock;
>> +	} else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
> 
> I think you have grabbed this from do_anonymous_page()? But I'm not sure it
> works in the same way here as it does there. For the anon case, if userfaultfd
> is armed, alloc_anon_folio() will only ever allocate order-0. So we end up in

IMO, the userfaultfd validation should be done in the vma->vm_ops->fault() 
callback, to make sure nr_pages is always 1 if userfaultfd is armed.

> the vmf_pte_changed() path, which will allow overwriting a uffd entry. But here,
> there is nothing stopping nr_pages being greater than 1 when there could be a
> uffd entry present, and you will fail due to the pte_range_none() check. (see
> pte_marker_handle_uffd_wp()).

So if we do the userfaultfd validation in the ->fault() callback, then here 
we can use the same logic as in the anonymous case.
Ryan Roberts April 24, 2024, 8:07 a.m. UTC | #4
On 24/04/2024 04:23, Baolin Wang wrote:
> 
> 
> On 2024/4/23 19:03, Ryan Roberts wrote:
>> On 22/04/2024 08:02, Baolin Wang wrote:
>>> Add large folio mapping establishment support for finish_fault() as a
>>> preparation,
>>> to support multi-size THP allocation of anonymous shared pages in the following
>>> patches.
>>>
>>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>>> ---
>>>   mm/memory.c | 25 ++++++++++++++++++-------
>>>   1 file changed, 18 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/mm/memory.c b/mm/memory.c
>>> index b6fa5146b260..094a76730776 100644
>>> --- a/mm/memory.c
>>> +++ b/mm/memory.c
>>> @@ -4766,7 +4766,10 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>>   {
>>>       struct vm_area_struct *vma = vmf->vma;
>>>       struct page *page;
>>> +    struct folio *folio;
>>>       vm_fault_t ret;
>>> +    int nr_pages, i;
>>> +    unsigned long addr;
>>>         /* Did we COW the page? */
>>>       if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
>>> @@ -4797,22 +4800,30 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>>               return VM_FAULT_OOM;
>>>       }
>>>   +    folio = page_folio(page);
>>> +    nr_pages = folio_nr_pages(folio);
>>> +    addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
>>
>> I'm not sure this is safe. IIUC, finish_fault() is called for any file-backed
>> mapping. So you could have a situation where part of a (regular) file is mapped
>> in the process, faults and hits in the pagecache. But the folio returned by the
>> pagecache is bigger than the portion that the process has mapped. So you now end
>> up mapping beyond the VMA limits? In the pagecache case, you also can't assume
>> that the folio is naturally aligned in virtual address space.
> 
> Good point. Yes, I think you are right, I need consider the VMA limits, and I
> should refer to the calculations of the start pte and end pte in do_fault_around().

You might also need to be careful not to increase reported RSS. I have a vague
recollection that David once mentioned a problem with fault-around because it
causes the reported RSS to increase for the process and this could lead to
different decisions in other places. IIRC Red Hat had an advisory somewhere with
the suggested workaround being to disable fault-around. For the anon-shared memory
case, it shouldn't be a problem because the user has opted into allocating
bigger blocks, but there may be a need to ensure we don't also start eagerly
mapping regular files beyond what fault-around is configured for.

> 
>>>       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
>>> -                      vmf->address, &vmf->ptl);
>>> +                       addr, &vmf->ptl);
>>>       if (!vmf->pte)
>>>           return VM_FAULT_NOPAGE;
>>>         /* Re-check under ptl */
>>> -    if (likely(!vmf_pte_changed(vmf))) {
>>> -        struct folio *folio = page_folio(page);
>>> -
>>> -        set_pte_range(vmf, folio, page, 1, vmf->address);
>>> -        ret = 0;
>>> -    } else {
>>> +    if (nr_pages == 1 && vmf_pte_changed(vmf)) {
>>>           update_mmu_tlb(vma, vmf->address, vmf->pte);
>>>           ret = VM_FAULT_NOPAGE;
>>> +        goto unlock;
>>> +    } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
>>
>> I think you have grabbed this from do_anonymous_page()? But I'm not sure it
>> works in the same way here as it does there. For the anon case, if userfaultfd
>> is armed, alloc_anon_folio() will only ever allocate order-0. So we end up in
> 
> IMO, the userfaultfd validation should do in the vma->vm_ops->fault() callback,
> to make sure the nr_pages is always 1 if userfaultfd is armed.

OK. Are you saying there is already logic to do that today? Great!

> 
>> the vmf_pte_changed() path, which will allow overwriting a uffd entry. But here,
>> there is nothing stopping nr_pages being greater than 1 when there could be a
>> uffd entry present, and you will fail due to the pte_range_none() check. (see
>> pte_marker_handle_uffd_wp()).
> 
> So if we do the userfaultfd validation in ->fault() callback, then here we can
> use the same logic as with anonymous case.
Baolin Wang April 24, 2024, 9:26 a.m. UTC | #5
On 2024/4/24 16:07, Ryan Roberts wrote:
> On 24/04/2024 04:23, Baolin Wang wrote:
>>
>>
>> On 2024/4/23 19:03, Ryan Roberts wrote:
>>> On 22/04/2024 08:02, Baolin Wang wrote:
>>>> Add large folio mapping establishment support for finish_fault() as a
>>>> preparation,
>>>> to support multi-size THP allocation of anonymous shared pages in the following
>>>> patches.
>>>>
>>>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>>>> ---
>>>>    mm/memory.c | 25 ++++++++++++++++++-------
>>>>    1 file changed, 18 insertions(+), 7 deletions(-)
>>>>
>>>> diff --git a/mm/memory.c b/mm/memory.c
>>>> index b6fa5146b260..094a76730776 100644
>>>> --- a/mm/memory.c
>>>> +++ b/mm/memory.c
>>>> @@ -4766,7 +4766,10 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>>>    {
>>>>        struct vm_area_struct *vma = vmf->vma;
>>>>        struct page *page;
>>>> +    struct folio *folio;
>>>>        vm_fault_t ret;
>>>> +    int nr_pages, i;
>>>> +    unsigned long addr;
>>>>          /* Did we COW the page? */
>>>>        if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
>>>> @@ -4797,22 +4800,30 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>>>                return VM_FAULT_OOM;
>>>>        }
>>>>    +    folio = page_folio(page);
>>>> +    nr_pages = folio_nr_pages(folio);
>>>> +    addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
>>>
>>> I'm not sure this is safe. IIUC, finish_fault() is called for any file-backed
>>> mapping. So you could have a situation where part of a (regular) file is mapped
>>> in the process, faults and hits in the pagecache. But the folio returned by the
>>> pagecache is bigger than the portion that the process has mapped. So you now end
>>> up mapping beyond the VMA limits? In the pagecache case, you also can't assume
>>> that the folio is naturally aligned in virtual address space.
>>
>> Good point. Yes, I think you are right, I need consider the VMA limits, and I
>> should refer to the calculations of the start pte and end pte in do_fault_around().
> 
> You might also need to be careful not to increase reported RSS. I have a vague
> recollection that David once mentioned a problem with fault-around because it
> causes the reported RSS to increase for the process and this could lead to
> different decisions in other places. IIRC Redhat had an advisory somewhere with
> suggested workaround being to disable fault-around. For the anon-shared memory
> case, it shouldn't be a problem because the user has opted into allocating
> bigger blocks, but there may be a need to ensure we don't also start eagerly
> mapping regular files beyond what fault-around is configured for.

Thanks for the reminder. I also agree with you that this should not be a 
problem, since the user has explicitly selected the larger folio, which is 
not the same as fault-around.

>>>>        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
>>>> -                      vmf->address, &vmf->ptl);
>>>> +                       addr, &vmf->ptl);
>>>>        if (!vmf->pte)
>>>>            return VM_FAULT_NOPAGE;
>>>>          /* Re-check under ptl */
>>>> -    if (likely(!vmf_pte_changed(vmf))) {
>>>> -        struct folio *folio = page_folio(page);
>>>> -
>>>> -        set_pte_range(vmf, folio, page, 1, vmf->address);
>>>> -        ret = 0;
>>>> -    } else {
>>>> +    if (nr_pages == 1 && vmf_pte_changed(vmf)) {
>>>>            update_mmu_tlb(vma, vmf->address, vmf->pte);
>>>>            ret = VM_FAULT_NOPAGE;
>>>> +        goto unlock;
>>>> +    } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
>>>
>>> I think you have grabbed this from do_anonymous_page()? But I'm not sure it
>>> works in the same way here as it does there. For the anon case, if userfaultfd
>>> is armed, alloc_anon_folio() will only ever allocate order-0. So we end up in
>>
>> IMO, the userfaultfd validation should do in the vma->vm_ops->fault() callback,
>> to make sure the nr_pages is always 1 if userfaultfd is armed.
> 
> OK. Are you saying there is already logic to do that today? Great!

I mean I should add the userfaultfd validation in shmem_fault(), and maybe 
we need to add a warning in finish_fault() to catch this issue if other 
vma->vm_ops->fault() implementations start supporting large folio allocation?

WARN_ON(nr_pages > 1 && userfaultfd_armed(vma));
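
On the shmem side, the check could be as simple as something like this
(only a sketch of the placement, not tested; 'orders' stands for whatever
the shmem allocation path ends up using to pick folio sizes):

	/* In the shmem fault/allocation path: keep per-page uffd semantics. */
	if (userfaultfd_armed(vma))
		orders = 0;	/* fall back to order-0 */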
Ryan Roberts April 24, 2024, 9:57 a.m. UTC | #6
On 24/04/2024 10:26, Baolin Wang wrote:
> 
> 
> On 2024/4/24 16:07, Ryan Roberts wrote:
>> On 24/04/2024 04:23, Baolin Wang wrote:
>>>
>>>
>>> On 2024/4/23 19:03, Ryan Roberts wrote:
>>>> On 22/04/2024 08:02, Baolin Wang wrote:
>>>>> Add large folio mapping establishment support for finish_fault() as a
>>>>> preparation,
>>>>> to support multi-size THP allocation of anonymous shared pages in the
>>>>> following
>>>>> patches.
>>>>>
>>>>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>>>>> ---
>>>>>    mm/memory.c | 25 ++++++++++++++++++-------
>>>>>    1 file changed, 18 insertions(+), 7 deletions(-)
>>>>>
>>>>> diff --git a/mm/memory.c b/mm/memory.c
>>>>> index b6fa5146b260..094a76730776 100644
>>>>> --- a/mm/memory.c
>>>>> +++ b/mm/memory.c
>>>>> @@ -4766,7 +4766,10 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>>>>    {
>>>>>        struct vm_area_struct *vma = vmf->vma;
>>>>>        struct page *page;
>>>>> +    struct folio *folio;
>>>>>        vm_fault_t ret;
>>>>> +    int nr_pages, i;
>>>>> +    unsigned long addr;
>>>>>          /* Did we COW the page? */
>>>>>        if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
>>>>> @@ -4797,22 +4800,30 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>>>>                return VM_FAULT_OOM;
>>>>>        }
>>>>>    +    folio = page_folio(page);
>>>>> +    nr_pages = folio_nr_pages(folio);
>>>>> +    addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
>>>>
>>>> I'm not sure this is safe. IIUC, finish_fault() is called for any file-backed
>>>> mapping. So you could have a situation where part of a (regular) file is mapped
>>>> in the process, faults and hits in the pagecache. But the folio returned by the
>>>> pagecache is bigger than the portion that the process has mapped. So you now
>>>> end
>>>> up mapping beyond the VMA limits? In the pagecache case, you also can't assume
>>>> that the folio is naturally aligned in virtual address space.
>>>
>>> Good point. Yes, I think you are right, I need consider the VMA limits, and I
>>> should refer to the calculations of the start pte and end pte in
>>> do_fault_around().
>>
>> You might also need to be careful not to increase reported RSS. I have a vague
>> recollection that David once mentioned a problem with fault-around because it
>> causes the reported RSS to increase for the process and this could lead to
>> different decisions in other places. IIRC Redhat had an advisory somewhere with
>> suggested workaround being to disable fault-around. For the anon-shared memory
>> case, it shouldn't be a problem because the user has opted into allocating
>> bigger blocks, but there may be a need to ensure we don't also start eagerly
>> mapping regular files beyond what fault-around is configured for.
> 
> Thanks for reminding. And I also agree with you that this should not be a
> problem since user has selected the larger folio, which is not the same as
> fault-around.
> 
>>>>>        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
>>>>> -                      vmf->address, &vmf->ptl);
>>>>> +                       addr, &vmf->ptl);
>>>>>        if (!vmf->pte)
>>>>>            return VM_FAULT_NOPAGE;
>>>>>          /* Re-check under ptl */
>>>>> -    if (likely(!vmf_pte_changed(vmf))) {
>>>>> -        struct folio *folio = page_folio(page);
>>>>> -
>>>>> -        set_pte_range(vmf, folio, page, 1, vmf->address);
>>>>> -        ret = 0;
>>>>> -    } else {
>>>>> +    if (nr_pages == 1 && vmf_pte_changed(vmf)) {
>>>>>            update_mmu_tlb(vma, vmf->address, vmf->pte);
>>>>>            ret = VM_FAULT_NOPAGE;
>>>>> +        goto unlock;
>>>>> +    } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
>>>>
>>>> I think you have grabbed this from do_anonymous_page()? But I'm not sure it
>>>> works in the same way here as it does there. For the anon case, if userfaultfd
>>>> is armed, alloc_anon_folio() will only ever allocate order-0. So we end up in
>>>
>>> IMO, the userfaultfd validation should do in the vma->vm_ops->fault() callback,
>>> to make sure the nr_pages is always 1 if userfaultfd is armed.
>>
>> OK. Are you saying there is already logic to do that today? Great!
> 
> I mean I should add the userfaultfd validation in shmem_fault(), and may be need
> add a warning in finish_fault() to catch this issue if other
> vma->vm_ops->fault() will support large folio allocation?
> 
> WARN_ON(nr_pages > 1 && userfaultfd_armed(vma));

That adds quite a subtle requirement to vm_ops::fault() though, which I guess is
implemented in a lot of places. It would be better if it could be handled
centrally - i.e. by checking that all the ptes are either none or a uffd marker?
I'm sure there would be corner cases to think about if taking that route.
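
For example, the central re-check could be something like the below (the
helper name is invented, and corner cases such as preserving the wp bit
when installing the new ptes are ignored here):

	/* Sketch only: accept ptes that are none or a uffd-wp marker. */
	static bool pte_range_none_or_uffd_wp(pte_t *pte, int nr_pages)
	{
		int i;

		for (i = 0; i < nr_pages; i++) {
			pte_t ptent = ptep_get(pte + i);

			if (!pte_none(ptent) && !pte_marker_uffd_wp(ptent))
				return false;
		}

		return true;
	}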
Baolin Wang April 25, 2024, 7:04 a.m. UTC | #7
On 2024/4/23 16:39, Lance Yang wrote:
> Hey Baolin,
> 
> [...]
> @@ -4727,9 +4725,11 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
> [...]
> 
> The mm counter updates have been moved out of set_pte_range(), so we may
> need to rebase against Kefeng's patch[1].

Yes, I did not rebase on the new mm-unstable branch, and I will do so 
for the next formal patch set. Thanks.

Patch

diff --git a/mm/memory.c b/mm/memory.c
index b6fa5146b260..094a76730776 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4766,7 +4766,10 @@  vm_fault_t finish_fault(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct page *page;
+	struct folio *folio;
 	vm_fault_t ret;
+	int nr_pages, i;
+	unsigned long addr;
 
 	/* Did we COW the page? */
 	if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
@@ -4797,22 +4800,30 @@  vm_fault_t finish_fault(struct vm_fault *vmf)
 			return VM_FAULT_OOM;
 	}
 
+	folio = page_folio(page);
+	nr_pages = folio_nr_pages(folio);
+	addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
 	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
-				      vmf->address, &vmf->ptl);
+				       addr, &vmf->ptl);
 	if (!vmf->pte)
 		return VM_FAULT_NOPAGE;
 
 	/* Re-check under ptl */
-	if (likely(!vmf_pte_changed(vmf))) {
-		struct folio *folio = page_folio(page);
-
-		set_pte_range(vmf, folio, page, 1, vmf->address);
-		ret = 0;
-	} else {
+	if (nr_pages == 1 && vmf_pte_changed(vmf)) {
 		update_mmu_tlb(vma, vmf->address, vmf->pte);
 		ret = VM_FAULT_NOPAGE;
+		goto unlock;
+	} else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
+		for (i = 0; i < nr_pages; i++)
+			update_mmu_tlb(vma, addr + PAGE_SIZE * i, vmf->pte + i);
+		ret = VM_FAULT_NOPAGE;
+		goto unlock;
 	}
 
+	set_pte_range(vmf, folio, &folio->page, nr_pages, addr);
+	ret = 0;
+
+unlock:
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
 	return ret;
 }