
[RFC,v3,3/4] mm: add do_set_pte_range()

Message ID 20230203131636.1648662-4-fengwei.yin@intel.com
State New
Series folio based filemap_map_pages()

Commit Message

Yin, Fengwei Feb. 3, 2023, 1:16 p.m. UTC
do_set_pte_range() allows setting up page table entries for a
specific range. It calls folio_add_file_rmap_range() to take
advantage of the batched rmap update for large folios.

Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
---
 include/linux/mm.h |  3 +++
 mm/filemap.c       |  1 -
 mm/memory.c        | 59 ++++++++++++++++++++++++++++++----------------
 3 files changed, 42 insertions(+), 21 deletions(-)
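
As an illustration of the intended batched use (a hypothetical caller
sketch, not code from this series; the actual filemap_map_pages()
conversion is expected in a later patch of the series):

/*
 * Hypothetical sketch only: map every page of a file-backed folio in
 * one call, so the rmap and the file mm counter are updated once for
 * all 'nr' pages instead of once per page. Assumes the caller holds
 * the PTE lock and that vmf->pte points at the PTE for 'addr'.
 */
static void map_whole_folio_sketch(struct vm_fault *vmf,
		struct folio *folio, unsigned long addr)
{
	unsigned int nr = folio_nr_pages(folio);

	do_set_pte_range(vmf, folio, addr, vmf->pte, 0, nr);
}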

Comments

David Hildenbrand Feb. 3, 2023, 1:25 p.m. UTC | #1
On 03.02.23 14:16, Yin Fengwei wrote:
> do_set_pte_range() allows to setup page table entries for a
> specific range. It calls folio_add_file_rmap_range() to take
> advantage of batched rmap update for large folio.
> 
> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
> ---
>   include/linux/mm.h |  3 +++
>   mm/filemap.c       |  1 -
>   mm/memory.c        | 59 ++++++++++++++++++++++++++++++----------------
>   3 files changed, 42 insertions(+), 21 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index d6f8f41514cc..93192f04b276 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1162,6 +1162,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
>   
>   vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
>   void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr);
> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
> +		unsigned long addr, pte_t *pte,
> +		unsigned long start, unsigned int nr);
>   
>   vm_fault_t finish_fault(struct vm_fault *vmf);
>   vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
> diff --git a/mm/filemap.c b/mm/filemap.c
> index f444684db9f2..74046a3a0ff5 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -3386,7 +3386,6 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
>   
>   		ref_count++;
>   		do_set_pte(vmf, page, addr);
> -		update_mmu_cache(vma, addr, vmf->pte);
>   	} while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages);
>   
>   	/* Restore the vmf->pte */
> diff --git a/mm/memory.c b/mm/memory.c
> index 7a04a1130ec1..3754b2ef166a 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4257,36 +4257,58 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
>   }
>   #endif
>   
> -void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
> +		unsigned long addr, pte_t *pte,
> +		unsigned long start, unsigned int nr)
>   {
>   	struct vm_area_struct *vma = vmf->vma;
>   	bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
>   	bool write = vmf->flags & FAULT_FLAG_WRITE;
> +	bool cow = write && !(vma->vm_flags & VM_SHARED);
>   	bool prefault = vmf->address != addr;
> +	struct page *page = folio_page(folio, start);
>   	pte_t entry;
>   
> -	flush_icache_page(vma, page);
> -	entry = mk_pte(page, vma->vm_page_prot);
> +	if (!cow) {
> +		folio_add_file_rmap_range(folio, start, nr, vma, false);
> +		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
> +	}
>   
> -	if (prefault && arch_wants_old_prefaulted_pte())
> -		entry = pte_mkold(entry);
> -	else
> -		entry = pte_sw_mkyoung(entry);
> +	do {
> +		flush_icache_page(vma, page);
> +		entry = mk_pte(page, vma->vm_page_prot);
>   
> -	if (write)
> -		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> -	if (unlikely(uffd_wp))
> -		entry = pte_mkuffd_wp(entry);
> -	/* copy-on-write page */
> -	if (write && !(vma->vm_flags & VM_SHARED)) {
> +		if (prefault && arch_wants_old_prefaulted_pte())
> +			entry = pte_mkold(entry);
> +		else
> +			entry = pte_sw_mkyoung(entry);
> +
> +		if (write)
> +			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> +		if (unlikely(uffd_wp))
> +			entry = pte_mkuffd_wp(entry);
> +		set_pte_at(vma->vm_mm, addr, pte, entry);
> +
> +		/* no need to invalidate: a not-present page won't be cached */
> +		update_mmu_cache(vma, addr, pte);
> +	} while (pte++, page++, addr += PAGE_SIZE, --nr > 0);
> +}
> +
> +void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
> +{
> +	struct folio *folio = page_folio(page);
> +	struct vm_area_struct *vma = vmf->vma;
> +	bool cow = (vmf->flags & FAULT_FLAG_WRITE) &&
> +			!(vma->vm_flags & VM_SHARED);
> +
> +	if (cow) {
>   		inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
>   		page_add_new_anon_rmap(page, vma, addr);

As raised, we cannot PTE-map a multi-page folio that way.

This function only supports single-page anon folios.

page_add_new_anon_rmap() -> folio_add_new_anon_rmap(). As that documents:

"If the folio is large, it is accounted as a THP" -- for example, we 
would only increment the "entire mapcount" and set the PageAnonExclusive 
bit only on the head page.

So this really doesn't work for multi-page folios and if the function 
would be used for that, we'd be in trouble.

We'd want some fence here to detect that and bail out if we'd be 
instructed to do that. At least a WARN_ON_ONCE() I guess.

Right now the function looks like it might just handle that.
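
For illustration, one way such a fence could look in do_set_pte() (a
sketch based on this discussion, not code posted in the thread):

void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
{
	struct folio *folio = page_folio(page);
	struct vm_area_struct *vma = vmf->vma;
	bool cow = (vmf->flags & FAULT_FLAG_WRITE) &&
			!(vma->vm_flags & VM_SHARED);

	if (cow) {
		/*
		 * page_add_new_anon_rmap() only handles single-page folios
		 * correctly here, so bail out loudly if a large folio is
		 * ever passed in.
		 */
		if (WARN_ON_ONCE(folio_test_large(folio)))
			return;
		inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
		page_add_new_anon_rmap(page, vma, addr);
		lru_cache_add_inactive_or_unevictable(page, vma);
	}

	do_set_pte_range(vmf, folio, addr, vmf->pte,
			folio_page_idx(folio, page), 1);
}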
Yin, Fengwei Feb. 3, 2023, 1:30 p.m. UTC | #2
On 2/3/2023 9:25 PM, David Hildenbrand wrote:
> On 03.02.23 14:16, Yin Fengwei wrote:
>> do_set_pte_range() allows to setup page table entries for a
>> specific range. It calls folio_add_file_rmap_range() to take
>> advantage of batched rmap update for large folio.
>>
>> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
>> ---
>>   include/linux/mm.h |  3 +++
>>   mm/filemap.c       |  1 -
>>   mm/memory.c        | 59 ++++++++++++++++++++++++++++++----------------
>>   3 files changed, 42 insertions(+), 21 deletions(-)
>>
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index d6f8f41514cc..93192f04b276 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -1162,6 +1162,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
>>     vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
>>   void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr);
>> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
>> +        unsigned long addr, pte_t *pte,
>> +        unsigned long start, unsigned int nr);
>>     vm_fault_t finish_fault(struct vm_fault *vmf);
>>   vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
>> diff --git a/mm/filemap.c b/mm/filemap.c
>> index f444684db9f2..74046a3a0ff5 100644
>> --- a/mm/filemap.c
>> +++ b/mm/filemap.c
>> @@ -3386,7 +3386,6 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
>>             ref_count++;
>>           do_set_pte(vmf, page, addr);
>> -        update_mmu_cache(vma, addr, vmf->pte);
>>       } while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages);
>>         /* Restore the vmf->pte */
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 7a04a1130ec1..3754b2ef166a 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -4257,36 +4257,58 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
>>   }
>>   #endif
>>   -void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
>> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
>> +        unsigned long addr, pte_t *pte,
>> +        unsigned long start, unsigned int nr)
>>   {
>>       struct vm_area_struct *vma = vmf->vma;
>>       bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
>>       bool write = vmf->flags & FAULT_FLAG_WRITE;
>> +    bool cow = write && !(vma->vm_flags & VM_SHARED);
>>       bool prefault = vmf->address != addr;
>> +    struct page *page = folio_page(folio, start);
>>       pte_t entry;
>>   -    flush_icache_page(vma, page);
>> -    entry = mk_pte(page, vma->vm_page_prot);
>> +    if (!cow) {
>> +        folio_add_file_rmap_range(folio, start, nr, vma, false);
>> +        add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
>> +    }
>>   -    if (prefault && arch_wants_old_prefaulted_pte())
>> -        entry = pte_mkold(entry);
>> -    else
>> -        entry = pte_sw_mkyoung(entry);
>> +    do {
>> +        flush_icache_page(vma, page);
>> +        entry = mk_pte(page, vma->vm_page_prot);
>>   -    if (write)
>> -        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>> -    if (unlikely(uffd_wp))
>> -        entry = pte_mkuffd_wp(entry);
>> -    /* copy-on-write page */
>> -    if (write && !(vma->vm_flags & VM_SHARED)) {
>> +        if (prefault && arch_wants_old_prefaulted_pte())
>> +            entry = pte_mkold(entry);
>> +        else
>> +            entry = pte_sw_mkyoung(entry);
>> +
>> +        if (write)
>> +            entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>> +        if (unlikely(uffd_wp))
>> +            entry = pte_mkuffd_wp(entry);
>> +        set_pte_at(vma->vm_mm, addr, pte, entry);
>> +
>> +        /* no need to invalidate: a not-present page won't be cached */
>> +        update_mmu_cache(vma, addr, pte);
>> +    } while (pte++, page++, addr += PAGE_SIZE, --nr > 0);
>> +}
>> +
>> +void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
>> +{
>> +    struct folio *folio = page_folio(page);
>> +    struct vm_area_struct *vma = vmf->vma;
>> +    bool cow = (vmf->flags & FAULT_FLAG_WRITE) &&
>> +            !(vma->vm_flags & VM_SHARED);
>> +
>> +    if (cow) {
>>           inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
>>           page_add_new_anon_rmap(page, vma, addr);
> 
> As raised, we cannot PTE-map a multi-page folio that way.
> 
> This function only supports single-page anon folios.
> 
> page_add_new_anon_rmap() -> folio_add_new_anon_rmap(). As that documents:
> 
> "If the folio is large, it is accounted as a THP" -- for example, we would only increment the "entire mapcount" and set the PageAnonExclusive bit only on the head page.
> 
> So this really doesn't work for multi-page folios and if the function would be used for that, we'd be in trouble.
> 
> We'd want some fence here to detect that and bail out if we'd be instructed to do that. At least a WARN_ON_ONCE() I guess.
> update_mmu_tlb(vma, vmf->address, vmf->pte);
> 
> Right now the function looks like it might just handle that.
You are right. I thought moving the cow case out of it would make that
explicit, but it looks like it doesn't. I will add a WARN_ON_ONCE(). Thanks.


Regards
Yin, Fengwei

>
Chih-En Lin Feb. 3, 2023, 1:32 p.m. UTC | #3
On Fri, Feb 03, 2023 at 09:16:35PM +0800, Yin Fengwei wrote:
> do_set_pte_range() allows to setup page table entries for a
> specific range. It calls folio_add_file_rmap_range() to take
> advantage of batched rmap update for large folio.
> 
> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
> ---
>  include/linux/mm.h |  3 +++
>  mm/filemap.c       |  1 -
>  mm/memory.c        | 59 ++++++++++++++++++++++++++++++----------------
>  3 files changed, 42 insertions(+), 21 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index d6f8f41514cc..93192f04b276 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1162,6 +1162,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
>  
>  vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
>  void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr);
> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
> +		unsigned long addr, pte_t *pte,
> +		unsigned long start, unsigned int nr);
>  
>  vm_fault_t finish_fault(struct vm_fault *vmf);
>  vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
> diff --git a/mm/filemap.c b/mm/filemap.c
> index f444684db9f2..74046a3a0ff5 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -3386,7 +3386,6 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
>  
>  		ref_count++;
>  		do_set_pte(vmf, page, addr);
> -		update_mmu_cache(vma, addr, vmf->pte);
>  	} while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages);
>  
>  	/* Restore the vmf->pte */
> diff --git a/mm/memory.c b/mm/memory.c
> index 7a04a1130ec1..3754b2ef166a 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4257,36 +4257,58 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
>  }
>  #endif
>  
> -void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
> +		unsigned long addr, pte_t *pte,
> +		unsigned long start, unsigned int nr)
>  {
>  	struct vm_area_struct *vma = vmf->vma;
>  	bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
>  	bool write = vmf->flags & FAULT_FLAG_WRITE;
> +	bool cow = write && !(vma->vm_flags & VM_SHARED);

Why not use is_cow_mapping()?
Is there anything that gets messed up with VM_MAYWRITE?

>  	bool prefault = vmf->address != addr;
> +	struct page *page = folio_page(folio, start);
>  	pte_t entry;
>  
> -	flush_icache_page(vma, page);
> -	entry = mk_pte(page, vma->vm_page_prot);
> +	if (!cow) {
> +		folio_add_file_rmap_range(folio, start, nr, vma, false);
> +		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
> +	}
>  
> -	if (prefault && arch_wants_old_prefaulted_pte())
> -		entry = pte_mkold(entry);
> -	else
> -		entry = pte_sw_mkyoung(entry);
> +	do {
> +		flush_icache_page(vma, page);
> +		entry = mk_pte(page, vma->vm_page_prot);
>  
> -	if (write)
> -		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> -	if (unlikely(uffd_wp))
> -		entry = pte_mkuffd_wp(entry);
> -	/* copy-on-write page */
> -	if (write && !(vma->vm_flags & VM_SHARED)) {
> +		if (prefault && arch_wants_old_prefaulted_pte())
> +			entry = pte_mkold(entry);
> +		else
> +			entry = pte_sw_mkyoung(entry);
> +
> +		if (write)
> +			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> +		if (unlikely(uffd_wp))
> +			entry = pte_mkuffd_wp(entry);
> +		set_pte_at(vma->vm_mm, addr, pte, entry);
> +
> +		/* no need to invalidate: a not-present page won't be cached */
> +		update_mmu_cache(vma, addr, pte);
> +	} while (pte++, page++, addr += PAGE_SIZE, --nr > 0);
> +}
> +
> +void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
> +{
> +	struct folio *folio = page_folio(page);
> +	struct vm_area_struct *vma = vmf->vma;
> +	bool cow = (vmf->flags & FAULT_FLAG_WRITE) &&
> +			!(vma->vm_flags & VM_SHARED);

Here too.

> +
> +	if (cow) {
>  		inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
>  		page_add_new_anon_rmap(page, vma, addr);
>  		lru_cache_add_inactive_or_unevictable(page, vma);
> -	} else {
> -		inc_mm_counter(vma->vm_mm, mm_counter_file(page));
> -		page_add_file_rmap(page, vma, false);
>  	}
> -	set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
> +
> +	do_set_pte_range(vmf, folio, addr, vmf->pte,
> +			folio_page_idx(folio, page), 1);
>  }
>  
>  static bool vmf_pte_changed(struct vm_fault *vmf)
> @@ -4361,9 +4383,6 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>  	if (likely(!vmf_pte_changed(vmf))) {
>  		do_set_pte(vmf, page, vmf->address);
>  
> -		/* no need to invalidate: a not-present page won't be cached */
> -		update_mmu_cache(vma, vmf->address, vmf->pte);
> -
>  		ret = 0;
>  	} else {
>  		update_mmu_tlb(vma, vmf->address, vmf->pte);
> -- 
> 2.30.2
> 
> 

Thanks,
Chih-En Lin
David Hildenbrand Feb. 3, 2023, 1:34 p.m. UTC | #4
On 03.02.23 14:30, Yin, Fengwei wrote:
> 
> 
> On 2/3/2023 9:25 PM, David Hildenbrand wrote:
>> On 03.02.23 14:16, Yin Fengwei wrote:
>>> do_set_pte_range() allows to setup page table entries for a
>>> specific range. It calls folio_add_file_rmap_range() to take
>>> advantage of batched rmap update for large folio.
>>>
>>> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
>>> ---
>>>    include/linux/mm.h |  3 +++
>>>    mm/filemap.c       |  1 -
>>>    mm/memory.c        | 59 ++++++++++++++++++++++++++++++----------------
>>>    3 files changed, 42 insertions(+), 21 deletions(-)
>>>
>>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>>> index d6f8f41514cc..93192f04b276 100644
>>> --- a/include/linux/mm.h
>>> +++ b/include/linux/mm.h
>>> @@ -1162,6 +1162,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
>>>      vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
>>>    void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr);
>>> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
>>> +        unsigned long addr, pte_t *pte,
>>> +        unsigned long start, unsigned int nr);
>>>      vm_fault_t finish_fault(struct vm_fault *vmf);
>>>    vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
>>> diff --git a/mm/filemap.c b/mm/filemap.c
>>> index f444684db9f2..74046a3a0ff5 100644
>>> --- a/mm/filemap.c
>>> +++ b/mm/filemap.c
>>> @@ -3386,7 +3386,6 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
>>>              ref_count++;
>>>            do_set_pte(vmf, page, addr);
>>> -        update_mmu_cache(vma, addr, vmf->pte);
>>>        } while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages);
>>>          /* Restore the vmf->pte */
>>> diff --git a/mm/memory.c b/mm/memory.c
>>> index 7a04a1130ec1..3754b2ef166a 100644
>>> --- a/mm/memory.c
>>> +++ b/mm/memory.c
>>> @@ -4257,36 +4257,58 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
>>>    }
>>>    #endif
>>>    -void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
>>> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
>>> +        unsigned long addr, pte_t *pte,
>>> +        unsigned long start, unsigned int nr)
>>>    {
>>>        struct vm_area_struct *vma = vmf->vma;
>>>        bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
>>>        bool write = vmf->flags & FAULT_FLAG_WRITE;
>>> +    bool cow = write && !(vma->vm_flags & VM_SHARED);
>>>        bool prefault = vmf->address != addr;
>>> +    struct page *page = folio_page(folio, start);
>>>        pte_t entry;
>>>    -    flush_icache_page(vma, page);
>>> -    entry = mk_pte(page, vma->vm_page_prot);
>>> +    if (!cow) {
>>> +        folio_add_file_rmap_range(folio, start, nr, vma, false);
>>> +        add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
>>> +    }
>>>    -    if (prefault && arch_wants_old_prefaulted_pte())
>>> -        entry = pte_mkold(entry);
>>> -    else
>>> -        entry = pte_sw_mkyoung(entry);
>>> +    do {
>>> +        flush_icache_page(vma, page);
>>> +        entry = mk_pte(page, vma->vm_page_prot);
>>>    -    if (write)
>>> -        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>>> -    if (unlikely(uffd_wp))
>>> -        entry = pte_mkuffd_wp(entry);
>>> -    /* copy-on-write page */
>>> -    if (write && !(vma->vm_flags & VM_SHARED)) {
>>> +        if (prefault && arch_wants_old_prefaulted_pte())
>>> +            entry = pte_mkold(entry);
>>> +        else
>>> +            entry = pte_sw_mkyoung(entry);
>>> +
>>> +        if (write)
>>> +            entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>>> +        if (unlikely(uffd_wp))
>>> +            entry = pte_mkuffd_wp(entry);
>>> +        set_pte_at(vma->vm_mm, addr, pte, entry);
>>> +
>>> +        /* no need to invalidate: a not-present page won't be cached */
>>> +        update_mmu_cache(vma, addr, pte);
>>> +    } while (pte++, page++, addr += PAGE_SIZE, --nr > 0);
>>> +}
>>> +
>>> +void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
>>> +{
>>> +    struct folio *folio = page_folio(page);
>>> +    struct vm_area_struct *vma = vmf->vma;
>>> +    bool cow = (vmf->flags & FAULT_FLAG_WRITE) &&
>>> +            !(vma->vm_flags & VM_SHARED);
>>> +
>>> +    if (cow) {
>>>            inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
>>>            page_add_new_anon_rmap(page, vma, addr);
>>
>> As raised, we cannot PTE-map a multi-page folio that way.
>>
>> This function only supports single-page anon folios.
>>
>> page_add_new_anon_rmap() -> folio_add_new_anon_rmap(). As that documents:
>>
>> "If the folio is large, it is accounted as a THP" -- for example, we would only increment the "entire mapcount" and set the PageAnonExclusive bit only on the head page.
>>
>> So this really doesn't work for multi-page folios and if the function would be used for that, we'd be in trouble.
>>
>> We'd want some fence here to detect that and bail out if we'd be instructed to do that. At least a WARN_ON_ONCE() I guess.
>> update_mmu_tlb(vma, vmf->address, vmf->pte);
>>
>> Right now the function looks like it might just handle that.
> You are right. I thought moving cow case out of it can make it explicit.
> But looks like it doesn't. I will add WARN_ON_ONCE(). Thanks.

I guess I would move the cow check into do_set_pte_range() as well, and
verify in there that we are really only dealing with a single-page
folio, commenting that the rmap code would need serious adjustment to
make it work and that the current code never passes a multi-page folio.

Thanks!
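
A sketch of that shape (an assumption of how it could look, not code
posted in this thread): do_set_pte_range() takes over the COW branch and
refuses large anon folios, leaving the per-PTE loop from the patch
unchanged.

void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
		unsigned long addr, pte_t *pte,
		unsigned long start, unsigned int nr)
{
	struct vm_area_struct *vma = vmf->vma;
	bool write = vmf->flags & FAULT_FLAG_WRITE;
	bool cow = write && !(vma->vm_flags & VM_SHARED);
	struct page *page = folio_page(folio, start);

	if (cow) {
		/*
		 * The anon rmap code would need serious adjustment to
		 * PTE-map a multi-page folio here; current callers never
		 * pass one, so enforce that.
		 */
		if (WARN_ON_ONCE(nr != 1 || folio_test_large(folio)))
			return;
		inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
		page_add_new_anon_rmap(page, vma, addr);
		lru_cache_add_inactive_or_unevictable(page, vma);
	} else {
		folio_add_file_rmap_range(folio, start, nr, vma, false);
		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
	}

	/* ... per-PTE loop from the patch, unchanged ... */
}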
Yin, Fengwei Feb. 3, 2023, 1:38 p.m. UTC | #5
On 2/3/2023 9:32 PM, Chih-En Lin wrote:
> On Fri, Feb 03, 2023 at 09:16:35PM +0800, Yin Fengwei wrote:
>> do_set_pte_range() allows to setup page table entries for a
>> specific range. It calls folio_add_file_rmap_range() to take
>> advantage of batched rmap update for large folio.
>>
>> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
>> ---
>>  include/linux/mm.h |  3 +++
>>  mm/filemap.c       |  1 -
>>  mm/memory.c        | 59 ++++++++++++++++++++++++++++++----------------
>>  3 files changed, 42 insertions(+), 21 deletions(-)
>>
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index d6f8f41514cc..93192f04b276 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -1162,6 +1162,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
>>  
>>  vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
>>  void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr);
>> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
>> +		unsigned long addr, pte_t *pte,
>> +		unsigned long start, unsigned int nr);
>>  
>>  vm_fault_t finish_fault(struct vm_fault *vmf);
>>  vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
>> diff --git a/mm/filemap.c b/mm/filemap.c
>> index f444684db9f2..74046a3a0ff5 100644
>> --- a/mm/filemap.c
>> +++ b/mm/filemap.c
>> @@ -3386,7 +3386,6 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
>>  
>>  		ref_count++;
>>  		do_set_pte(vmf, page, addr);
>> -		update_mmu_cache(vma, addr, vmf->pte);
>>  	} while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages);
>>  
>>  	/* Restore the vmf->pte */
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 7a04a1130ec1..3754b2ef166a 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -4257,36 +4257,58 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
>>  }
>>  #endif
>>  
>> -void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
>> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
>> +		unsigned long addr, pte_t *pte,
>> +		unsigned long start, unsigned int nr)
>>  {
>>  	struct vm_area_struct *vma = vmf->vma;
>>  	bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
>>  	bool write = vmf->flags & FAULT_FLAG_WRITE;
>> +	bool cow = write && !(vma->vm_flags & VM_SHARED);
> 
> Why don't use is_cow_mapping()?
> Is there anything messed up with VM_MAYWRITE?
My understanding is that it's not related to the mapping; it's related to
which operation triggers the fault here. Say it's a writable mapping: if a
read operation triggers the fault, no cow or maybe_mkwrite() is needed
here. Thanks.


Regards
Yin, Fengwei

> 
>>  	bool prefault = vmf->address != addr;
>> +	struct page *page = folio_page(folio, start);
>>  	pte_t entry;
>>  
>> -	flush_icache_page(vma, page);
>> -	entry = mk_pte(page, vma->vm_page_prot);
>> +	if (!cow) {
>> +		folio_add_file_rmap_range(folio, start, nr, vma, false);
>> +		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
>> +	}
>>  
>> -	if (prefault && arch_wants_old_prefaulted_pte())
>> -		entry = pte_mkold(entry);
>> -	else
>> -		entry = pte_sw_mkyoung(entry);
>> +	do {
>> +		flush_icache_page(vma, page);
>> +		entry = mk_pte(page, vma->vm_page_prot);
>>  
>> -	if (write)
>> -		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>> -	if (unlikely(uffd_wp))
>> -		entry = pte_mkuffd_wp(entry);
>> -	/* copy-on-write page */
>> -	if (write && !(vma->vm_flags & VM_SHARED)) {
>> +		if (prefault && arch_wants_old_prefaulted_pte())
>> +			entry = pte_mkold(entry);
>> +		else
>> +			entry = pte_sw_mkyoung(entry);
>> +
>> +		if (write)
>> +			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>> +		if (unlikely(uffd_wp))
>> +			entry = pte_mkuffd_wp(entry);
>> +		set_pte_at(vma->vm_mm, addr, pte, entry);
>> +
>> +		/* no need to invalidate: a not-present page won't be cached */
>> +		update_mmu_cache(vma, addr, pte);
>> +	} while (pte++, page++, addr += PAGE_SIZE, --nr > 0);
>> +}
>> +
>> +void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
>> +{
>> +	struct folio *folio = page_folio(page);
>> +	struct vm_area_struct *vma = vmf->vma;
>> +	bool cow = (vmf->flags & FAULT_FLAG_WRITE) &&
>> +			!(vma->vm_flags & VM_SHARED);
> 
> Here too.
> 
>> +
>> +	if (cow) {
>>  		inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
>>  		page_add_new_anon_rmap(page, vma, addr);
>>  		lru_cache_add_inactive_or_unevictable(page, vma);
>> -	} else {
>> -		inc_mm_counter(vma->vm_mm, mm_counter_file(page));
>> -		page_add_file_rmap(page, vma, false);
>>  	}
>> -	set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
>> +
>> +	do_set_pte_range(vmf, folio, addr, vmf->pte,
>> +			folio_page_idx(folio, page), 1);
>>  }
>>  
>>  static bool vmf_pte_changed(struct vm_fault *vmf)
>> @@ -4361,9 +4383,6 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
>>  	if (likely(!vmf_pte_changed(vmf))) {
>>  		do_set_pte(vmf, page, vmf->address);
>>  
>> -		/* no need to invalidate: a not-present page won't be cached */
>> -		update_mmu_cache(vma, vmf->address, vmf->pte);
>> -
>>  		ret = 0;
>>  	} else {
>>  		update_mmu_tlb(vma, vmf->address, vmf->pte);
>> -- 
>> 2.30.2
>>
>>
> 
> Thanks,
> Chih-En Lin
>
Yin, Fengwei Feb. 3, 2023, 1:39 p.m. UTC | #6
On 2/3/2023 9:34 PM, David Hildenbrand wrote:
> On 03.02.23 14:30, Yin, Fengwei wrote:
>>
>>
>> On 2/3/2023 9:25 PM, David Hildenbrand wrote:
>>> On 03.02.23 14:16, Yin Fengwei wrote:
>>>> do_set_pte_range() allows to setup page table entries for a
>>>> specific range. It calls folio_add_file_rmap_range() to take
>>>> advantage of batched rmap update for large folio.
>>>>
>>>> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
>>>> ---
>>>>    include/linux/mm.h |  3 +++
>>>>    mm/filemap.c       |  1 -
>>>>    mm/memory.c        | 59 ++++++++++++++++++++++++++++++----------------
>>>>    3 files changed, 42 insertions(+), 21 deletions(-)
>>>>
>>>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>>>> index d6f8f41514cc..93192f04b276 100644
>>>> --- a/include/linux/mm.h
>>>> +++ b/include/linux/mm.h
>>>> @@ -1162,6 +1162,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
>>>>      vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
>>>>    void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr);
>>>> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
>>>> +        unsigned long addr, pte_t *pte,
>>>> +        unsigned long start, unsigned int nr);
>>>>      vm_fault_t finish_fault(struct vm_fault *vmf);
>>>>    vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
>>>> diff --git a/mm/filemap.c b/mm/filemap.c
>>>> index f444684db9f2..74046a3a0ff5 100644
>>>> --- a/mm/filemap.c
>>>> +++ b/mm/filemap.c
>>>> @@ -3386,7 +3386,6 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
>>>>              ref_count++;
>>>>            do_set_pte(vmf, page, addr);
>>>> -        update_mmu_cache(vma, addr, vmf->pte);
>>>>        } while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages);
>>>>          /* Restore the vmf->pte */
>>>> diff --git a/mm/memory.c b/mm/memory.c
>>>> index 7a04a1130ec1..3754b2ef166a 100644
>>>> --- a/mm/memory.c
>>>> +++ b/mm/memory.c
>>>> @@ -4257,36 +4257,58 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
>>>>    }
>>>>    #endif
>>>>    -void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
>>>> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
>>>> +        unsigned long addr, pte_t *pte,
>>>> +        unsigned long start, unsigned int nr)
>>>>    {
>>>>        struct vm_area_struct *vma = vmf->vma;
>>>>        bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
>>>>        bool write = vmf->flags & FAULT_FLAG_WRITE;
>>>> +    bool cow = write && !(vma->vm_flags & VM_SHARED);
>>>>        bool prefault = vmf->address != addr;
>>>> +    struct page *page = folio_page(folio, start);
>>>>        pte_t entry;
>>>>    -    flush_icache_page(vma, page);
>>>> -    entry = mk_pte(page, vma->vm_page_prot);
>>>> +    if (!cow) {
>>>> +        folio_add_file_rmap_range(folio, start, nr, vma, false);
>>>> +        add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
>>>> +    }
>>>>    -    if (prefault && arch_wants_old_prefaulted_pte())
>>>> -        entry = pte_mkold(entry);
>>>> -    else
>>>> -        entry = pte_sw_mkyoung(entry);
>>>> +    do {
>>>> +        flush_icache_page(vma, page);
>>>> +        entry = mk_pte(page, vma->vm_page_prot);
>>>>    -    if (write)
>>>> -        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>>>> -    if (unlikely(uffd_wp))
>>>> -        entry = pte_mkuffd_wp(entry);
>>>> -    /* copy-on-write page */
>>>> -    if (write && !(vma->vm_flags & VM_SHARED)) {
>>>> +        if (prefault && arch_wants_old_prefaulted_pte())
>>>> +            entry = pte_mkold(entry);
>>>> +        else
>>>> +            entry = pte_sw_mkyoung(entry);
>>>> +
>>>> +        if (write)
>>>> +            entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>>>> +        if (unlikely(uffd_wp))
>>>> +            entry = pte_mkuffd_wp(entry);
>>>> +        set_pte_at(vma->vm_mm, addr, pte, entry);
>>>> +
>>>> +        /* no need to invalidate: a not-present page won't be cached */
>>>> +        update_mmu_cache(vma, addr, pte);
>>>> +    } while (pte++, page++, addr += PAGE_SIZE, --nr > 0);
>>>> +}
>>>> +
>>>> +void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
>>>> +{
>>>> +    struct folio *folio = page_folio(page);
>>>> +    struct vm_area_struct *vma = vmf->vma;
>>>> +    bool cow = (vmf->flags & FAULT_FLAG_WRITE) &&
>>>> +            !(vma->vm_flags & VM_SHARED);
>>>> +
>>>> +    if (cow) {
>>>>            inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
>>>>            page_add_new_anon_rmap(page, vma, addr);
>>>
>>> As raised, we cannot PTE-map a multi-page folio that way.
>>>
>>> This function only supports single-page anon folios.
>>>
>>> page_add_new_anon_rmap() -> folio_add_new_anon_rmap(). As that documents:
>>>
>>> "If the folio is large, it is accounted as a THP" -- for example, we would only increment the "entire mapcount" and set the PageAnonExclusive bit only on the head page.
>>>
>>> So this really doesn't work for multi-page folios and if the function would be used for that, we'd be in trouble.
>>>
>>> We'd want some fence here to detect that and bail out if we'd be instructed to do that. At least a WARN_ON_ONCE() I guess.
>>> update_mmu_tlb(vma, vmf->address, vmf->pte);
>>>
>>> Right now the function looks like it might just handle that.
>> You are right. I thought moving cow case out of it can make it explicit.
>> But looks like it doesn't. I will add WARN_ON_ONCE(). Thanks.
> 
> I guess I would move the cow check into  do_set_pte_range() as well, and verify in there that we are really only dealing with a single-page folio, commenting that rmap code would need serious adjustment to make it work and that current code never passes a multi-page folio.

Yes. There is a !cow check in do_set_pte_range() already. I planned to add
a WARN_ON_ONCE() in the else case there. Thanks.


Regards
Yin, Fengwei

> 
> Thanks!
>
Chih-En Lin Feb. 3, 2023, 2:30 p.m. UTC | #7
On Fri, Feb 03, 2023 at 09:38:15PM +0800, Yin, Fengwei wrote:
> 
> 
> On 2/3/2023 9:32 PM, Chih-En Lin wrote:
> > On Fri, Feb 03, 2023 at 09:16:35PM +0800, Yin Fengwei wrote:
> >> do_set_pte_range() allows to setup page table entries for a
> >> specific range. It calls folio_add_file_rmap_range() to take
> >> advantage of batched rmap update for large folio.
> >>
> >> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
> >> ---
> >>  include/linux/mm.h |  3 +++
> >>  mm/filemap.c       |  1 -
> >>  mm/memory.c        | 59 ++++++++++++++++++++++++++++++----------------
> >>  3 files changed, 42 insertions(+), 21 deletions(-)
> >>
> >> diff --git a/include/linux/mm.h b/include/linux/mm.h
> >> index d6f8f41514cc..93192f04b276 100644
> >> --- a/include/linux/mm.h
> >> +++ b/include/linux/mm.h
> >> @@ -1162,6 +1162,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
> >>  
> >>  vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
> >>  void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr);
> >> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
> >> +		unsigned long addr, pte_t *pte,
> >> +		unsigned long start, unsigned int nr);
> >>  
> >>  vm_fault_t finish_fault(struct vm_fault *vmf);
> >>  vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
> >> diff --git a/mm/filemap.c b/mm/filemap.c
> >> index f444684db9f2..74046a3a0ff5 100644
> >> --- a/mm/filemap.c
> >> +++ b/mm/filemap.c
> >> @@ -3386,7 +3386,6 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
> >>  
> >>  		ref_count++;
> >>  		do_set_pte(vmf, page, addr);
> >> -		update_mmu_cache(vma, addr, vmf->pte);
> >>  	} while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages);
> >>  
> >>  	/* Restore the vmf->pte */
> >> diff --git a/mm/memory.c b/mm/memory.c
> >> index 7a04a1130ec1..3754b2ef166a 100644
> >> --- a/mm/memory.c
> >> +++ b/mm/memory.c
> >> @@ -4257,36 +4257,58 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
> >>  }
> >>  #endif
> >>  
> >> -void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
> >> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
> >> +		unsigned long addr, pte_t *pte,
> >> +		unsigned long start, unsigned int nr)
> >>  {
> >>  	struct vm_area_struct *vma = vmf->vma;
> >>  	bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
> >>  	bool write = vmf->flags & FAULT_FLAG_WRITE;
> >> +	bool cow = write && !(vma->vm_flags & VM_SHARED);
> > 
> > Why don't use is_cow_mapping()?
> > Is there anything messed up with VM_MAYWRITE?
> My understanding is it's not related with the mapping. It's related with
> what operation triggers fault here. Say, if it's a writable mapping, and
> if the read operation triggers fault here, no cow or maybe_mkwrite() needed
> here. Thanks.

Sorry, I didn't describe it properly.
The relationship with the write/read fault makes sense.
I'm just wondering whether "!(vma->vm_flags & VM_SHARED)" is enough to
determine a COW page. And I also found the same check in do_fault().

For example, copy_present_pte() uses is_cow_mapping() for the COW case and
"vm_flags & VM_SHARED" for the shared-mapping case.

So I looked up the commit that introduced is_cow_mapping(),
67121172f9753 ("Allow arbitrary read-only shared pfn-remapping too").

Here is the commit message:
"
    The VM layer (for historical reasons) turns a read-only shared mmap into
    a private-like mapping with the VM_MAYWRITE bit clear.  Thus checking
    just VM_SHARED isn't actually sufficient.
    
    So use a trivial helper function for the cases where we wanted to inquire
    if a mapping was COW-like or not.
"

Hmm, but that was back in v2.6.15.
So is "vm_flags & VM_SHARED" still enough to check for a COW page now?

Thanks,
Chih-En Lin
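
For reference, is_cow_mapping() is defined in include/linux/mm.h roughly
as follows (quoted here from memory of the then-current tree for
convenience; double-check the exact source):

static inline bool is_cow_mapping(vm_flags_t flags)
{
	/* COW-like: private mapping that could become writable */
	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}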
Yin, Fengwei Feb. 4, 2023, 5:47 a.m. UTC | #8
On 2/3/2023 10:30 PM, Chih-En Lin wrote:
> On Fri, Feb 03, 2023 at 09:38:15PM +0800, Yin, Fengwei wrote:
>>
>>
>> On 2/3/2023 9:32 PM, Chih-En Lin wrote:
>>> On Fri, Feb 03, 2023 at 09:16:35PM +0800, Yin Fengwei wrote:
>>>> do_set_pte_range() allows to setup page table entries for a
>>>> specific range. It calls folio_add_file_rmap_range() to take
>>>> advantage of batched rmap update for large folio.
>>>>
>>>> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
>>>> ---
>>>>  include/linux/mm.h |  3 +++
>>>>  mm/filemap.c       |  1 -
>>>>  mm/memory.c        | 59 ++++++++++++++++++++++++++++++----------------
>>>>  3 files changed, 42 insertions(+), 21 deletions(-)
>>>>
>>>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>>>> index d6f8f41514cc..93192f04b276 100644
>>>> --- a/include/linux/mm.h
>>>> +++ b/include/linux/mm.h
>>>> @@ -1162,6 +1162,9 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
>>>>  
>>>>  vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
>>>>  void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr);
>>>> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
>>>> +		unsigned long addr, pte_t *pte,
>>>> +		unsigned long start, unsigned int nr);
>>>>  
>>>>  vm_fault_t finish_fault(struct vm_fault *vmf);
>>>>  vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
>>>> diff --git a/mm/filemap.c b/mm/filemap.c
>>>> index f444684db9f2..74046a3a0ff5 100644
>>>> --- a/mm/filemap.c
>>>> +++ b/mm/filemap.c
>>>> @@ -3386,7 +3386,6 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
>>>>  
>>>>  		ref_count++;
>>>>  		do_set_pte(vmf, page, addr);
>>>> -		update_mmu_cache(vma, addr, vmf->pte);
>>>>  	} while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages);
>>>>  
>>>>  	/* Restore the vmf->pte */
>>>> diff --git a/mm/memory.c b/mm/memory.c
>>>> index 7a04a1130ec1..3754b2ef166a 100644
>>>> --- a/mm/memory.c
>>>> +++ b/mm/memory.c
>>>> @@ -4257,36 +4257,58 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
>>>>  }
>>>>  #endif
>>>>  
>>>> -void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
>>>> +void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
>>>> +		unsigned long addr, pte_t *pte,
>>>> +		unsigned long start, unsigned int nr)
>>>>  {
>>>>  	struct vm_area_struct *vma = vmf->vma;
>>>>  	bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
>>>>  	bool write = vmf->flags & FAULT_FLAG_WRITE;
>>>> +	bool cow = write && !(vma->vm_flags & VM_SHARED);
>>>
>>> Why don't use is_cow_mapping()?
>>> Is there anything messed up with VM_MAYWRITE?
>> My understanding is it's not related with the mapping. It's related with
>> what operation triggers fault here. Say, if it's a writable mapping, and
>> if the read operation triggers fault here, no cow or maybe_mkwrite() needed
>> here. Thanks.
> 
> Sorry, I didn't describe the thing properly.
> It makes sense for the relationship with the write/read fault.
> I'm just wondering if "!(vma->vm_flags & VM_SHARED)" is enough to determine
> the COW page? And, I also found it in do_fault().
> 
> Like, copy_present_pte() use is_cow_mapping() for COW page and
> "vm_flags & VM_SHARED" for shared mapping.
> 
> So, I looked up the commit that introduced the is_cow_mapping(),
> 67121172f9753 ("Allow arbitrary read-only shared pfn-remapping too").
> 
> Here is the commit message:
> "
>     The VM layer (for historical reasons) turns a read-only shared mmap into
>     a private-like mapping with the VM_MAYWRITE bit clear.  Thus checking
>     just VM_SHARED isn't actually sufficient.
>     
>     So use a trivial helper function for the cases where we wanted to inquire
>     if a mapping was COW-like or not.
> "
> 
> hmm, but it is v2.6.15.
> So is "vm_flags & VM_SHARED" enough to check the COW page now?
Thanks for the detailed info here. Yes, the VM_MAYWRITE bit of vma->vm_flags
needs to be checked for COW as well.

In the page fault path, the VM_MAYWRITE bit is already checked in sanitize_fault_flags():
                /* Write faults on read-only mappings are impossible ... */
                if (WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)))
                        return VM_FAULT_SIGSEGV;

so the fault bails out early if it's a write fault and VM_MAYWRITE is not set.

My understanding is that sanitize_fault_flags() is always called before we
hit do_set_pte()/do_set_pte_range().


Regards
Yin, Fengwei

> 
> Thanks,
> Chih-En Lin
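
To restate the conclusion of this sub-thread in code form (a hypothetical
helper, not something proposed in the series): by the time the fault
reaches do_set_pte()/do_set_pte_range(), sanitize_fault_flags() has
already rejected write faults on mappings without VM_MAYWRITE, so
checking VM_SHARED alone is enough to identify the COW case.

/* Hypothetical helper, for illustration only. */
static bool fault_needs_cow(struct vm_fault *vmf)
{
	/*
	 * Write faults on !VM_MAYWRITE mappings were already rejected in
	 * sanitize_fault_flags(), so VM_SHARED is the only bit left to
	 * check at this point.
	 */
	return (vmf->flags & FAULT_FLAG_WRITE) &&
	       !(vmf->vma->vm_flags & VM_SHARED);
}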

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d6f8f41514cc..93192f04b276 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1162,6 +1162,9 @@  static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 
 vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
 void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr);
+void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
+		unsigned long addr, pte_t *pte,
+		unsigned long start, unsigned int nr);
 
 vm_fault_t finish_fault(struct vm_fault *vmf);
 vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
diff --git a/mm/filemap.c b/mm/filemap.c
index f444684db9f2..74046a3a0ff5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3386,7 +3386,6 @@  static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
 
 		ref_count++;
 		do_set_pte(vmf, page, addr);
-		update_mmu_cache(vma, addr, vmf->pte);
 	} while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages);
 
 	/* Restore the vmf->pte */
diff --git a/mm/memory.c b/mm/memory.c
index 7a04a1130ec1..3754b2ef166a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4257,36 +4257,58 @@  vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
 }
 #endif
 
-void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
+void do_set_pte_range(struct vm_fault *vmf, struct folio *folio,
+		unsigned long addr, pte_t *pte,
+		unsigned long start, unsigned int nr)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	bool uffd_wp = pte_marker_uffd_wp(vmf->orig_pte);
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
+	bool cow = write && !(vma->vm_flags & VM_SHARED);
 	bool prefault = vmf->address != addr;
+	struct page *page = folio_page(folio, start);
 	pte_t entry;
 
-	flush_icache_page(vma, page);
-	entry = mk_pte(page, vma->vm_page_prot);
+	if (!cow) {
+		folio_add_file_rmap_range(folio, start, nr, vma, false);
+		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
+	}
 
-	if (prefault && arch_wants_old_prefaulted_pte())
-		entry = pte_mkold(entry);
-	else
-		entry = pte_sw_mkyoung(entry);
+	do {
+		flush_icache_page(vma, page);
+		entry = mk_pte(page, vma->vm_page_prot);
 
-	if (write)
-		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-	if (unlikely(uffd_wp))
-		entry = pte_mkuffd_wp(entry);
-	/* copy-on-write page */
-	if (write && !(vma->vm_flags & VM_SHARED)) {
+		if (prefault && arch_wants_old_prefaulted_pte())
+			entry = pte_mkold(entry);
+		else
+			entry = pte_sw_mkyoung(entry);
+
+		if (write)
+			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		if (unlikely(uffd_wp))
+			entry = pte_mkuffd_wp(entry);
+		set_pte_at(vma->vm_mm, addr, pte, entry);
+
+		/* no need to invalidate: a not-present page won't be cached */
+		update_mmu_cache(vma, addr, pte);
+	} while (pte++, page++, addr += PAGE_SIZE, --nr > 0);
+}
+
+void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
+{
+	struct folio *folio = page_folio(page);
+	struct vm_area_struct *vma = vmf->vma;
+	bool cow = (vmf->flags & FAULT_FLAG_WRITE) &&
+			!(vma->vm_flags & VM_SHARED);
+
+	if (cow) {
 		inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
 		page_add_new_anon_rmap(page, vma, addr);
 		lru_cache_add_inactive_or_unevictable(page, vma);
-	} else {
-		inc_mm_counter(vma->vm_mm, mm_counter_file(page));
-		page_add_file_rmap(page, vma, false);
 	}
-	set_pte_at(vma->vm_mm, addr, vmf->pte, entry);
+
+	do_set_pte_range(vmf, folio, addr, vmf->pte,
+			folio_page_idx(folio, page), 1);
 }
 
 static bool vmf_pte_changed(struct vm_fault *vmf)
@@ -4361,9 +4383,6 @@  vm_fault_t finish_fault(struct vm_fault *vmf)
 	if (likely(!vmf_pte_changed(vmf))) {
 		do_set_pte(vmf, page, vmf->address);
 
-		/* no need to invalidate: a not-present page won't be cached */
-		update_mmu_cache(vma, vmf->address, vmf->pte);
-
 		ret = 0;
 	} else {
 		update_mmu_tlb(vma, vmf->address, vmf->pte);