
[v4,34/36] rmap: add folio_add_file_rmap_range()

Message ID 20230315051444.3229621-35-willy@infradead.org (mailing list archive)
State New
Series New page table range API

Commit Message

Matthew Wilcox March 15, 2023, 5:14 a.m. UTC
From: Yin Fengwei <fengwei.yin@intel.com>

folio_add_file_rmap_range() allows adding pte mappings to a specific
range of a file folio. Compared to page_add_file_rmap(), it batches the
__lruvec_stat updates for large folios.

Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/rmap.h |  2 ++
 mm/rmap.c            | 60 +++++++++++++++++++++++++++++++++-----------
 2 files changed, 48 insertions(+), 14 deletions(-)
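
For context, a minimal sketch of how a caller can use the new helper to map a
whole range of a file folio with a single rmap call (illustrative only, not code
from this series; the function name and index parameter are assumptions):

/*
 * Illustrative sketch: batch-map @nr pages of a file folio starting at
 * folio index @start_idx.  The pte lock is assumed to be held, as the
 * kerneldoc for folio_add_file_rmap_range() requires.
 */
static void example_map_folio_range(struct folio *folio,
		struct vm_area_struct *vma, unsigned long start_idx,
		unsigned int nr)
{
	struct page *page = folio_page(folio, start_idx);

	/* One call covers the whole range; __lruvec_stat is updated once. */
	folio_add_file_rmap_range(folio, page, nr, vma, false);
}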

Comments

Ryan Roberts March 15, 2023, 1:34 p.m. UTC | #1
On 15/03/2023 05:14, Matthew Wilcox (Oracle) wrote:
> From: Yin Fengwei <fengwei.yin@intel.com>
> 
> folio_add_file_rmap_range() allows to add pte mapping to a specific
> range of file folio. Comparing to page_add_file_rmap(), it batched
> updates __lruvec_stat for large folio.
> 
> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  include/linux/rmap.h |  2 ++
>  mm/rmap.c            | 60 +++++++++++++++++++++++++++++++++-----------
>  2 files changed, 48 insertions(+), 14 deletions(-)
> 
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index b87d01660412..a3825ce81102 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -198,6 +198,8 @@ void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
>  		unsigned long address);
>  void page_add_file_rmap(struct page *, struct vm_area_struct *,
>  		bool compound);
> +void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr,
> +		struct vm_area_struct *, bool compound);
>  void page_remove_rmap(struct page *, struct vm_area_struct *,
>  		bool compound);
>  
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 4898e10c569a..a91906b28835 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1301,31 +1301,39 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
>  }
>  
>  /**
> - * page_add_file_rmap - add pte mapping to a file page
> - * @page:	the page to add the mapping to
> + * folio_add_file_rmap_range - add pte mapping to page range of a folio
> + * @folio:	The folio to add the mapping to
> + * @page:	The first page to add
> + * @nr_pages:	The number of pages which will be mapped
>   * @vma:	the vm area in which the mapping is added
>   * @compound:	charge the page as compound or small page
>   *
> + * The page range of folio is defined by [first_page, first_page + nr_pages)
> + *
>   * The caller needs to hold the pte lock.
>   */
> -void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
> -		bool compound)
> +void folio_add_file_rmap_range(struct folio *folio, struct page *page,
> +			unsigned int nr_pages, struct vm_area_struct *vma,
> +			bool compound)
>  {
> -	struct folio *folio = page_folio(page);
>  	atomic_t *mapped = &folio->_nr_pages_mapped;
> -	int nr = 0, nr_pmdmapped = 0;
> -	bool first;
> +	unsigned int nr_pmdmapped = 0, first;
> +	int nr = 0;
>  
> -	VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
> +	VM_WARN_ON_FOLIO(compound && !folio_test_pmd_mappable(folio), folio);
>  
>  	/* Is page being mapped by PTE? Is this its first map to be added? */
>  	if (likely(!compound)) {
> -		first = atomic_inc_and_test(&page->_mapcount);
> -		nr = first;
> -		if (first && folio_test_large(folio)) {
> -			nr = atomic_inc_return_relaxed(mapped);
> -			nr = (nr < COMPOUND_MAPPED);
> -		}
> +		do {
> +			first = atomic_inc_and_test(&page->_mapcount);
> +			if (first && folio_test_large(folio)) {
> +				first = atomic_inc_return_relaxed(mapped);
> +				first = (nr < COMPOUND_MAPPED);

This still contains the typo that Yin Fengwei spotted in the previous version:
https://lore.kernel.org/linux-mm/20230228213738.272178-1-willy@infradead.org/T/#m84673899e25bc31356093a1177941f2cc35e5da8
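
Presumably the intended comparison there (matching what the pre-loop
page_add_file_rmap() logic did) is against the value just read back from the
mapped counter rather than the nr accumulator, i.e. something like:

				first = atomic_inc_return_relaxed(mapped);
				first = (first < COMPOUND_MAPPED);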

FYI, I'm seeing a perf regression of about 1% when compiling the kernel on
Ampere Altra (arm64) with this whole series on top of v6.3-rc1 (In a VM using
ext4 filesystem). Looks like instruction aborts are taking much longer and a
selection of syscalls are a bit slower. Still hunting down the root cause. Will
report once I have conclusive diagnosis.

Thanks,
Ryan


> +			}
> +
> +			if (first)
> +				nr++;
> +		} while (page++, --nr_pages > 0);
>  	} else if (folio_test_pmd_mappable(folio)) {
>  		/* That test is redundant: it's for safety or to optimize out */
>  
> @@ -1354,6 +1362,30 @@ void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
>  	mlock_vma_folio(folio, vma, compound);
>  }
>  
> +/**
> + * page_add_file_rmap - add pte mapping to a file page
> + * @page:	the page to add the mapping to
> + * @vma:	the vm area in which the mapping is added
> + * @compound:	charge the page as compound or small page
> + *
> + * The caller needs to hold the pte lock.
> + */
> +void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
> +		bool compound)
> +{
> +	struct folio *folio = page_folio(page);
> +	unsigned int nr_pages;
> +
> +	VM_WARN_ON_ONCE_PAGE(compound && !PageTransHuge(page), page);
> +
> +	if (likely(!compound))
> +		nr_pages = 1;
> +	else
> +		nr_pages = folio_nr_pages(folio);
> +
> +	folio_add_file_rmap_range(folio, page, nr_pages, vma, compound);
> +}
> +
>  /**
>   * page_remove_rmap - take down pte mapping from a page
>   * @page:	page to remove mapping from
Ryan Roberts March 15, 2023, 4:08 p.m. UTC | #2
On 15/03/2023 13:34, Ryan Roberts wrote:
> On 15/03/2023 05:14, Matthew Wilcox (Oracle) wrote:
>> From: Yin Fengwei <fengwei.yin@intel.com>
>>
>> folio_add_file_rmap_range() allows to add pte mapping to a specific
>> range of file folio. Comparing to page_add_file_rmap(), it batched
>> updates __lruvec_stat for large folio.
>>
>> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
>> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
>> ---
>>  include/linux/rmap.h |  2 ++
>>  mm/rmap.c            | 60 +++++++++++++++++++++++++++++++++-----------
>>  2 files changed, 48 insertions(+), 14 deletions(-)
>>
>> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
>> index b87d01660412..a3825ce81102 100644
>> --- a/include/linux/rmap.h
>> +++ b/include/linux/rmap.h
>> @@ -198,6 +198,8 @@ void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
>>  		unsigned long address);
>>  void page_add_file_rmap(struct page *, struct vm_area_struct *,
>>  		bool compound);
>> +void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr,
>> +		struct vm_area_struct *, bool compound);
>>  void page_remove_rmap(struct page *, struct vm_area_struct *,
>>  		bool compound);
>>  
>> diff --git a/mm/rmap.c b/mm/rmap.c
>> index 4898e10c569a..a91906b28835 100644
>> --- a/mm/rmap.c
>> +++ b/mm/rmap.c
>> @@ -1301,31 +1301,39 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
>>  }
>>  
>>  /**
>> - * page_add_file_rmap - add pte mapping to a file page
>> - * @page:	the page to add the mapping to
>> + * folio_add_file_rmap_range - add pte mapping to page range of a folio
>> + * @folio:	The folio to add the mapping to
>> + * @page:	The first page to add
>> + * @nr_pages:	The number of pages which will be mapped
>>   * @vma:	the vm area in which the mapping is added
>>   * @compound:	charge the page as compound or small page
>>   *
>> + * The page range of folio is defined by [first_page, first_page + nr_pages)
>> + *
>>   * The caller needs to hold the pte lock.
>>   */
>> -void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
>> -		bool compound)
>> +void folio_add_file_rmap_range(struct folio *folio, struct page *page,
>> +			unsigned int nr_pages, struct vm_area_struct *vma,
>> +			bool compound)
>>  {
>> -	struct folio *folio = page_folio(page);
>>  	atomic_t *mapped = &folio->_nr_pages_mapped;
>> -	int nr = 0, nr_pmdmapped = 0;
>> -	bool first;
>> +	unsigned int nr_pmdmapped = 0, first;
>> +	int nr = 0;
>>  
>> -	VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
>> +	VM_WARN_ON_FOLIO(compound && !folio_test_pmd_mappable(folio), folio);
>>  
>>  	/* Is page being mapped by PTE? Is this its first map to be added? */
>>  	if (likely(!compound)) {
>> -		first = atomic_inc_and_test(&page->_mapcount);
>> -		nr = first;
>> -		if (first && folio_test_large(folio)) {
>> -			nr = atomic_inc_return_relaxed(mapped);
>> -			nr = (nr < COMPOUND_MAPPED);
>> -		}
>> +		do {
>> +			first = atomic_inc_and_test(&page->_mapcount);
>> +			if (first && folio_test_large(folio)) {
>> +				first = atomic_inc_return_relaxed(mapped);
>> +				first = (nr < COMPOUND_MAPPED);
> 
> This still contains the typo that Yin Fengwei spotted in the previous version:
> https://lore.kernel.org/linux-mm/20230228213738.272178-1-willy@infradead.org/T/#m84673899e25bc31356093a1177941f2cc35e5da8
> 
> FYI, I'm seeing a perf regression of about 1% when compiling the kernel on
> Ampere Altra (arm64) with this whole series on top of v6.3-rc1 (In a VM using
> ext4 filesystem). Looks like instruction aborts are taking much longer and a
> selection of syscalls are a bit slower. Still hunting down the root cause. Will
> report once I have conclusive diagnosis.

I'm sorry - I'm struggling to find the exact cause. But it's spending over 2x the
amount of time in the instruction abort handling code once patches 32-36 are
included. Everything in the flame graph is just taking longer. Perhaps we are
getting more instruction aborts somehow? I have the flamegraphs if anyone wants
them - just shout and I'll email them separately.

> 
> Thanks,
> Ryan
> 
> 
>> +			}
>> +
>> +			if (first)
>> +				nr++;
>> +		} while (page++, --nr_pages > 0);
>>  	} else if (folio_test_pmd_mappable(folio)) {
>>  		/* That test is redundant: it's for safety or to optimize out */
>>  
>> @@ -1354,6 +1362,30 @@ void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
>>  	mlock_vma_folio(folio, vma, compound);
>>  }
>>  
>> +/**
>> + * page_add_file_rmap - add pte mapping to a file page
>> + * @page:	the page to add the mapping to
>> + * @vma:	the vm area in which the mapping is added
>> + * @compound:	charge the page as compound or small page
>> + *
>> + * The caller needs to hold the pte lock.
>> + */
>> +void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
>> +		bool compound)
>> +{
>> +	struct folio *folio = page_folio(page);
>> +	unsigned int nr_pages;
>> +
>> +	VM_WARN_ON_ONCE_PAGE(compound && !PageTransHuge(page), page);
>> +
>> +	if (likely(!compound))
>> +		nr_pages = 1;
>> +	else
>> +		nr_pages = folio_nr_pages(folio);
>> +
>> +	folio_add_file_rmap_range(folio, page, nr_pages, vma, compound);
>> +}
>> +
>>  /**
>>   * page_remove_rmap - take down pte mapping from a page
>>   * @page:	page to remove mapping from
>
Yin Fengwei March 15, 2023, 10:58 p.m. UTC | #3
On 2023/3/16 0:08, Ryan Roberts wrote:
> On 15/03/2023 13:34, Ryan Roberts wrote:
>> On 15/03/2023 05:14, Matthew Wilcox (Oracle) wrote:
>>> From: Yin Fengwei <fengwei.yin@intel.com>
>>>
>>> folio_add_file_rmap_range() allows to add pte mapping to a specific
>>> range of file folio. Comparing to page_add_file_rmap(), it batched
>>> updates __lruvec_stat for large folio.
>>>
>>> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
>>> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
>>> ---
>>>   include/linux/rmap.h |  2 ++
>>>   mm/rmap.c            | 60 +++++++++++++++++++++++++++++++++-----------
>>>   2 files changed, 48 insertions(+), 14 deletions(-)
>>>
>>> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
>>> index b87d01660412..a3825ce81102 100644
>>> --- a/include/linux/rmap.h
>>> +++ b/include/linux/rmap.h
>>> @@ -198,6 +198,8 @@ void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
>>>   		unsigned long address);
>>>   void page_add_file_rmap(struct page *, struct vm_area_struct *,
>>>   		bool compound);
>>> +void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr,
>>> +		struct vm_area_struct *, bool compound);
>>>   void page_remove_rmap(struct page *, struct vm_area_struct *,
>>>   		bool compound);
>>>   
>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>> index 4898e10c569a..a91906b28835 100644
>>> --- a/mm/rmap.c
>>> +++ b/mm/rmap.c
>>> @@ -1301,31 +1301,39 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
>>>   }
>>>   
>>>   /**
>>> - * page_add_file_rmap - add pte mapping to a file page
>>> - * @page:	the page to add the mapping to
>>> + * folio_add_file_rmap_range - add pte mapping to page range of a folio
>>> + * @folio:	The folio to add the mapping to
>>> + * @page:	The first page to add
>>> + * @nr_pages:	The number of pages which will be mapped
>>>    * @vma:	the vm area in which the mapping is added
>>>    * @compound:	charge the page as compound or small page
>>>    *
>>> + * The page range of folio is defined by [first_page, first_page + nr_pages)
>>> + *
>>>    * The caller needs to hold the pte lock.
>>>    */
>>> -void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
>>> -		bool compound)
>>> +void folio_add_file_rmap_range(struct folio *folio, struct page *page,
>>> +			unsigned int nr_pages, struct vm_area_struct *vma,
>>> +			bool compound)
>>>   {
>>> -	struct folio *folio = page_folio(page);
>>>   	atomic_t *mapped = &folio->_nr_pages_mapped;
>>> -	int nr = 0, nr_pmdmapped = 0;
>>> -	bool first;
>>> +	unsigned int nr_pmdmapped = 0, first;
>>> +	int nr = 0;
>>>   
>>> -	VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
>>> +	VM_WARN_ON_FOLIO(compound && !folio_test_pmd_mappable(folio), folio);
>>>   
>>>   	/* Is page being mapped by PTE? Is this its first map to be added? */
>>>   	if (likely(!compound)) {
>>> -		first = atomic_inc_and_test(&page->_mapcount);
>>> -		nr = first;
>>> -		if (first && folio_test_large(folio)) {
>>> -			nr = atomic_inc_return_relaxed(mapped);
>>> -			nr = (nr < COMPOUND_MAPPED);
>>> -		}
>>> +		do {
>>> +			first = atomic_inc_and_test(&page->_mapcount);
>>> +			if (first && folio_test_large(folio)) {
>>> +				first = atomic_inc_return_relaxed(mapped);
>>> +				first = (nr < COMPOUND_MAPPED);
>>
>> This still contains the typo that Yin Fengwei spotted in the previous version:
>> https://lore.kernel.org/linux-mm/20230228213738.272178-1-willy@infradead.org/T/#m84673899e25bc31356093a1177941f2cc35e5da8
>>
>> FYI, I'm seeing a perf regression of about 1% when compiling the kernel on
>> Ampere Altra (arm64) with this whole series on top of v6.3-rc1 (In a VM using
>> ext4 filesystem). Looks like instruction aborts are taking much longer and a
>> selection of syscalls are a bit slower. Still hunting down the root cause. Will
>> report once I have conclusive diagnosis.
> 
> I'm sorry - I'm struggling to find the exact cause. But its spending over 2x the
> amount of time in the instruction abort handling code once patches 32-36 are
> included. Everything in the flame graph is just taking longer. Perhaps we are
> getting more instruction aborts somehow? I have the flamegraphs if anyone wants
> them - just shout and I'll email them separately.
Sorry for using another email. I am traveling and can't access my
company email right now. Could you share the flamegraphs with me? I'd
like to take a look. As I remember, I didn't see the kernel build
regression w/o these patches on ext4/xfs. Thanks.


Regards
Yin, Fengwei

> 
>>
>> Thanks,
>> Ryan
>>
>>
>>> +			}
>>> +
>>> +			if (first)
>>> +				nr++;
>>> +		} while (page++, --nr_pages > 0);
>>>   	} else if (folio_test_pmd_mappable(folio)) {
>>>   		/* That test is redundant: it's for safety or to optimize out */
>>>   
>>> @@ -1354,6 +1362,30 @@ void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
>>>   	mlock_vma_folio(folio, vma, compound);
>>>   }
>>>   
>>> +/**
>>> + * page_add_file_rmap - add pte mapping to a file page
>>> + * @page:	the page to add the mapping to
>>> + * @vma:	the vm area in which the mapping is added
>>> + * @compound:	charge the page as compound or small page
>>> + *
>>> + * The caller needs to hold the pte lock.
>>> + */
>>> +void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
>>> +		bool compound)
>>> +{
>>> +	struct folio *folio = page_folio(page);
>>> +	unsigned int nr_pages;
>>> +
>>> +	VM_WARN_ON_ONCE_PAGE(compound && !PageTransHuge(page), page);
>>> +
>>> +	if (likely(!compound))
>>> +		nr_pages = 1;
>>> +	else
>>> +		nr_pages = folio_nr_pages(folio);
>>> +
>>> +	folio_add_file_rmap_range(folio, page, nr_pages, vma, compound);
>>> +}
>>> +
>>>   /**
>>>    * page_remove_rmap - take down pte mapping from a page
>>>    * @page:	page to remove mapping from
>>
>
Yin, Fengwei March 16, 2023, 4:27 p.m. UTC | #4
Hi Matthew,

On 3/16/2023 12:08 AM, Ryan Roberts wrote:
> On 15/03/2023 13:34, Ryan Roberts wrote:
>> On 15/03/2023 05:14, Matthew Wilcox (Oracle) wrote:
>>> From: Yin Fengwei <fengwei.yin@intel.com>
>>>
>>> folio_add_file_rmap_range() allows to add pte mapping to a specific
>>> range of file folio. Comparing to page_add_file_rmap(), it batched
>>> updates __lruvec_stat for large folio.
>>>
>>> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
>>> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
>>> ---
>>>  include/linux/rmap.h |  2 ++
>>>  mm/rmap.c            | 60 +++++++++++++++++++++++++++++++++-----------
>>>  2 files changed, 48 insertions(+), 14 deletions(-)
>>>
>>> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
>>> index b87d01660412..a3825ce81102 100644
>>> --- a/include/linux/rmap.h
>>> +++ b/include/linux/rmap.h
>>> @@ -198,6 +198,8 @@ void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
>>>  		unsigned long address);
>>>  void page_add_file_rmap(struct page *, struct vm_area_struct *,
>>>  		bool compound);
>>> +void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr,
>>> +		struct vm_area_struct *, bool compound);
>>>  void page_remove_rmap(struct page *, struct vm_area_struct *,
>>>  		bool compound);
>>>  
>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>> index 4898e10c569a..a91906b28835 100644
>>> --- a/mm/rmap.c
>>> +++ b/mm/rmap.c
>>> @@ -1301,31 +1301,39 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
>>>  }
>>>  
>>>  /**
>>> - * page_add_file_rmap - add pte mapping to a file page
>>> - * @page:	the page to add the mapping to
>>> + * folio_add_file_rmap_range - add pte mapping to page range of a folio
>>> + * @folio:	The folio to add the mapping to
>>> + * @page:	The first page to add
>>> + * @nr_pages:	The number of pages which will be mapped
>>>   * @vma:	the vm area in which the mapping is added
>>>   * @compound:	charge the page as compound or small page
>>>   *
>>> + * The page range of folio is defined by [first_page, first_page + nr_pages)
>>> + *
>>>   * The caller needs to hold the pte lock.
>>>   */
>>> -void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
>>> -		bool compound)
>>> +void folio_add_file_rmap_range(struct folio *folio, struct page *page,
>>> +			unsigned int nr_pages, struct vm_area_struct *vma,
>>> +			bool compound)
>>>  {
>>> -	struct folio *folio = page_folio(page);
>>>  	atomic_t *mapped = &folio->_nr_pages_mapped;
>>> -	int nr = 0, nr_pmdmapped = 0;
>>> -	bool first;
>>> +	unsigned int nr_pmdmapped = 0, first;
>>> +	int nr = 0;
>>>  
>>> -	VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
>>> +	VM_WARN_ON_FOLIO(compound && !folio_test_pmd_mappable(folio), folio);
>>>  
>>>  	/* Is page being mapped by PTE? Is this its first map to be added? */
>>>  	if (likely(!compound)) {
>>> -		first = atomic_inc_and_test(&page->_mapcount);
>>> -		nr = first;
>>> -		if (first && folio_test_large(folio)) {
>>> -			nr = atomic_inc_return_relaxed(mapped);
>>> -			nr = (nr < COMPOUND_MAPPED);
>>> -		}
>>> +		do {
>>> +			first = atomic_inc_and_test(&page->_mapcount);
>>> +			if (first && folio_test_large(folio)) {
>>> +				first = atomic_inc_return_relaxed(mapped);
>>> +				first = (nr < COMPOUND_MAPPED);
>>
>> This still contains the typo that Yin Fengwei spotted in the previous version:
>> https://lore.kernel.org/linux-mm/20230228213738.272178-1-willy@infradead.org/T/#m84673899e25bc31356093a1177941f2cc35e5da8
>>
>> FYI, I'm seeing a perf regression of about 1% when compiling the kernel on
>> Ampere Altra (arm64) with this whole series on top of v6.3-rc1 (In a VM using
>> ext4 filesystem). Looks like instruction aborts are taking much longer and a
>> selection of syscalls are a bit slower. Still hunting down the root cause. Will
>> report once I have conclusive diagnosis.
> 
> I'm sorry - I'm struggling to find the exact cause. But its spending over 2x the
> amount of time in the instruction abort handling code once patches 32-36 are
> included. Everything in the flame graph is just taking longer. Perhaps we are
> getting more instruction aborts somehow? I have the flamegraphs if anyone wants
> them - just shout and I'll email them separately.
Thanks a lot to Ryan for sharing the flamegraphs with me. I found that __do_fault()
is called with patches 32-36 applied, while it is not called with just the first
31 patches. I suspect folio_add_file_rmap_range() misses populating some PTEs.
Please give me a few days to find the root cause and a fix. Sorry for this.


Regards
Yin, Fengwei

> 
>>
>> Thanks,
>> Ryan
>>
>>
>>> +			}
>>> +
>>> +			if (first)
>>> +				nr++;
>>> +		} while (page++, --nr_pages > 0);
>>>  	} else if (folio_test_pmd_mappable(folio)) {
>>>  		/* That test is redundant: it's for safety or to optimize out */
>>>  
>>> @@ -1354,6 +1362,30 @@ void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
>>>  	mlock_vma_folio(folio, vma, compound);
>>>  }
>>>  
>>> +/**
>>> + * page_add_file_rmap - add pte mapping to a file page
>>> + * @page:	the page to add the mapping to
>>> + * @vma:	the vm area in which the mapping is added
>>> + * @compound:	charge the page as compound or small page
>>> + *
>>> + * The caller needs to hold the pte lock.
>>> + */
>>> +void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
>>> +		bool compound)
>>> +{
>>> +	struct folio *folio = page_folio(page);
>>> +	unsigned int nr_pages;
>>> +
>>> +	VM_WARN_ON_ONCE_PAGE(compound && !PageTransHuge(page), page);
>>> +
>>> +	if (likely(!compound))
>>> +		nr_pages = 1;
>>> +	else
>>> +		nr_pages = folio_nr_pages(folio);
>>> +
>>> +	folio_add_file_rmap_range(folio, page, nr_pages, vma, compound);
>>> +}
>>> +
>>>  /**
>>>   * page_remove_rmap - take down pte mapping from a page
>>>   * @page:	page to remove mapping from
>>
>
Ryan Roberts March 16, 2023, 4:34 p.m. UTC | #5
On 16/03/2023 16:27, Yin, Fengwei wrote:
> Hi Matthew,
> 
> On 3/16/2023 12:08 AM, Ryan Roberts wrote:
>> On 15/03/2023 13:34, Ryan Roberts wrote:
>>> On 15/03/2023 05:14, Matthew Wilcox (Oracle) wrote:
>>>> From: Yin Fengwei <fengwei.yin@intel.com>
>>>>
>>>> folio_add_file_rmap_range() allows to add pte mapping to a specific
>>>> range of file folio. Comparing to page_add_file_rmap(), it batched
>>>> updates __lruvec_stat for large folio.
>>>>
>>>> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
>>>> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
>>>> ---
>>>>  include/linux/rmap.h |  2 ++
>>>>  mm/rmap.c            | 60 +++++++++++++++++++++++++++++++++-----------
>>>>  2 files changed, 48 insertions(+), 14 deletions(-)
>>>>
>>>> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
>>>> index b87d01660412..a3825ce81102 100644
>>>> --- a/include/linux/rmap.h
>>>> +++ b/include/linux/rmap.h
>>>> @@ -198,6 +198,8 @@ void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
>>>>  		unsigned long address);
>>>>  void page_add_file_rmap(struct page *, struct vm_area_struct *,
>>>>  		bool compound);
>>>> +void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr,
>>>> +		struct vm_area_struct *, bool compound);
>>>>  void page_remove_rmap(struct page *, struct vm_area_struct *,
>>>>  		bool compound);
>>>>  
>>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>>> index 4898e10c569a..a91906b28835 100644
>>>> --- a/mm/rmap.c
>>>> +++ b/mm/rmap.c
>>>> @@ -1301,31 +1301,39 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
>>>>  }
>>>>  
>>>>  /**
>>>> - * page_add_file_rmap - add pte mapping to a file page
>>>> - * @page:	the page to add the mapping to
>>>> + * folio_add_file_rmap_range - add pte mapping to page range of a folio
>>>> + * @folio:	The folio to add the mapping to
>>>> + * @page:	The first page to add
>>>> + * @nr_pages:	The number of pages which will be mapped
>>>>   * @vma:	the vm area in which the mapping is added
>>>>   * @compound:	charge the page as compound or small page
>>>>   *
>>>> + * The page range of folio is defined by [first_page, first_page + nr_pages)
>>>> + *
>>>>   * The caller needs to hold the pte lock.
>>>>   */
>>>> -void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
>>>> -		bool compound)
>>>> +void folio_add_file_rmap_range(struct folio *folio, struct page *page,
>>>> +			unsigned int nr_pages, struct vm_area_struct *vma,
>>>> +			bool compound)
>>>>  {
>>>> -	struct folio *folio = page_folio(page);
>>>>  	atomic_t *mapped = &folio->_nr_pages_mapped;
>>>> -	int nr = 0, nr_pmdmapped = 0;
>>>> -	bool first;
>>>> +	unsigned int nr_pmdmapped = 0, first;
>>>> +	int nr = 0;
>>>>  
>>>> -	VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
>>>> +	VM_WARN_ON_FOLIO(compound && !folio_test_pmd_mappable(folio), folio);
>>>>  
>>>>  	/* Is page being mapped by PTE? Is this its first map to be added? */
>>>>  	if (likely(!compound)) {
>>>> -		first = atomic_inc_and_test(&page->_mapcount);
>>>> -		nr = first;
>>>> -		if (first && folio_test_large(folio)) {
>>>> -			nr = atomic_inc_return_relaxed(mapped);
>>>> -			nr = (nr < COMPOUND_MAPPED);
>>>> -		}
>>>> +		do {
>>>> +			first = atomic_inc_and_test(&page->_mapcount);
>>>> +			if (first && folio_test_large(folio)) {
>>>> +				first = atomic_inc_return_relaxed(mapped);
>>>> +				first = (nr < COMPOUND_MAPPED);
>>>
>>> This still contains the typo that Yin Fengwei spotted in the previous version:
>>> https://lore.kernel.org/linux-mm/20230228213738.272178-1-willy@infradead.org/T/#m84673899e25bc31356093a1177941f2cc35e5da8
>>>
>>> FYI, I'm seeing a perf regression of about 1% when compiling the kernel on
>>> Ampere Altra (arm64) with this whole series on top of v6.3-rc1 (In a VM using
>>> ext4 filesystem). Looks like instruction aborts are taking much longer and a
>>> selection of syscalls are a bit slower. Still hunting down the root cause. Will
>>> report once I have conclusive diagnosis.
>>
>> I'm sorry - I'm struggling to find the exact cause. But its spending over 2x the
>> amount of time in the instruction abort handling code once patches 32-36 are
>> included. Everything in the flame graph is just taking longer. Perhaps we are
>> getting more instruction aborts somehow? I have the flamegraphs if anyone wants
>> them - just shout and I'll email them separately.
> Thanks a lot to Ryan for sharing the flamegraphs to me. I found the __do_fault()
> is called with patch 32-36 while no __do_fault() just with first 31 patches. I 
> suspect the folio_add_file_rmap_range() missed some PTEs population. Please give
> me few days to find the root cause and fix. Sorry for this.

You're welcome. Give me a shout once you have a re-spin and I'll rerun the tests.

> 
> 
> Regards
> Yin, Fengwei
> 
>>
>>>
>>> Thanks,
>>> Ryan
>>>
>>>
>>>> +			}
>>>> +
>>>> +			if (first)
>>>> +				nr++;
>>>> +		} while (page++, --nr_pages > 0);
>>>>  	} else if (folio_test_pmd_mappable(folio)) {
>>>>  		/* That test is redundant: it's for safety or to optimize out */
>>>>  
>>>> @@ -1354,6 +1362,30 @@ void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
>>>>  	mlock_vma_folio(folio, vma, compound);
>>>>  }
>>>>  
>>>> +/**
>>>> + * page_add_file_rmap - add pte mapping to a file page
>>>> + * @page:	the page to add the mapping to
>>>> + * @vma:	the vm area in which the mapping is added
>>>> + * @compound:	charge the page as compound or small page
>>>> + *
>>>> + * The caller needs to hold the pte lock.
>>>> + */
>>>> +void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
>>>> +		bool compound)
>>>> +{
>>>> +	struct folio *folio = page_folio(page);
>>>> +	unsigned int nr_pages;
>>>> +
>>>> +	VM_WARN_ON_ONCE_PAGE(compound && !PageTransHuge(page), page);
>>>> +
>>>> +	if (likely(!compound))
>>>> +		nr_pages = 1;
>>>> +	else
>>>> +		nr_pages = folio_nr_pages(folio);
>>>> +
>>>> +	folio_add_file_rmap_range(folio, page, nr_pages, vma, compound);
>>>> +}
>>>> +
>>>>  /**
>>>>   * page_remove_rmap - take down pte mapping from a page
>>>>   * @page:	page to remove mapping from
>>>
>>
Yin, Fengwei March 17, 2023, 8:23 a.m. UTC | #6
Hi Ryan,

On 3/17/2023 12:34 AM, Ryan Roberts wrote:
> On 16/03/2023 16:27, Yin, Fengwei wrote:
>> Hi Matthew,
>>
>> On 3/16/2023 12:08 AM, Ryan Roberts wrote:
>>> On 15/03/2023 13:34, Ryan Roberts wrote:
>>>> On 15/03/2023 05:14, Matthew Wilcox (Oracle) wrote:
>>>>> From: Yin Fengwei <fengwei.yin@intel.com>
>>>>>
>>>>> folio_add_file_rmap_range() allows to add pte mapping to a specific
>>>>> range of file folio. Comparing to page_add_file_rmap(), it batched
>>>>> updates __lruvec_stat for large folio.
>>>>>
>>>>> Signed-off-by: Yin Fengwei <fengwei.yin@intel.com>
>>>>> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
>>>>> ---
>>>>>  include/linux/rmap.h |  2 ++
>>>>>  mm/rmap.c            | 60 +++++++++++++++++++++++++++++++++-----------
>>>>>  2 files changed, 48 insertions(+), 14 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
>>>>> index b87d01660412..a3825ce81102 100644
>>>>> --- a/include/linux/rmap.h
>>>>> +++ b/include/linux/rmap.h
>>>>> @@ -198,6 +198,8 @@ void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
>>>>>  		unsigned long address);
>>>>>  void page_add_file_rmap(struct page *, struct vm_area_struct *,
>>>>>  		bool compound);
>>>>> +void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr,
>>>>> +		struct vm_area_struct *, bool compound);
>>>>>  void page_remove_rmap(struct page *, struct vm_area_struct *,
>>>>>  		bool compound);
>>>>>  
>>>>> diff --git a/mm/rmap.c b/mm/rmap.c
>>>>> index 4898e10c569a..a91906b28835 100644
>>>>> --- a/mm/rmap.c
>>>>> +++ b/mm/rmap.c
>>>>> @@ -1301,31 +1301,39 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
>>>>>  }
>>>>>  
>>>>>  /**
>>>>> - * page_add_file_rmap - add pte mapping to a file page
>>>>> - * @page:	the page to add the mapping to
>>>>> + * folio_add_file_rmap_range - add pte mapping to page range of a folio
>>>>> + * @folio:	The folio to add the mapping to
>>>>> + * @page:	The first page to add
>>>>> + * @nr_pages:	The number of pages which will be mapped
>>>>>   * @vma:	the vm area in which the mapping is added
>>>>>   * @compound:	charge the page as compound or small page
>>>>>   *
>>>>> + * The page range of folio is defined by [first_page, first_page + nr_pages)
>>>>> + *
>>>>>   * The caller needs to hold the pte lock.
>>>>>   */
>>>>> -void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
>>>>> -		bool compound)
>>>>> +void folio_add_file_rmap_range(struct folio *folio, struct page *page,
>>>>> +			unsigned int nr_pages, struct vm_area_struct *vma,
>>>>> +			bool compound)
>>>>>  {
>>>>> -	struct folio *folio = page_folio(page);
>>>>>  	atomic_t *mapped = &folio->_nr_pages_mapped;
>>>>> -	int nr = 0, nr_pmdmapped = 0;
>>>>> -	bool first;
>>>>> +	unsigned int nr_pmdmapped = 0, first;
>>>>> +	int nr = 0;
>>>>>  
>>>>> -	VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
>>>>> +	VM_WARN_ON_FOLIO(compound && !folio_test_pmd_mappable(folio), folio);
>>>>>  
>>>>>  	/* Is page being mapped by PTE? Is this its first map to be added? */
>>>>>  	if (likely(!compound)) {
>>>>> -		first = atomic_inc_and_test(&page->_mapcount);
>>>>> -		nr = first;
>>>>> -		if (first && folio_test_large(folio)) {
>>>>> -			nr = atomic_inc_return_relaxed(mapped);
>>>>> -			nr = (nr < COMPOUND_MAPPED);
>>>>> -		}
>>>>> +		do {
>>>>> +			first = atomic_inc_and_test(&page->_mapcount);
>>>>> +			if (first && folio_test_large(folio)) {
>>>>> +				first = atomic_inc_return_relaxed(mapped);
>>>>> +				first = (nr < COMPOUND_MAPPED);
>>>>
>>>> This still contains the typo that Yin Fengwei spotted in the previous version:
>>>> https://lore.kernel.org/linux-mm/20230228213738.272178-1-willy@infradead.org/T/#m84673899e25bc31356093a1177941f2cc35e5da8
>>>>
>>>> FYI, I'm seeing a perf regression of about 1% when compiling the kernel on
>>>> Ampere Altra (arm64) with this whole series on top of v6.3-rc1 (In a VM using
>>>> ext4 filesystem). Looks like instruction aborts are taking much longer and a
>>>> selection of syscalls are a bit slower. Still hunting down the root cause. Will
>>>> report once I have conclusive diagnosis.
>>>
>>> I'm sorry - I'm struggling to find the exact cause. But its spending over 2x the
>>> amount of time in the instruction abort handling code once patches 32-36 are
>>> included. Everything in the flame graph is just taking longer. Perhaps we are
>>> getting more instruction aborts somehow? I have the flamegraphs if anyone wants
>>> them - just shout and I'll email them separately.
>> Thanks a lot to Ryan for sharing the flamegraphs to me. I found the __do_fault()
>> is called with patch 32-36 while no __do_fault() just with first 31 patches. I 
>> suspect the folio_add_file_rmap_range() missed some PTEs population. Please give
>> me few days to find the root cause and fix. Sorry for this.
> 
> You're welcome. Give me a shout once you have a re-spin and I'll rerun the tests.
Could you please help try the following changes? Thanks in advance.

diff --git a/mm/filemap.c b/mm/filemap.c
index 40be33b5ee46..137011320c68 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3504,15 +3504,16 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
 		if (!pte_none(vmf->pte[count]))
 			goto skip;
 
-		if (vmf->address == addr)
-			ret = VM_FAULT_NOPAGE;
-
 		count++;
 		continue;
 skip:
 		if (count) {
 			set_pte_range(vmf, folio, page, count, addr);
 			folio_ref_add(folio, count);
+			if ((vmf->address < (addr + count * PAGE_SIZE)) &&
+					(vmf->address >= addr))
+				ret = VM_FAULT_NOPAGE;
+
 		}
 
 		count++;
@@ -3525,6 +3526,9 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
 	if (count) {
 		set_pte_range(vmf, folio, page, count, addr);
 		folio_ref_add(folio, count);
+		if ((vmf->address < (addr + count * PAGE_SIZE)) &&
+				(vmf->address >= addr))
+			ret = VM_FAULT_NOPAGE;
 	}
 
 	vmf->pte = old_ptep;
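
The idea is to only report VM_FAULT_NOPAGE when set_pte_range() has actually
installed the PTE covering vmf->address, i.e. when vmf->address falls inside the
[addr, addr + count * PAGE_SIZE) range being committed, instead of only checking
vmf->address == addr before the range is set.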


Regards
Yin, Fengwei

> 
>>
>>
>> Regards
>> Yin, Fengwei
>>
>>>
>>>>
>>>> Thanks,
>>>> Ryan
>>>>
>>>>
>>>>> +			}
>>>>> +
>>>>> +			if (first)
>>>>> +				nr++;
>>>>> +		} while (page++, --nr_pages > 0);
>>>>>  	} else if (folio_test_pmd_mappable(folio)) {
>>>>>  		/* That test is redundant: it's for safety or to optimize out */
>>>>>  
>>>>> @@ -1354,6 +1362,30 @@ void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
>>>>>  	mlock_vma_folio(folio, vma, compound);
>>>>>  }
>>>>>  
>>>>> +/**
>>>>> + * page_add_file_rmap - add pte mapping to a file page
>>>>> + * @page:	the page to add the mapping to
>>>>> + * @vma:	the vm area in which the mapping is added
>>>>> + * @compound:	charge the page as compound or small page
>>>>> + *
>>>>> + * The caller needs to hold the pte lock.
>>>>> + */
>>>>> +void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
>>>>> +		bool compound)
>>>>> +{
>>>>> +	struct folio *folio = page_folio(page);
>>>>> +	unsigned int nr_pages;
>>>>> +
>>>>> +	VM_WARN_ON_ONCE_PAGE(compound && !PageTransHuge(page), page);
>>>>> +
>>>>> +	if (likely(!compound))
>>>>> +		nr_pages = 1;
>>>>> +	else
>>>>> +		nr_pages = folio_nr_pages(folio);
>>>>> +
>>>>> +	folio_add_file_rmap_range(folio, page, nr_pages, vma, compound);
>>>>> +}
>>>>> +
>>>>>  /**
>>>>>   * page_remove_rmap - take down pte mapping from a page
>>>>>   * @page:	page to remove mapping from
>>>>
>>>
> 
>
Ryan Roberts March 17, 2023, 12:46 p.m. UTC | #7
On 17/03/2023 08:23, Yin, Fengwei wrote:
[...]

>>>>> FYI, I'm seeing a perf regression of about 1% when compiling the kernel on
>>>>> Ampere Altra (arm64) with this whole series on top of v6.3-rc1 (In a VM using
>>>>> ext4 filesystem). Looks like instruction aborts are taking much longer and a
>>>>> selection of syscalls are a bit slower. Still hunting down the root cause. Will
>>>>> report once I have conclusive diagnosis.
>>>>
>>>> I'm sorry - I'm struggling to find the exact cause. But its spending over 2x the
>>>> amount of time in the instruction abort handling code once patches 32-36 are
>>>> included. Everything in the flame graph is just taking longer. Perhaps we are
>>>> getting more instruction aborts somehow? I have the flamegraphs if anyone wants
>>>> them - just shout and I'll email them separately.
>>> Thanks a lot to Ryan for sharing the flamegraphs to me. I found the __do_fault()
>>> is called with patch 32-36 while no __do_fault() just with first 31 patches. I 
>>> suspect the folio_add_file_rmap_range() missed some PTEs population. Please give
>>> me few days to find the root cause and fix. Sorry for this.
>>
>> You're welcome. Give me a shout once you have a re-spin and I'll rerun the tests.
> Could you please help to try following changes? Thanks in advance.
> 
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 40be33b5ee46..137011320c68 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -3504,15 +3504,16 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
>  		if (!pte_none(vmf->pte[count]))
>  			goto skip;
>  
> -		if (vmf->address == addr)
> -			ret = VM_FAULT_NOPAGE;
> -
>  		count++;
>  		continue;
>  skip:
>  		if (count) {
>  			set_pte_range(vmf, folio, page, count, addr);
>  			folio_ref_add(folio, count);
> +			if ((vmf->address < (addr + count * PAGE_SIZE)) &&
> +					(vmf->address >= addr))
> +				ret = VM_FAULT_NOPAGE;
> +
>  		}
>  
>  		count++;
> @@ -3525,6 +3526,9 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
>  	if (count) {
>  		set_pte_range(vmf, folio, page, count, addr);
>  		folio_ref_add(folio, count);
> +		if ((vmf->address < (addr + count * PAGE_SIZE)) &&
> +				(vmf->address >= addr))
> +			ret = VM_FAULT_NOPAGE;
>  	}
>  
>  	vmf->pte = old_ptep;
> 

I'm afraid this hasn't fixed it, and I still see __do_fault(). I'll send the
flame graph over separately.

Given I'm running on ext4, I wasn't expecting to see any large page cache
folios? So I don't think we would have expected this patch to help anyway? (or
perhaps there are still THP folios? But I think they will get PMD mapped?).


> 
> Regards
> Yin, Fengwei
> 
>>
>>>
>>>
>>> Regards
>>> Yin, Fengwei
>>>
>>>>
>>>>>
>>>>> Thanks,
>>>>> Ryan
>>>>>
>>>>>
>>>>>> +			}
>>>>>> +
>>>>>> +			if (first)
>>>>>> +				nr++;
>>>>>> +		} while (page++, --nr_pages > 0);
>>>>>>  	} else if (folio_test_pmd_mappable(folio)) {
>>>>>>  		/* That test is redundant: it's for safety or to optimize out */
>>>>>>  
>>>>>> @@ -1354,6 +1362,30 @@ void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
>>>>>>  	mlock_vma_folio(folio, vma, compound);
>>>>>>  }
>>>>>>  
>>>>>> +/**
>>>>>> + * page_add_file_rmap - add pte mapping to a file page
>>>>>> + * @page:	the page to add the mapping to
>>>>>> + * @vma:	the vm area in which the mapping is added
>>>>>> + * @compound:	charge the page as compound or small page
>>>>>> + *
>>>>>> + * The caller needs to hold the pte lock.
>>>>>> + */
>>>>>> +void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
>>>>>> +		bool compound)
>>>>>> +{
>>>>>> +	struct folio *folio = page_folio(page);
>>>>>> +	unsigned int nr_pages;
>>>>>> +
>>>>>> +	VM_WARN_ON_ONCE_PAGE(compound && !PageTransHuge(page), page);
>>>>>> +
>>>>>> +	if (likely(!compound))
>>>>>> +		nr_pages = 1;
>>>>>> +	else
>>>>>> +		nr_pages = folio_nr_pages(folio);
>>>>>> +
>>>>>> +	folio_add_file_rmap_range(folio, page, nr_pages, vma, compound);
>>>>>> +}
>>>>>> +
>>>>>>  /**
>>>>>>   * page_remove_rmap - take down pte mapping from a page
>>>>>>   * @page:	page to remove mapping from
>>>>>
>>>>
>>
>>
Yin, Fengwei March 17, 2023, 1:28 p.m. UTC | #8
On 3/17/2023 8:46 PM, Ryan Roberts wrote:
> On 17/03/2023 08:23, Yin, Fengwei wrote:
> [...]
> 
>>>>>> FYI, I'm seeing a perf regression of about 1% when compiling the kernel on
>>>>>> Ampere Altra (arm64) with this whole series on top of v6.3-rc1 (In a VM using
>>>>>> ext4 filesystem). Looks like instruction aborts are taking much longer and a
>>>>>> selection of syscalls are a bit slower. Still hunting down the root cause. Will
>>>>>> report once I have conclusive diagnosis.
>>>>>
>>>>> I'm sorry - I'm struggling to find the exact cause. But its spending over 2x the
>>>>> amount of time in the instruction abort handling code once patches 32-36 are
>>>>> included. Everything in the flame graph is just taking longer. Perhaps we are
>>>>> getting more instruction aborts somehow? I have the flamegraphs if anyone wants
>>>>> them - just shout and I'll email them separately.
>>>> Thanks a lot to Ryan for sharing the flamegraphs to me. I found the __do_fault()
>>>> is called with patch 32-36 while no __do_fault() just with first 31 patches. I 
>>>> suspect the folio_add_file_rmap_range() missed some PTEs population. Please give
>>>> me few days to find the root cause and fix. Sorry for this.
>>>
>>> You're welcome. Give me a shout once you have a re-spin and I'll rerun the tests.
>> Could you please help to try following changes? Thanks in advance.
>>
>> diff --git a/mm/filemap.c b/mm/filemap.c
>> index 40be33b5ee46..137011320c68 100644
>> --- a/mm/filemap.c
>> +++ b/mm/filemap.c
>> @@ -3504,15 +3504,16 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
>>  		if (!pte_none(vmf->pte[count]))
>>  			goto skip;
>>  
>> -		if (vmf->address == addr)
>> -			ret = VM_FAULT_NOPAGE;
>> -
>>  		count++;
>>  		continue;
>>  skip:
>>  		if (count) {
>>  			set_pte_range(vmf, folio, page, count, addr);
>>  			folio_ref_add(folio, count);
>> +			if ((vmf->address < (addr + count * PAGE_SIZE)) &&
>> +					(vmf->address >= addr))
>> +				ret = VM_FAULT_NOPAGE;
>> +
>>  		}
>>  
>>  		count++;
>> @@ -3525,6 +3526,9 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
>>  	if (count) {
>>  		set_pte_range(vmf, folio, page, count, addr);
>>  		folio_ref_add(folio, count);
>> +		if ((vmf->address < (addr + count * PAGE_SIZE)) &&
>> +				(vmf->address >= addr))
>> +			ret = VM_FAULT_NOPAGE;
>>  	}
>>  
>>  	vmf->pte = old_ptep;
>>
> 
> I'm afraid this hasn't fixed it, and I still see __do_fault(). I'll send the
> flame graph over separately.
> 
> Given I'm running on ext4, I wasn't expecting to see any large page cache
> folios? So I don't think we would have expected this patch to help anyway? (or
> perhaps there are still THP folios? But I think they will get PMD mapped?).
OK. I will try to reproduce the issue in my local environment to see whether
it also shows up on x86_64.


Regards
Yin, Fengwei

> 
> 
>>
>> Regards
>> Yin, Fengwei
>>
>>>
>>>>
>>>>
>>>> Regards
>>>> Yin, Fengwei
>>>>
>>>>>
>>>>>>
>>>>>> Thanks,
>>>>>> Ryan
>>>>>>
>>>>>>
>>>>>>> +			}
>>>>>>> +
>>>>>>> +			if (first)
>>>>>>> +				nr++;
>>>>>>> +		} while (page++, --nr_pages > 0);
>>>>>>>  	} else if (folio_test_pmd_mappable(folio)) {
>>>>>>>  		/* That test is redundant: it's for safety or to optimize out */
>>>>>>>  
>>>>>>> @@ -1354,6 +1362,30 @@ void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
>>>>>>>  	mlock_vma_folio(folio, vma, compound);
>>>>>>>  }
>>>>>>>  
>>>>>>> +/**
>>>>>>> + * page_add_file_rmap - add pte mapping to a file page
>>>>>>> + * @page:	the page to add the mapping to
>>>>>>> + * @vma:	the vm area in which the mapping is added
>>>>>>> + * @compound:	charge the page as compound or small page
>>>>>>> + *
>>>>>>> + * The caller needs to hold the pte lock.
>>>>>>> + */
>>>>>>> +void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
>>>>>>> +		bool compound)
>>>>>>> +{
>>>>>>> +	struct folio *folio = page_folio(page);
>>>>>>> +	unsigned int nr_pages;
>>>>>>> +
>>>>>>> +	VM_WARN_ON_ONCE_PAGE(compound && !PageTransHuge(page), page);
>>>>>>> +
>>>>>>> +	if (likely(!compound))
>>>>>>> +		nr_pages = 1;
>>>>>>> +	else
>>>>>>> +		nr_pages = folio_nr_pages(folio);
>>>>>>> +
>>>>>>> +	folio_add_file_rmap_range(folio, page, nr_pages, vma, compound);
>>>>>>> +}
>>>>>>> +
>>>>>>>  /**
>>>>>>>   * page_remove_rmap - take down pte mapping from a page
>>>>>>>   * @page:	page to remove mapping from
>>>>>>
>>>>>
>>>
>>>
>

Patch

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index b87d01660412..a3825ce81102 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -198,6 +198,8 @@  void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address);
 void page_add_file_rmap(struct page *, struct vm_area_struct *,
 		bool compound);
+void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr,
+		struct vm_area_struct *, bool compound);
 void page_remove_rmap(struct page *, struct vm_area_struct *,
 		bool compound);
 
diff --git a/mm/rmap.c b/mm/rmap.c
index 4898e10c569a..a91906b28835 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1301,31 +1301,39 @@  void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
 }
 
 /**
- * page_add_file_rmap - add pte mapping to a file page
- * @page:	the page to add the mapping to
+ * folio_add_file_rmap_range - add pte mapping to page range of a folio
+ * @folio:	The folio to add the mapping to
+ * @page:	The first page to add
+ * @nr_pages:	The number of pages which will be mapped
  * @vma:	the vm area in which the mapping is added
  * @compound:	charge the page as compound or small page
  *
+ * The page range of folio is defined by [first_page, first_page + nr_pages)
+ *
  * The caller needs to hold the pte lock.
  */
-void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
-		bool compound)
+void folio_add_file_rmap_range(struct folio *folio, struct page *page,
+			unsigned int nr_pages, struct vm_area_struct *vma,
+			bool compound)
 {
-	struct folio *folio = page_folio(page);
 	atomic_t *mapped = &folio->_nr_pages_mapped;
-	int nr = 0, nr_pmdmapped = 0;
-	bool first;
+	unsigned int nr_pmdmapped = 0, first;
+	int nr = 0;
 
-	VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
+	VM_WARN_ON_FOLIO(compound && !folio_test_pmd_mappable(folio), folio);
 
 	/* Is page being mapped by PTE? Is this its first map to be added? */
 	if (likely(!compound)) {
-		first = atomic_inc_and_test(&page->_mapcount);
-		nr = first;
-		if (first && folio_test_large(folio)) {
-			nr = atomic_inc_return_relaxed(mapped);
-			nr = (nr < COMPOUND_MAPPED);
-		}
+		do {
+			first = atomic_inc_and_test(&page->_mapcount);
+			if (first && folio_test_large(folio)) {
+				first = atomic_inc_return_relaxed(mapped);
+				first = (nr < COMPOUND_MAPPED);
+			}
+
+			if (first)
+				nr++;
+		} while (page++, --nr_pages > 0);
 	} else if (folio_test_pmd_mappable(folio)) {
 		/* That test is redundant: it's for safety or to optimize out */
 
@@ -1354,6 +1362,30 @@  void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
 	mlock_vma_folio(folio, vma, compound);
 }
 
+/**
+ * page_add_file_rmap - add pte mapping to a file page
+ * @page:	the page to add the mapping to
+ * @vma:	the vm area in which the mapping is added
+ * @compound:	charge the page as compound or small page
+ *
+ * The caller needs to hold the pte lock.
+ */
+void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
+		bool compound)
+{
+	struct folio *folio = page_folio(page);
+	unsigned int nr_pages;
+
+	VM_WARN_ON_ONCE_PAGE(compound && !PageTransHuge(page), page);
+
+	if (likely(!compound))
+		nr_pages = 1;
+	else
+		nr_pages = folio_nr_pages(folio);
+
+	folio_add_file_rmap_range(folio, page, nr_pages, vma, compound);
+}
+
 /**
  * page_remove_rmap - take down pte mapping from a page
  * @page:	page to remove mapping from