[RFC,07/39] mm/rmap: convert folio_add_file_rmap_range() into folio_add_file_rmap_[pte|ptes|pmd]()

Message ID 20231204142146.91437-8-david@redhat.com (mailing list archive)
State New
Series mm/rmap: interface overhaul

Commit Message

David Hildenbrand Dec. 4, 2023, 2:21 p.m. UTC
Let's get rid of the compound parameter and instead define implicitly
which mappings we're adding. That is more future proof, easier to read
and harder to mess up.

Use an enum to express the granularity internally. Make the compiler
always special-case on the granularity by using __always_inline.
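
A minimal userspace sketch of that pattern (illustrative only, not kernel
code; the names mirror the patch, but the program is just a stand-in that
shows how a constant mode argument plus a forced-inline helper lets the
compiler specialize each wrapper):

#include <stdio.h>

enum rmap_mode { RMAP_MODE_PTE, RMAP_MODE_PMD };

/* The mode is a compile-time constant at every call site, so forcing the
 * helper inline lets the compiler drop the untaken branch entirely. */
static inline __attribute__((always_inline))
void add_rmap(int nr_pages, enum rmap_mode mode)
{
	if (mode == RMAP_MODE_PTE)
		printf("adding %d PTE mappings\n", nr_pages);
	else
		printf("adding 1 PMD mapping\n");
}

/* Each wrapper specializes to exactly one branch of add_rmap(). */
static void add_rmap_ptes(int nr_pages) { add_rmap(nr_pages, RMAP_MODE_PTE); }
static void add_rmap_pmd(void)          { add_rmap(1, RMAP_MODE_PMD); }

int main(void)
{
	add_rmap_ptes(4);
	add_rmap_pmd();
	return 0;
}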

Add plenty of sanity checks with CONFIG_DEBUG_VM. Replace the
folio_test_pmd_mappable() check by a config check in the caller and
sanity checks. Convert the single user of folio_add_file_rmap_range().

This function design can later easily be extended to PUDs and to batch
PMDs. Note that for now we don't support anything bigger than
PMD-sized folios (as we cleanly separated hugetlb handling). Sanity checks
will catch if that ever changes.

Next up is removing page_remove_rmap() along with its "compound"
parameter and similarly converting all other rmap functions.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 include/linux/rmap.h | 47 +++++++++++++++++++++++++++--
 mm/memory.c          |  2 +-
 mm/rmap.c            | 72 ++++++++++++++++++++++++++++----------------
 3 files changed, 92 insertions(+), 29 deletions(-)

Comments

Ryan Roberts Dec. 5, 2023, 12:04 p.m. UTC | #1
On 04/12/2023 14:21, David Hildenbrand wrote:
> Let's get rid of the compound parameter and instead define implicitly
> which mappings we're adding. That is more future proof, easier to read
> and harder to mess up.
> 
> Use an enum to express the granularity internally. Make the compiler
> always special-case on the granularity by using __always_inline.
> 
> Add plenty of sanity checks with CONFIG_DEBUG_VM. Replace the
> folio_test_pmd_mappable() check by a config check in the caller and
> sanity checks. Convert the single user of folio_add_file_rmap_range().
> 
> This function design can later easily be extended to PUDs and to batch
> PMDs. Note that for now we don't support anything bigger than
> PMD-sized folios (as we cleanly separated hugetlb handling). Sanity checks

Is that definitely true? Don't we support PUD-mapping file-backed DAX memory?


> will catch if that ever changes.
> 
> Next up is removing page_remove_rmap() along with its "compound"
> parameter and similarly converting all other rmap functions.
> 
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>  include/linux/rmap.h | 47 +++++++++++++++++++++++++++--
>  mm/memory.c          |  2 +-
>  mm/rmap.c            | 72 ++++++++++++++++++++++++++++----------------
>  3 files changed, 92 insertions(+), 29 deletions(-)
> 
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index 77e336f86c72d..a4a30c361ac50 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -186,6 +186,45 @@ typedef int __bitwise rmap_t;
>   */
>  #define RMAP_COMPOUND		((__force rmap_t)BIT(1))
>  
> +/*
> + * Internally, we're using an enum to specify the granularity. Usually,
> + * we make the compiler create specialized variants for the different
> + * granularity.
> + */
> +enum rmap_mode {
> +	RMAP_MODE_PTE = 0,
> +	RMAP_MODE_PMD,
> +};
> +
> +static inline void __folio_rmap_sanity_checks(struct folio *folio,
> +		struct page *page, unsigned int nr_pages, enum rmap_mode mode)
> +{
> +	/* hugetlb folios are handled separately. */
> +	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
> +	VM_WARN_ON_FOLIO(folio_test_large(folio) &&
> +			 !folio_test_large_rmappable(folio), folio);
> +
> +	VM_WARN_ON_ONCE(!nr_pages || nr_pages > folio_nr_pages(folio));

nit: I don't think you technically need the second half of this - it's covered by
the test below...

> +	VM_WARN_ON_FOLIO(page_folio(page) != folio, folio);
> +	VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio);

...this one.

> +
> +	switch (mode) {
> +	case RMAP_MODE_PTE:
> +		break;
> +	case RMAP_MODE_PMD:
> +		/*
> +		 * We don't support folios larger than a single PMD yet. So
> +		 * when RMAP_MODE_PMD is set, we assume that we are creating
> +		 * a single "entire" mapping of the folio.
> +		 */
> +		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
> +		VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
> +		break;
> +	default:
> +		VM_WARN_ON_ONCE(true);
> +	}
> +}
> +
>  /*
>   * rmap interfaces called when adding or removing pte of page
>   */
> @@ -198,8 +237,12 @@ void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
>  		unsigned long address);
>  void page_add_file_rmap(struct page *, struct vm_area_struct *,
>  		bool compound);
> -void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr,
> -		struct vm_area_struct *, bool compound);
> +void folio_add_file_rmap_ptes(struct folio *, struct page *, unsigned int nr,
> +		struct vm_area_struct *);
> +#define folio_add_file_rmap_pte(folio, page, vma) \
> +	folio_add_file_rmap_ptes(folio, page, 1, vma)
> +void folio_add_file_rmap_pmd(struct folio *, struct page *,
> +		struct vm_area_struct *);
>  void page_remove_rmap(struct page *, struct vm_area_struct *,
>  		bool compound);
>  
> diff --git a/mm/memory.c b/mm/memory.c
> index 1f18ed4a54971..15325587cff01 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4414,7 +4414,7 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
>  		folio_add_lru_vma(folio, vma);
>  	} else {
>  		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
> -		folio_add_file_rmap_range(folio, page, nr, vma, false);
> +		folio_add_file_rmap_ptes(folio, page, nr, vma);
>  	}
>  	set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);
>  
> diff --git a/mm/rmap.c b/mm/rmap.c
> index a735ecca47a81..1614d98062948 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1334,31 +1334,19 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
>  	SetPageAnonExclusive(&folio->page);
>  }
>  
> -/**
> - * folio_add_file_rmap_range - add pte mapping to page range of a folio
> - * @folio:	The folio to add the mapping to
> - * @page:	The first page to add
> - * @nr_pages:	The number of pages which will be mapped
> - * @vma:	the vm area in which the mapping is added
> - * @compound:	charge the page as compound or small page
> - *
> - * The page range of folio is defined by [first_page, first_page + nr_pages)
> - *
> - * The caller needs to hold the pte lock.
> - */
> -void folio_add_file_rmap_range(struct folio *folio, struct page *page,
> -			unsigned int nr_pages, struct vm_area_struct *vma,
> -			bool compound)
> +static __always_inline void __folio_add_file_rmap(struct folio *folio,
> +		struct page *page, unsigned int nr_pages,
> +		struct vm_area_struct *vma, enum rmap_mode mode)
>  {
>  	atomic_t *mapped = &folio->_nr_pages_mapped;
>  	unsigned int nr_pmdmapped = 0, first;
>  	int nr = 0;
>  
> -	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
> -	VM_WARN_ON_FOLIO(compound && !folio_test_pmd_mappable(folio), folio);
> +	VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);
> +	__folio_rmap_sanity_checks(folio, page, nr_pages, mode);
>  
>  	/* Is page being mapped by PTE? Is this its first map to be added? */
> -	if (likely(!compound)) {
> +	if (likely(mode == RMAP_MODE_PTE)) {
>  		do {
>  			first = atomic_inc_and_test(&page->_mapcount);
>  			if (first && folio_test_large(folio)) {
> @@ -1369,9 +1357,7 @@ void folio_add_file_rmap_range(struct folio *folio, struct page *page,
>  			if (first)
>  				nr++;
>  		} while (page++, --nr_pages > 0);
> -	} else if (folio_test_pmd_mappable(folio)) {
> -		/* That test is redundant: it's for safety or to optimize out */
> -
> +	} else if (mode == RMAP_MODE_PMD) {
>  		first = atomic_inc_and_test(&folio->_entire_mapcount);
>  		if (first) {
>  			nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped);
> @@ -1399,6 +1385,43 @@ void folio_add_file_rmap_range(struct folio *folio, struct page *page,
>  		mlock_vma_folio(folio, vma);
>  }
>  
> +/**
> + * folio_add_file_rmap_ptes - add PTE mappings to a page range of a folio
> + * @folio:	The folio to add the mappings to
> + * @page:	The first page to add
> + * @nr_pages:	The number of pages that will be mapped using PTEs
> + * @vma:	The vm area in which the mappings are added
> + *
> + * The page range of the folio is defined by [page, page + nr_pages)
> + *
> + * The caller needs to hold the page table lock.
> + */
> +void folio_add_file_rmap_ptes(struct folio *folio, struct page *page,
> +		unsigned int nr_pages, struct vm_area_struct *vma)
> +{
> +	__folio_add_file_rmap(folio, page, nr_pages, vma, RMAP_MODE_PTE);
> +}
> +
> +/**
> + * folio_add_file_rmap_pmd - add a PMD mapping to a page range of a folio
> + * @folio:	The folio to add the mapping to
> + * @page:	The first page to add
> + * @vma:	The vm area in which the mapping is added
> + *
> + * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
> + *
> + * The caller needs to hold the page table lock.
> + */
> +void folio_add_file_rmap_pmd(struct folio *folio, struct page *page,
> +		struct vm_area_struct *vma)
> +{
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +	__folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_MODE_PMD);
> +#else
> +	WARN_ON_ONCE(true);
> +#endif
> +}
> +
>  /**
>   * page_add_file_rmap - add pte mapping to a file page
>   * @page:	the page to add the mapping to
> @@ -1411,16 +1434,13 @@ void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
>  		bool compound)
>  {
>  	struct folio *folio = page_folio(page);
> -	unsigned int nr_pages;
>  
>  	VM_WARN_ON_ONCE_PAGE(compound && !PageTransHuge(page), page);
>  
>  	if (likely(!compound))
> -		nr_pages = 1;
> +		folio_add_file_rmap_pte(folio, page, vma);
>  	else
> -		nr_pages = folio_nr_pages(folio);
> -
> -	folio_add_file_rmap_range(folio, page, nr_pages, vma, compound);
> +		folio_add_file_rmap_pmd(folio, page, vma);
>  }
>  
>  /**
David Hildenbrand Dec. 5, 2023, 12:25 p.m. UTC | #2
On 05.12.23 13:04, Ryan Roberts wrote:
> On 04/12/2023 14:21, David Hildenbrand wrote:
>> Let's get rid of the compound parameter and instead define implicitly
>> which mappings we're adding. That is more future proof, easier to read
>> and harder to mess up.
>>
>> Use an enum to express the granularity internally. Make the compiler
>> always special-case on the granularity by using __always_inline.
>>
>> Add plenty of sanity checks with CONFIG_DEBUG_VM. Replace the
>> folio_test_pmd_mappable() check by a config check in the caller and
>> sanity checks. Convert the single user of folio_add_file_rmap_range().
>>
>> This function design can later easily be extended to PUDs and to batch
>> PMDs. Note that for now we don't support anything bigger than
>> PMD-sized folios (as we cleanly separated hugetlb handling). Sanity checks
> 
> Is that definitely true? Don't we support PUD-mapping file-backed DAX memory?

They are not handled via the rmap. Otherwise, all the PMD accounting 
(e.g., FilePmdMapped) in RMAP code would already be wrong for them.

And it's easy to verify by looking at zap_huge_pud(), which doesn't call
any rmap code.

[...]

>> +
>> +static inline void __folio_rmap_sanity_checks(struct folio *folio,
>> +		struct page *page, unsigned int nr_pages, enum rmap_mode mode)
>> +{
>> +	/* hugetlb folios are handled separately. */
>> +	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
>> +	VM_WARN_ON_FOLIO(folio_test_large(folio) &&
>> +			 !folio_test_large_rmappable(folio), folio);
>> +
>> +	VM_WARN_ON_ONCE(!nr_pages || nr_pages > folio_nr_pages(folio));
> 
> nit: I don't think you technically need the second half of this - it's covered by
> the test below...

My thinking was that if nr_pages were "-1", one could end up with
weird wraparounds.

But yeah, I thought about this as well and might just remove it.
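
A tiny userspace stand-in for that concern (illustrative only; the folio
size is a plain constant here, standing in for folio_nr_pages()):

#include <stdio.h>

int main(void)
{
	unsigned int nr_pages = (unsigned int)-1; /* bogus caller value */
	unsigned int folio_pages = 512;           /* e.g. one PMD-sized folio */

	/* The supposed last-page index is ~4 billion, so deriving the last
	 * page by pointer arithmetic would point far outside the folio; the
	 * explicit range check rejects the bogus value directly. */
	printf("index of supposed last page: %u\n", nr_pages - 1);
	printf("caught by the range check: %s\n",
	       (!nr_pages || nr_pages > folio_pages) ? "yes" : "no");
	return 0;
}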

> 
>> +	VM_WARN_ON_FOLIO(page_folio(page) != folio, folio);
>> +	VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio);
> 
> ...this one.
> 

Thanks!
Yin Fengwei Dec. 6, 2023, 1:30 a.m. UTC | #3
On 12/4/23 22:21, David Hildenbrand wrote:
> Let's get rid of the compound parameter and instead define implicitly
> which mappings we're adding. That is more future proof, easier to read
> and harder to mess up.
> 
> Use an enum to express the granularity internally. Make the compiler
> always special-case on the granularity by using __always_inline.
> 
> Add plenty of sanity checks with CONFIG_DEBUG_VM. Replace the
> folio_test_pmd_mappable() check by a config check in the caller and
> sanity checks. Convert the single user of folio_add_file_rmap_range().
> 
> This function design can later easily be extended to PUDs and to batch
> PMDs. Note that for now we don't support anything bigger than
> PMD-sized folios (as we cleanly separated hugetlb handling). Sanity checks
> will catch if that ever changes.
I do have a question about folios larger than PMD size in the future:
   Will the folio size only ever be exactly PMD size or PUD size, or could it also fall
   between PMD size and PUD size?

   If a size between PMD size and PUD size is possible, will the mapping be a mix of
   PMD mappings and PTE mappings, or just PTE mappings? I suppose it could be mixed for
   the efficiency of the page walker.

It may just be too early to consider this now.

Regards
Yin, Fengwei

> 
> Next up is removing page_remove_rmap() along with its "compound"
> parameter and similarly converting all other rmap functions.
> 
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>  include/linux/rmap.h | 47 +++++++++++++++++++++++++++--
>  mm/memory.c          |  2 +-
>  mm/rmap.c            | 72 ++++++++++++++++++++++++++++----------------
>  3 files changed, 92 insertions(+), 29 deletions(-)
> 
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index 77e336f86c72d..a4a30c361ac50 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -186,6 +186,45 @@ typedef int __bitwise rmap_t;
>   */
>  #define RMAP_COMPOUND		((__force rmap_t)BIT(1))
>  
> +/*
> + * Internally, we're using an enum to specify the granularity. Usually,
> + * we make the compiler create specialized variants for the different
> + * granularity.
> + */
> +enum rmap_mode {
> +	RMAP_MODE_PTE = 0,
> +	RMAP_MODE_PMD,
> +};
> +
> +static inline void __folio_rmap_sanity_checks(struct folio *folio,
> +		struct page *page, unsigned int nr_pages, enum rmap_mode mode)
> +{
> +	/* hugetlb folios are handled separately. */
> +	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
> +	VM_WARN_ON_FOLIO(folio_test_large(folio) &&
> +			 !folio_test_large_rmappable(folio), folio);
> +
> +	VM_WARN_ON_ONCE(!nr_pages || nr_pages > folio_nr_pages(folio));
> +	VM_WARN_ON_FOLIO(page_folio(page) != folio, folio);
> +	VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio);
> +
> +	switch (mode) {
> +	case RMAP_MODE_PTE:
> +		break;
> +	case RMAP_MODE_PMD:
> +		/*
> +		 * We don't support folios larger than a single PMD yet. So
> +		 * when RMAP_MODE_PMD is set, we assume that we are creating
> +		 * a single "entire" mapping of the folio.
> +		 */
> +		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
> +		VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
> +		break;
> +	default:
> +		VM_WARN_ON_ONCE(true);
> +	}
> +}
> +
>  /*
>   * rmap interfaces called when adding or removing pte of page
>   */
> @@ -198,8 +237,12 @@ void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
>  		unsigned long address);
>  void page_add_file_rmap(struct page *, struct vm_area_struct *,
>  		bool compound);
> -void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr,
> -		struct vm_area_struct *, bool compound);
> +void folio_add_file_rmap_ptes(struct folio *, struct page *, unsigned int nr,
> +		struct vm_area_struct *);
> +#define folio_add_file_rmap_pte(folio, page, vma) \
> +	folio_add_file_rmap_ptes(folio, page, 1, vma)
> +void folio_add_file_rmap_pmd(struct folio *, struct page *,
> +		struct vm_area_struct *);
>  void page_remove_rmap(struct page *, struct vm_area_struct *,
>  		bool compound);
>  
> diff --git a/mm/memory.c b/mm/memory.c
> index 1f18ed4a54971..15325587cff01 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4414,7 +4414,7 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
>  		folio_add_lru_vma(folio, vma);
>  	} else {
>  		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
> -		folio_add_file_rmap_range(folio, page, nr, vma, false);
> +		folio_add_file_rmap_ptes(folio, page, nr, vma);
>  	}
>  	set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);
>  
> diff --git a/mm/rmap.c b/mm/rmap.c
> index a735ecca47a81..1614d98062948 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1334,31 +1334,19 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
>  	SetPageAnonExclusive(&folio->page);
>  }
>  
> -/**
> - * folio_add_file_rmap_range - add pte mapping to page range of a folio
> - * @folio:	The folio to add the mapping to
> - * @page:	The first page to add
> - * @nr_pages:	The number of pages which will be mapped
> - * @vma:	the vm area in which the mapping is added
> - * @compound:	charge the page as compound or small page
> - *
> - * The page range of folio is defined by [first_page, first_page + nr_pages)
> - *
> - * The caller needs to hold the pte lock.
> - */
> -void folio_add_file_rmap_range(struct folio *folio, struct page *page,
> -			unsigned int nr_pages, struct vm_area_struct *vma,
> -			bool compound)
> +static __always_inline void __folio_add_file_rmap(struct folio *folio,
> +		struct page *page, unsigned int nr_pages,
> +		struct vm_area_struct *vma, enum rmap_mode mode)
>  {
>  	atomic_t *mapped = &folio->_nr_pages_mapped;
>  	unsigned int nr_pmdmapped = 0, first;
>  	int nr = 0;
>  
> -	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
> -	VM_WARN_ON_FOLIO(compound && !folio_test_pmd_mappable(folio), folio);
> +	VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);
> +	__folio_rmap_sanity_checks(folio, page, nr_pages, mode);
>  
>  	/* Is page being mapped by PTE? Is this its first map to be added? */
> -	if (likely(!compound)) {
> +	if (likely(mode == RMAP_MODE_PTE)) {
>  		do {
>  			first = atomic_inc_and_test(&page->_mapcount);
>  			if (first && folio_test_large(folio)) {
> @@ -1369,9 +1357,7 @@ void folio_add_file_rmap_range(struct folio *folio, struct page *page,
>  			if (first)
>  				nr++;
>  		} while (page++, --nr_pages > 0);
> -	} else if (folio_test_pmd_mappable(folio)) {
> -		/* That test is redundant: it's for safety or to optimize out */
> -
> +	} else if (mode == RMAP_MODE_PMD) {
>  		first = atomic_inc_and_test(&folio->_entire_mapcount);
>  		if (first) {
>  			nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped);
> @@ -1399,6 +1385,43 @@ void folio_add_file_rmap_range(struct folio *folio, struct page *page,
>  		mlock_vma_folio(folio, vma);
>  }
>  
> +/**
> + * folio_add_file_rmap_ptes - add PTE mappings to a page range of a folio
> + * @folio:	The folio to add the mappings to
> + * @page:	The first page to add
> + * @nr_pages:	The number of pages that will be mapped using PTEs
> + * @vma:	The vm area in which the mappings are added
> + *
> + * The page range of the folio is defined by [page, page + nr_pages)
> + *
> + * The caller needs to hold the page table lock.
> + */
> +void folio_add_file_rmap_ptes(struct folio *folio, struct page *page,
> +		unsigned int nr_pages, struct vm_area_struct *vma)
> +{
> +	__folio_add_file_rmap(folio, page, nr_pages, vma, RMAP_MODE_PTE);
> +}
> +
> +/**
> + * folio_add_file_rmap_pmd - add a PMD mapping to a page range of a folio
> + * @folio:	The folio to add the mapping to
> + * @page:	The first page to add
> + * @vma:	The vm area in which the mapping is added
> + *
> + * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
> + *
> + * The caller needs to hold the page table lock.
> + */
> +void folio_add_file_rmap_pmd(struct folio *folio, struct page *page,
> +		struct vm_area_struct *vma)
> +{
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +	__folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_MODE_PMD);
> +#else
> +	WARN_ON_ONCE(true);
> +#endif
> +}
> +
>  /**
>   * page_add_file_rmap - add pte mapping to a file page
>   * @page:	the page to add the mapping to
> @@ -1411,16 +1434,13 @@ void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
>  		bool compound)
>  {
>  	struct folio *folio = page_folio(page);
> -	unsigned int nr_pages;
>  
>  	VM_WARN_ON_ONCE_PAGE(compound && !PageTransHuge(page), page);
>  
>  	if (likely(!compound))
> -		nr_pages = 1;
> +		folio_add_file_rmap_pte(folio, page, vma);
>  	else
> -		nr_pages = folio_nr_pages(folio);
> -
> -	folio_add_file_rmap_range(folio, page, nr_pages, vma, compound);
> +		folio_add_file_rmap_pmd(folio, page, vma);
>  }
>  
>  /**
David Hildenbrand Dec. 6, 2023, 9:17 a.m. UTC | #4
On 06.12.23 02:30, Yin Fengwei wrote:
> 
> 
> On 12/4/23 22:21, David Hildenbrand wrote:
>> Let's get rid of the compound parameter and instead define implicitly
>> which mappings we're adding. That is more future proof, easier to read
>> and harder to mess up.
>>
>> Use an enum to express the granularity internally. Make the compiler
>> always special-case on the granularity by using __always_inline.
>>
>> Add plenty of sanity checks with CONFIG_DEBUG_VM. Replace the
>> folio_test_pmd_mappable() check by a config check in the caller and
>> sanity checks. Convert the single user of folio_add_file_rmap_range().
>>
>> This function design can later easily be extended to PUDs and to batch
>> PMDs. Note that for now we don't support anything bigger than
>> PMD-sized folios (as we cleanly separated hugetlb handling). Sanity checks
>> will catch if that ever changes.
> I do have a question about folios larger than PMD size in the future:
>     Will the folio size only ever be exactly PMD size or PUD size, or could it also fall
>     between PMD size and PUD size?

I strongly assume that in the future we'll see folios larger than a
single PMD (for example, 4 MiB on x86-64).

This will require quite some care in other areas (and this series, as it
converts some PMD handling functions to folios, further prepares for that).

> 
>     If a size between PMD size and PUD size is possible, will the mapping be a mix of
>     PMD mappings and PTE mappings, or just PTE mappings? I suppose it could be mixed for
>     the efficiency of the page walker.

Depending on the alignment with which such larger folios are mapped into
the page tables, and some other factors, we might indeed end up having
parts of the folio mapped by PMDs and parts by PTEs. Well, and once we
involve PUDs we might have a mixture of all of these :)

The current API here will be able to deal with that (excluding the _pud
variant). To improve performance, we might want PMD batching and introduce
_pmds functions.
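
To sketch what such a _pmds variant could look like, purely hypothetically
and following the pattern of the functions introduced in this patch: the
name and signature are assumptions, and as noted below the rmap internals
and the sanity checks, which currently pin RMAP_MODE_PMD to exactly
HPAGE_PMD_NR pages, would have to be extended before this could work.

void folio_add_file_rmap_pmds(struct folio *folio, struct page *page,
		unsigned int nr_pages, struct vm_area_struct *vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	__folio_add_file_rmap(folio, page, nr_pages, vma, RMAP_MODE_PMD);
#else
	WARN_ON_ONCE(true);
#endif
}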

We'll have to tweak the rmap internals to do the rmap accounting
properly then (and the sanity checks will catch any of that and
highlight the need for rmap-internal extensions); maybe once we come to
that, we'll no longer have these subpage mapcounts, but we'll have to see
if/when/how that happens.

> 
> It may just be too early to consider this now.
> 

I had that in mind while working on this. I assume it will take some
more time to handle everything else that needs to be prepared for that,
but the rmap interface should be able to handle it; only the internals
will have to be extended.


Thanks!
Patch

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 77e336f86c72d..a4a30c361ac50 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -186,6 +186,45 @@  typedef int __bitwise rmap_t;
  */
 #define RMAP_COMPOUND		((__force rmap_t)BIT(1))
 
+/*
+ * Internally, we're using an enum to specify the granularity. Usually,
+ * we make the compiler create specialized variants for the different
+ * granularity.
+ */
+enum rmap_mode {
+	RMAP_MODE_PTE = 0,
+	RMAP_MODE_PMD,
+};
+
+static inline void __folio_rmap_sanity_checks(struct folio *folio,
+		struct page *page, unsigned int nr_pages, enum rmap_mode mode)
+{
+	/* hugetlb folios are handled separately. */
+	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
+	VM_WARN_ON_FOLIO(folio_test_large(folio) &&
+			 !folio_test_large_rmappable(folio), folio);
+
+	VM_WARN_ON_ONCE(!nr_pages || nr_pages > folio_nr_pages(folio));
+	VM_WARN_ON_FOLIO(page_folio(page) != folio, folio);
+	VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio);
+
+	switch (mode) {
+	case RMAP_MODE_PTE:
+		break;
+	case RMAP_MODE_PMD:
+		/*
+		 * We don't support folios larger than a single PMD yet. So
+		 * when RMAP_MODE_PMD is set, we assume that we are creating
+		 * a single "entire" mapping of the folio.
+		 */
+		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
+		VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
+		break;
+	default:
+		VM_WARN_ON_ONCE(true);
+	}
+}
+
 /*
  * rmap interfaces called when adding or removing pte of page
  */
@@ -198,8 +237,12 @@  void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address);
 void page_add_file_rmap(struct page *, struct vm_area_struct *,
 		bool compound);
-void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr,
-		struct vm_area_struct *, bool compound);
+void folio_add_file_rmap_ptes(struct folio *, struct page *, unsigned int nr,
+		struct vm_area_struct *);
+#define folio_add_file_rmap_pte(folio, page, vma) \
+	folio_add_file_rmap_ptes(folio, page, 1, vma)
+void folio_add_file_rmap_pmd(struct folio *, struct page *,
+		struct vm_area_struct *);
 void page_remove_rmap(struct page *, struct vm_area_struct *,
 		bool compound);
 
diff --git a/mm/memory.c b/mm/memory.c
index 1f18ed4a54971..15325587cff01 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4414,7 +4414,7 @@  void set_pte_range(struct vm_fault *vmf, struct folio *folio,
 		folio_add_lru_vma(folio, vma);
 	} else {
 		add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
-		folio_add_file_rmap_range(folio, page, nr, vma, false);
+		folio_add_file_rmap_ptes(folio, page, nr, vma);
 	}
 	set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);
 
diff --git a/mm/rmap.c b/mm/rmap.c
index a735ecca47a81..1614d98062948 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1334,31 +1334,19 @@  void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
 	SetPageAnonExclusive(&folio->page);
 }
 
-/**
- * folio_add_file_rmap_range - add pte mapping to page range of a folio
- * @folio:	The folio to add the mapping to
- * @page:	The first page to add
- * @nr_pages:	The number of pages which will be mapped
- * @vma:	the vm area in which the mapping is added
- * @compound:	charge the page as compound or small page
- *
- * The page range of folio is defined by [first_page, first_page + nr_pages)
- *
- * The caller needs to hold the pte lock.
- */
-void folio_add_file_rmap_range(struct folio *folio, struct page *page,
-			unsigned int nr_pages, struct vm_area_struct *vma,
-			bool compound)
+static __always_inline void __folio_add_file_rmap(struct folio *folio,
+		struct page *page, unsigned int nr_pages,
+		struct vm_area_struct *vma, enum rmap_mode mode)
 {
 	atomic_t *mapped = &folio->_nr_pages_mapped;
 	unsigned int nr_pmdmapped = 0, first;
 	int nr = 0;
 
-	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
-	VM_WARN_ON_FOLIO(compound && !folio_test_pmd_mappable(folio), folio);
+	VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);
+	__folio_rmap_sanity_checks(folio, page, nr_pages, mode);
 
 	/* Is page being mapped by PTE? Is this its first map to be added? */
-	if (likely(!compound)) {
+	if (likely(mode == RMAP_MODE_PTE)) {
 		do {
 			first = atomic_inc_and_test(&page->_mapcount);
 			if (first && folio_test_large(folio)) {
@@ -1369,9 +1357,7 @@  void folio_add_file_rmap_range(struct folio *folio, struct page *page,
 			if (first)
 				nr++;
 		} while (page++, --nr_pages > 0);
-	} else if (folio_test_pmd_mappable(folio)) {
-		/* That test is redundant: it's for safety or to optimize out */
-
+	} else if (mode == RMAP_MODE_PMD) {
 		first = atomic_inc_and_test(&folio->_entire_mapcount);
 		if (first) {
 			nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped);
@@ -1399,6 +1385,43 @@  void folio_add_file_rmap_range(struct folio *folio, struct page *page,
 		mlock_vma_folio(folio, vma);
 }
 
+/**
+ * folio_add_file_rmap_ptes - add PTE mappings to a page range of a folio
+ * @folio:	The folio to add the mappings to
+ * @page:	The first page to add
+ * @nr_pages:	The number of pages that will be mapped using PTEs
+ * @vma:	The vm area in which the mappings are added
+ *
+ * The page range of the folio is defined by [page, page + nr_pages)
+ *
+ * The caller needs to hold the page table lock.
+ */
+void folio_add_file_rmap_ptes(struct folio *folio, struct page *page,
+		unsigned int nr_pages, struct vm_area_struct *vma)
+{
+	__folio_add_file_rmap(folio, page, nr_pages, vma, RMAP_MODE_PTE);
+}
+
+/**
+ * folio_add_file_rmap_pmd - add a PMD mapping to a page range of a folio
+ * @folio:	The folio to add the mapping to
+ * @page:	The first page to add
+ * @vma:	The vm area in which the mapping is added
+ *
+ * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
+ *
+ * The caller needs to hold the page table lock.
+ */
+void folio_add_file_rmap_pmd(struct folio *folio, struct page *page,
+		struct vm_area_struct *vma)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	__folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_MODE_PMD);
+#else
+	WARN_ON_ONCE(true);
+#endif
+}
+
 /**
  * page_add_file_rmap - add pte mapping to a file page
  * @page:	the page to add the mapping to
@@ -1411,16 +1434,13 @@  void page_add_file_rmap(struct page *page, struct vm_area_struct *vma,
 		bool compound)
 {
 	struct folio *folio = page_folio(page);
-	unsigned int nr_pages;
 
 	VM_WARN_ON_ONCE_PAGE(compound && !PageTransHuge(page), page);
 
 	if (likely(!compound))
-		nr_pages = 1;
+		folio_add_file_rmap_pte(folio, page, vma);
 	else
-		nr_pages = folio_nr_pages(folio);
-
-	folio_add_file_rmap_range(folio, page, nr_pages, vma, compound);
+		folio_add_file_rmap_pmd(folio, page, vma);
 }
 
 /**