
[07/13] huge_memory: Allow mappings of PUD sized pages

Message ID bd332b0d3971b03152b3541f97470817c5147b51.1719386613.git-series.apopple@nvidia.com
State New
Series fs/dax: Fix FS DAX page reference counts

Commit Message

Alistair Popple June 27, 2024, 12:54 a.m. UTC
Currently DAX folio/page reference counts are managed differently to
normal pages. To allow these to be managed the same as normal pages
introduce dax_insert_pfn_pud. This will map the entire PUD-sized folio
and take references as it would for a normally mapped page.

This is distinct from the current mechanism, vmf_insert_pfn_pud, which
simply inserts a special devmap PUD entry into the page table without
holding a reference to the page for the mapping.

Signed-off-by: Alistair Popple <apopple@nvidia.com>
---
 include/linux/huge_mm.h |   4 ++-
 include/linux/rmap.h    |  14 +++++-
 mm/huge_memory.c        | 108 ++++++++++++++++++++++++++++++++++++++---
 mm/rmap.c               |  48 ++++++++++++++++++-
 4 files changed, 168 insertions(+), 6 deletions(-)
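
For context, a PUD-sized DAX fault would call the new helper roughly as
sketched below. This is illustrative only: the handler name here is made up,
and the real fs/dax.c wiring is done in a later patch of the series.

/* Illustrative sketch only -- not part of this patch. */
#include <linux/mm.h>
#include <linux/huge_mm.h>
#include <linux/pfn_t.h>

static vm_fault_t example_dax_huge_fault_pud(struct vm_fault *vmf, pfn_t pfn)
{
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	/*
	 * vmf_insert_pfn_pud() installs a special devmap entry without
	 * taking a page reference. dax_insert_pfn_pud() instead takes a
	 * folio reference and an rmap entry, so the PUD-sized folio is
	 * accounted like a normally mapped page.
	 */
	return dax_insert_pfn_pud(vmf, pfn, write);
}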

Comments

kernel test robot June 27, 2024, 10:26 p.m. UTC | #1
Hi Alistair,

kernel test robot noticed the following build errors:

[auto build test ERROR on f2661062f16b2de5d7b6a5c42a9a5c96326b8454]

url:    https://github.com/intel-lab-lkp/linux/commits/Alistair-Popple/mm-gup-c-Remove-redundant-check-for-PCI-P2PDMA-page/20240627-191709
base:   f2661062f16b2de5d7b6a5c42a9a5c96326b8454
patch link:    https://lore.kernel.org/r/bd332b0d3971b03152b3541f97470817c5147b51.1719386613.git-series.apopple%40nvidia.com
patch subject: [PATCH 07/13] huge_memory: Allow mappings of PUD sized pages
config: x86_64-allnoconfig (https://download.01.org/0day-ci/archive/20240628/202406280637.147dyRrV-lkp@intel.com/config)
compiler: clang version 18.1.5 (https://github.com/llvm/llvm-project 617a15a9eac96088ae5e9134248d8236e34b91b1)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240628/202406280637.147dyRrV-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202406280637.147dyRrV-lkp@intel.com/

All errors (new ones prefixed by >>):

>> mm/rmap.c:1513:37: error: call to '__compiletime_assert_279' declared with 'error' attribute: BUILD_BUG failed
    1513 |         __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
         |                                            ^
   include/linux/huge_mm.h:111:26: note: expanded from macro 'HPAGE_PUD_NR'
     111 | #define HPAGE_PUD_NR (1<<HPAGE_PUD_ORDER)
         |                          ^
   include/linux/huge_mm.h:110:26: note: expanded from macro 'HPAGE_PUD_ORDER'
     110 | #define HPAGE_PUD_ORDER (HPAGE_PUD_SHIFT-PAGE_SHIFT)
         |                          ^
   include/linux/huge_mm.h:97:28: note: expanded from macro 'HPAGE_PUD_SHIFT'
      97 | #define HPAGE_PUD_SHIFT ({ BUILD_BUG(); 0; })
         |                            ^
   note: (skipping 3 expansions in backtrace; use -fmacro-backtrace-limit=0 to see all)
   include/linux/compiler_types.h:498:2: note: expanded from macro '_compiletime_assert'
     498 |         __compiletime_assert(condition, msg, prefix, suffix)
         |         ^
   include/linux/compiler_types.h:491:4: note: expanded from macro '__compiletime_assert'
     491 |                         prefix ## suffix();                             \
         |                         ^
   <scratch space>:72:1: note: expanded from here
      72 | __compiletime_assert_279
         | ^
   mm/rmap.c:1660:35: error: call to '__compiletime_assert_280' declared with 'error' attribute: BUILD_BUG failed
    1660 |         __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
         |                                          ^
   include/linux/huge_mm.h:111:26: note: expanded from macro 'HPAGE_PUD_NR'
     111 | #define HPAGE_PUD_NR (1<<HPAGE_PUD_ORDER)
         |                          ^
   include/linux/huge_mm.h:110:26: note: expanded from macro 'HPAGE_PUD_ORDER'
     110 | #define HPAGE_PUD_ORDER (HPAGE_PUD_SHIFT-PAGE_SHIFT)
         |                          ^
   include/linux/huge_mm.h:97:28: note: expanded from macro 'HPAGE_PUD_SHIFT'
      97 | #define HPAGE_PUD_SHIFT ({ BUILD_BUG(); 0; })
         |                            ^
   note: (skipping 3 expansions in backtrace; use -fmacro-backtrace-limit=0 to see all)
   include/linux/compiler_types.h:498:2: note: expanded from macro '_compiletime_assert'
     498 |         __compiletime_assert(condition, msg, prefix, suffix)
         |         ^
   include/linux/compiler_types.h:491:4: note: expanded from macro '__compiletime_assert'
     491 |                         prefix ## suffix();                             \
         |                         ^
   <scratch space>:79:1: note: expanded from here
      79 | __compiletime_assert_280
         | ^
   2 errors generated.


vim +1513 mm/rmap.c

  1498	
  1499	/**
  1500	 * folio_add_file_rmap_pud - add a PUD mapping to a page range of a folio
  1501	 * @folio:	The folio to add the mapping to
  1502	 * @page:	The first page to add
  1503	 * @vma:	The vm area in which the mapping is added
  1504	 *
  1505	 * The page range of the folio is defined by [page, page + HPAGE_PUD_NR)
  1506	 *
  1507	 * The caller needs to hold the page table lock.
  1508	 */
  1509	void folio_add_file_rmap_pud(struct folio *folio, struct page *page,
  1510			struct vm_area_struct *vma)
  1511	{
  1512	#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
> 1513		__folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
  1514	#else
  1515		WARN_ON_ONCE(true);
  1516	#endif
  1517	}
  1518
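
The allnoconfig build has CONFIG_TRANSPARENT_HUGEPAGE disabled, so
HPAGE_PUD_SHIFT expands to BUILD_BUG() even though the
CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD guard around the call is satisfied.
One possible way to avoid this (a sketch only, not necessarily the fix that
was ultimately applied) is to guard the new PUD rmap helpers on both options:

/* Sketch of one possible guard; folio_remove_rmap_pud() would need the same. */
void folio_add_file_rmap_pud(struct folio *folio, struct page *page,
		struct vm_area_struct *vma)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
	__folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
#else
	WARN_ON_ONCE(true);
#endif
}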
David Hildenbrand July 2, 2024, 7:16 a.m. UTC | #2
On 27.06.24 02:54, Alistair Popple wrote:
> Currently DAX folio/page reference counts are managed differently to
> normal pages. To allow these to be managed the same as normal pages
> introduce dax_insert_pfn_pud. This will map the entire PUD-sized folio
> and take references as it would for a normally mapped page.
> 
> This is distinct from the current mechanism, vmf_insert_pfn_pud, which
> simply inserts a special devmap PUD entry into the page table without
> holding a reference to the page for the mapping.

Do we really have to involve mapcounts/rmap for daxfs pages at this 
point? Or is this only "to make it look more like other pages" ?

I'm asking this because:

(A) We don't support mixing PUD+PMD mappings yet. I have plans to change
     that in the future, but for now you can only map using a single PUD
     or by PTEs. I suspect that's good enough for now for dax fs?
(B) As long as we have subpage mapcounts, this prevents vmemmap
     optimizations [1]. Is that only used for device-dax for now and are
     there no plans to make use of that for fs-dax?
(C) We managed without so far :)


Having that said, with folio->_large_mapcount things like 
folio_mapcount() are no longer terribly slow once we would PTE-map a 
PUD-sized folio.

Also, all ZONE_DEVICE pages should currently be marked PG_reserved, 
translating to "don't touch the memmap". I think we might want to tackle 
that first.

[1] https://lwn.net/Articles/860218/
[2] 
https://lkml.kernel.org/r/b0adbb0c-ad59-4bc5-ba0b-0af464b94557@redhat.com

> 
> Signed-off-by: Alistair Popple <apopple@nvidia.com>
> ---
>   include/linux/huge_mm.h |   4 ++-
>   include/linux/rmap.h    |  14 +++++-
>   mm/huge_memory.c        | 108 ++++++++++++++++++++++++++++++++++++++---
>   mm/rmap.c               |  48 ++++++++++++++++++-
>   4 files changed, 168 insertions(+), 6 deletions(-)
> 
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 2aa986a..b98a3cc 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -39,6 +39,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>   
>   vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write);
>   vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
> +vm_fault_t dax_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
>   
>   enum transparent_hugepage_flag {
>   	TRANSPARENT_HUGEPAGE_UNSUPPORTED,
> @@ -106,6 +107,9 @@ extern struct kobj_attribute shmem_enabled_attr;
>   #define HPAGE_PUD_MASK	(~(HPAGE_PUD_SIZE - 1))
>   #define HPAGE_PUD_SIZE	((1UL) << HPAGE_PUD_SHIFT)
>   
> +#define HPAGE_PUD_ORDER (HPAGE_PUD_SHIFT-PAGE_SHIFT)
> +#define HPAGE_PUD_NR (1<<HPAGE_PUD_ORDER)
> +
>   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>   
>   extern unsigned long transparent_hugepage_flags;
> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
> index 7229b9b..c5a0205 100644
> --- a/include/linux/rmap.h
> +++ b/include/linux/rmap.h
> @@ -192,6 +192,7 @@ typedef int __bitwise rmap_t;
>   enum rmap_level {
>   	RMAP_LEVEL_PTE = 0,
>   	RMAP_LEVEL_PMD,
> +	RMAP_LEVEL_PUD,
>   };
>   
>   static inline void __folio_rmap_sanity_checks(struct folio *folio,
> @@ -225,6 +226,13 @@ static inline void __folio_rmap_sanity_checks(struct folio *folio,
>   		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
>   		VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
>   		break;
> +	case RMAP_LEVEL_PUD:
> +		/*
> +		 * Asume that we are creating * a single "entire" mapping of the folio.
> +		 */
> +		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PUD_NR, folio);
> +		VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio);
> +		break;
>   	default:
>   		VM_WARN_ON_ONCE(true);
>   	}
> @@ -248,12 +256,16 @@ void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages,
>   	folio_add_file_rmap_ptes(folio, page, 1, vma)
>   void folio_add_file_rmap_pmd(struct folio *, struct page *,
>   		struct vm_area_struct *);
> +void folio_add_file_rmap_pud(struct folio *, struct page *,
> +		struct vm_area_struct *);
>   void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages,
>   		struct vm_area_struct *);
>   #define folio_remove_rmap_pte(folio, page, vma) \
>   	folio_remove_rmap_ptes(folio, page, 1, vma)
>   void folio_remove_rmap_pmd(struct folio *, struct page *,
>   		struct vm_area_struct *);
> +void folio_remove_rmap_pud(struct folio *, struct page *,
> +		struct vm_area_struct *);
>   
>   void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
>   		unsigned long address, rmap_t flags);
> @@ -338,6 +350,7 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio,
>   		atomic_add(orig_nr_pages, &folio->_large_mapcount);
>   		break;
>   	case RMAP_LEVEL_PMD:
> +	case RMAP_LEVEL_PUD:
>   		atomic_inc(&folio->_entire_mapcount);
>   		atomic_inc(&folio->_large_mapcount);
>   		break;
> @@ -434,6 +447,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
>   		atomic_add(orig_nr_pages, &folio->_large_mapcount);
>   		break;
>   	case RMAP_LEVEL_PMD:
> +	case RMAP_LEVEL_PUD:
>   		if (PageAnonExclusive(page)) {
>   			if (unlikely(maybe_pinned))
>   				return -EBUSY;
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index db7946a..e1f053e 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1283,6 +1283,70 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
>   	return VM_FAULT_NOPAGE;
>   }
>   EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
> +
> +/**
> + * dax_insert_pfn_pud - insert a pud size pfn backed by a normal page
> + * @vmf: Structure describing the fault
> + * @pfn: pfn of the page to insert
> + * @write: whether it's a write fault
> + *
> + * Return: vm_fault_t value.
> + */
> +vm_fault_t dax_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
> +{
> +	struct vm_area_struct *vma = vmf->vma;
> +	unsigned long addr = vmf->address & PUD_MASK;
> +	pud_t *pud = vmf->pud;
> +	pgprot_t prot = vma->vm_page_prot;
> +	struct mm_struct *mm = vma->vm_mm;
> +	pud_t entry;
> +	spinlock_t *ptl;
> +	struct folio *folio;
> +	struct page *page;
> +
> +	if (addr < vma->vm_start || addr >= vma->vm_end)
> +		return VM_FAULT_SIGBUS;
> +
> +	track_pfn_insert(vma, &prot, pfn);
> +
> +	ptl = pud_lock(mm, pud);
> +	if (!pud_none(*pud)) {
> +		if (write) {
> +			if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
> +				WARN_ON_ONCE(!is_huge_zero_pud(*pud));
> +				goto out_unlock;
> +			}
> +			entry = pud_mkyoung(*pud);
> +			entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
> +			if (pudp_set_access_flags(vma, addr, pud, entry, 1))
> +				update_mmu_cache_pud(vma, addr, pud);
> +		}
> +		goto out_unlock;
> +	}
> +
> +	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
> +	if (pfn_t_devmap(pfn))
> +		entry = pud_mkdevmap(entry);
> +	if (write) {
> +		entry = pud_mkyoung(pud_mkdirty(entry));
> +		entry = maybe_pud_mkwrite(entry, vma);
> +	}
> +
> +	page = pfn_t_to_page(pfn);
> +	folio = page_folio(page);
> +	folio_get(folio);
> +	folio_add_file_rmap_pud(folio, page, vma);
> +	add_mm_counter(mm, mm_counter_file(folio), HPAGE_PUD_NR);
> +
> +	set_pud_at(mm, addr, pud, entry);
> +	update_mmu_cache_pud(vma, addr, pud);
> +
> +out_unlock:
> +	spin_unlock(ptl);
> +
> +	return VM_FAULT_NOPAGE;
> +}
> +EXPORT_SYMBOL_GPL(dax_insert_pfn_pud);
>   #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
>   
>   void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
> @@ -1836,7 +1900,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>   			zap_deposited_table(tlb->mm, pmd);
>   		spin_unlock(ptl);
>   	} else if (is_huge_zero_pmd(orig_pmd)) {
> -		zap_deposited_table(tlb->mm, pmd);
> +		if (!vma_is_dax(vma) || arch_needs_pgtable_deposit())
> +			zap_deposited_table(tlb->mm, pmd);
>   		spin_unlock(ptl);
>   	} else {
>   		struct folio *folio = NULL;
> @@ -2268,20 +2333,34 @@ spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
>   int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
>   		 pud_t *pud, unsigned long addr)
>   {
> +	pud_t orig_pud;
>   	spinlock_t *ptl;
>   
>   	ptl = __pud_trans_huge_lock(pud, vma);
>   	if (!ptl)
>   		return 0;
>   
> -	pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
> +	orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
>   	tlb_remove_pud_tlb_entry(tlb, pud, addr);
> -	if (vma_is_special_huge(vma)) {
> +	if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
>   		spin_unlock(ptl);
>   		/* No zero page support yet */
>   	} else {
> -		/* No support for anonymous PUD pages yet */
> -		BUG();
> +		struct page *page = NULL;
> +		struct folio *folio;
> +
> +		/* No support for anonymous PUD pages or migration yet */
> +		BUG_ON(vma_is_anonymous(vma) || !pud_present(orig_pud));
> +
> +		page = pud_page(orig_pud);
> +		folio = page_folio(page);
> +		folio_remove_rmap_pud(folio, page, vma);
> +		VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
> +		VM_BUG_ON_PAGE(!PageHead(page), page);
> +		add_mm_counter(tlb->mm, mm_counter_file(folio), -HPAGE_PUD_NR);
> +
> +		spin_unlock(ptl);
> +		tlb_remove_page_size(tlb, page, HPAGE_PUD_SIZE);
>   	}
>   	return 1;
>   }
> @@ -2289,6 +2368,8 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
>   static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
>   		unsigned long haddr)
>   {
> +	pud_t old_pud;
> +
>   	VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
>   	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
>   	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
> @@ -2296,7 +2377,22 @@ static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
>   
>   	count_vm_event(THP_SPLIT_PUD);
>   
> -	pudp_huge_clear_flush(vma, haddr, pud);
> +	old_pud = pudp_huge_clear_flush(vma, haddr, pud);
> +	if (is_huge_zero_pud(old_pud))
> +		return;
> +
> +	if (vma_is_dax(vma)) {
> +		struct page *page = pud_page(old_pud);
> +		struct folio *folio = page_folio(page);
> +
> +		if (!folio_test_dirty(folio) && pud_dirty(old_pud))
> +			folio_mark_dirty(folio);
> +		if (!folio_test_referenced(folio) && pud_young(old_pud))
> +			folio_set_referenced(folio);
> +		folio_remove_rmap_pud(folio, page, vma);
> +		folio_put(folio);
> +		add_mm_counter(vma->vm_mm, mm_counter_file(folio), -HPAGE_PUD_NR);
> +	}
>   }
>   
>   void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
> diff --git a/mm/rmap.c b/mm/rmap.c
> index e8fc5ec..e949e4f 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1165,6 +1165,7 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
>   		atomic_add(orig_nr_pages, &folio->_large_mapcount);
>   		break;
>   	case RMAP_LEVEL_PMD:
> +	case RMAP_LEVEL_PUD:
>   		first = atomic_inc_and_test(&folio->_entire_mapcount);
>   		if (first) {
>   			nr = atomic_add_return_relaxed(ENTIRELY_MAPPED, mapped);
> @@ -1306,6 +1307,12 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio,
>   		case RMAP_LEVEL_PMD:
>   			SetPageAnonExclusive(page);
>   			break;
> +		case RMAP_LEVEL_PUD:
> +			/*
> +			 * Keep the compiler happy, we don't support anonymous PUD mappings.
> +			 */
> +			WARN_ON_ONCE(1);
> +			break;
>   		}
>   	}
>   	for (i = 0; i < nr_pages; i++) {
> @@ -1489,6 +1496,26 @@ void folio_add_file_rmap_pmd(struct folio *folio, struct page *page,
>   #endif
>   }
>   
> +/**
> + * folio_add_file_rmap_pud - add a PUD mapping to a page range of a folio
> + * @folio:	The folio to add the mapping to
> + * @page:	The first page to add
> + * @vma:	The vm area in which the mapping is added
> + *
> + * The page range of the folio is defined by [page, page + HPAGE_PUD_NR)
> + *
> + * The caller needs to hold the page table lock.
> + */
> +void folio_add_file_rmap_pud(struct folio *folio, struct page *page,
> +		struct vm_area_struct *vma)
> +{
> +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
> +	__folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
> +#else
> +	WARN_ON_ONCE(true);
> +#endif
> +}
> +
>   static __always_inline void __folio_remove_rmap(struct folio *folio,
>   		struct page *page, int nr_pages, struct vm_area_struct *vma,
>   		enum rmap_level level)
> @@ -1521,6 +1548,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
>   		partially_mapped = nr && atomic_read(mapped);
>   		break;
>   	case RMAP_LEVEL_PMD:
> +	case RMAP_LEVEL_PUD:
>   		atomic_dec(&folio->_large_mapcount);
>   		last = atomic_add_negative(-1, &folio->_entire_mapcount);
>   		if (last) {
> @@ -1615,6 +1643,26 @@ void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
>   #endif
>   }
>   
> +/**
> + * folio_remove_rmap_pud - remove a PUD mapping from a page range of a folio
> + * @folio:	The folio to remove the mapping from
> + * @page:	The first page to remove
> + * @vma:	The vm area from which the mapping is removed
> + *
> + * The page range of the folio is defined by [page, page + HPAGE_PUD_NR)
> + *
> + * The caller needs to hold the page table lock.
> + */
> +void folio_remove_rmap_pud(struct folio *folio, struct page *page,
> +		struct vm_area_struct *vma)
> +{
> +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
> +	__folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
> +#else
> +	WARN_ON_ONCE(true);
> +#endif
> +}
> +
>   /*
>    * @arg: enum ttu_flags will be passed to this argument
>    */
Alistair Popple July 2, 2024, 10:19 a.m. UTC | #3
David Hildenbrand <david@redhat.com> writes:

> On 27.06.24 02:54, Alistair Popple wrote:
>> Currently DAX folio/page reference counts are managed differently to
>> normal pages. To allow these to be managed the same as normal pages
>> introduce dax_insert_pfn_pud. This will map the entire PUD-sized folio
>> and take references as it would for a normally mapped page.
>> This is distinct from the current mechanism, vmf_insert_pfn_pud,
>> which
>> simply inserts a special devmap PUD entry into the page table without
>> holding a reference to the page for the mapping.
>
> Do we really have to involve mapcounts/rmap for daxfs pages at this
> point? Or is this only "to make it look more like other pages" ?

The aim of the series is to make FS DAX and other ZONE_DEVICE pages look
like other pages, at least with regards to the way they are refcounted.

At the moment they are not refcounted - instead their refcounts are
basically statically initialised to one and there are all these special
cases and functions requiring magic PTE bits (pXX_devmap) to do the
special DAX reference counting. This then adds some cruft to manage
pgmap references and to catch the 2->1 page refcount transition. All
this just goes away if we manage the page references the same as other
pages (and indeed we already manage DEVICE_PRIVATE and COHERENT pages
the same as normal pages).

So I think to make this work we at least need the mapcounts.
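
To make the contrast concrete, here is a conceptual sketch (not code from
this series; the helper names below are illustrative): the legacy scheme
parks FS DAX refcounts at one and has to detect an idle page on the 2->1
transition, whereas the new scheme uses ordinary folio references that drop
to zero.

#include <linux/mm.h>

/* Conceptual sketch only; names are illustrative, not from the series. */
static bool legacy_dax_page_idle(struct page *page)
{
	/* Legacy scheme: an unmapped FS DAX page still has refcount == 1. */
	return page_ref_count(page) == 1;
}

static void new_scheme_map(struct folio *folio)
{
	/* New scheme: each mapping takes a real folio reference... */
	folio_get(folio);
}

static void new_scheme_unmap(struct folio *folio)
{
	/* ...and drops it on unmap; the folio is idle/freed at refcount zero. */
	folio_put(folio);
}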

> I'm asking this because:
>
> (A) We don't support mixing PUD+PMD mappings yet. I have plans to change
>     that in the future, but for now you can only map using a single PUD
>     or by PTEs. I suspect that's good enoug for now for dax fs?

Yep, that's all we support.

> (B) As long as we have subpage mapcounts, this prevents vmemmap
>     optimizations [1]. Is that only used for device-dax for now and are
>     there no plans to make use of that for fs-dax?

I don't have any plans to. This is purely focussed on refcounting pages
"like normal" so we can get rid of all the DAX special casing.

> (C) We managed without so far :)

Indeed, although Christoph has asked repeatedly ([1], [2] and likely
others) that this gets fixed and I finally got sick of it coming up
every time I need to touch something with ZONE_DEVICE pages :)

Also it removes the need for people to understand the special DAX page
refcounting scheme and ends up removing a bunch of cruft as a bonus:

 59 files changed, 485 insertions(+), 869 deletions(-)

And that's before I clean up all the pgmap reference handling. It also
removes the pXX_trans_huge and pXX_leaf distinction. So we managed, but
things could be better IMHO.

> Having that said, with folio->_large_mapcount things like
> folio_mapcount() are no longer terribly slow once we weould PTE-map a
> PUD-sized folio.
>
> Also, all ZONE_DEVICE pages should currently be marked PG_reserved,
> translating to "don't touch the memmap". I think we might want to
> tackle that first.

Ok. I'm keen to get this series finished and I don't quite get the
connection here, what needs to change there?

[1] - https://lore.kernel.org/linux-mm/20201106080322.GE31341@lst.de/
[2] - https://lore.kernel.org/linux-mm/20220209135351.GA20631@lst.de/

>
> [1] https://lwn.net/Articles/860218/
> [2]
> https://lkml.kernel.org/r/b0adbb0c-ad59-4bc5-ba0b-0af464b94557@redhat.com
>
>> Signed-off-by: Alistair Popple <apopple@nvidia.com>
>> ---
>>   include/linux/huge_mm.h |   4 ++-
>>   include/linux/rmap.h    |  14 +++++-
>>   mm/huge_memory.c        | 108 ++++++++++++++++++++++++++++++++++++++---
>>   mm/rmap.c               |  48 ++++++++++++++++++-
>>   4 files changed, 168 insertions(+), 6 deletions(-)
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index 2aa986a..b98a3cc 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -39,6 +39,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>>     vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn,
>> bool write);
>>   vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
>> +vm_fault_t dax_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
>>     enum transparent_hugepage_flag {
>>   	TRANSPARENT_HUGEPAGE_UNSUPPORTED,
>> @@ -106,6 +107,9 @@ extern struct kobj_attribute shmem_enabled_attr;
>>   #define HPAGE_PUD_MASK	(~(HPAGE_PUD_SIZE - 1))
>>   #define HPAGE_PUD_SIZE	((1UL) << HPAGE_PUD_SHIFT)
>>   +#define HPAGE_PUD_ORDER (HPAGE_PUD_SHIFT-PAGE_SHIFT)
>> +#define HPAGE_PUD_NR (1<<HPAGE_PUD_ORDER)
>> +
>>   #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>     extern unsigned long transparent_hugepage_flags;
>> diff --git a/include/linux/rmap.h b/include/linux/rmap.h
>> index 7229b9b..c5a0205 100644
>> --- a/include/linux/rmap.h
>> +++ b/include/linux/rmap.h
>> @@ -192,6 +192,7 @@ typedef int __bitwise rmap_t;
>>   enum rmap_level {
>>   	RMAP_LEVEL_PTE = 0,
>>   	RMAP_LEVEL_PMD,
>> +	RMAP_LEVEL_PUD,
>>   };
>>     static inline void __folio_rmap_sanity_checks(struct folio
>> *folio,
>> @@ -225,6 +226,13 @@ static inline void __folio_rmap_sanity_checks(struct folio *folio,
>>   		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
>>   		VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
>>   		break;
>> +	case RMAP_LEVEL_PUD:
>> +		/*
>> +		 * Asume that we are creating * a single "entire" mapping of the folio.
>> +		 */
>> +		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PUD_NR, folio);
>> +		VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio);
>> +		break;
>>   	default:
>>   		VM_WARN_ON_ONCE(true);
>>   	}
>> @@ -248,12 +256,16 @@ void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages,
>>   	folio_add_file_rmap_ptes(folio, page, 1, vma)
>>   void folio_add_file_rmap_pmd(struct folio *, struct page *,
>>   		struct vm_area_struct *);
>> +void folio_add_file_rmap_pud(struct folio *, struct page *,
>> +		struct vm_area_struct *);
>>   void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages,
>>   		struct vm_area_struct *);
>>   #define folio_remove_rmap_pte(folio, page, vma) \
>>   	folio_remove_rmap_ptes(folio, page, 1, vma)
>>   void folio_remove_rmap_pmd(struct folio *, struct page *,
>>   		struct vm_area_struct *);
>> +void folio_remove_rmap_pud(struct folio *, struct page *,
>> +		struct vm_area_struct *);
>>     void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct
>> *,
>>   		unsigned long address, rmap_t flags);
>> @@ -338,6 +350,7 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio,
>>   		atomic_add(orig_nr_pages, &folio->_large_mapcount);
>>   		break;
>>   	case RMAP_LEVEL_PMD:
>> +	case RMAP_LEVEL_PUD:
>>   		atomic_inc(&folio->_entire_mapcount);
>>   		atomic_inc(&folio->_large_mapcount);
>>   		break;
>> @@ -434,6 +447,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
>>   		atomic_add(orig_nr_pages, &folio->_large_mapcount);
>>   		break;
>>   	case RMAP_LEVEL_PMD:
>> +	case RMAP_LEVEL_PUD:
>>   		if (PageAnonExclusive(page)) {
>>   			if (unlikely(maybe_pinned))
>>   				return -EBUSY;
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index db7946a..e1f053e 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -1283,6 +1283,70 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
>>   	return VM_FAULT_NOPAGE;
>>   }
>>   EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
>> +
>> +/**
>> + * dax_insert_pfn_pud - insert a pud size pfn backed by a normal page
>> + * @vmf: Structure describing the fault
>> + * @pfn: pfn of the page to insert
>> + * @write: whether it's a write fault
>> + *
>> + * Return: vm_fault_t value.
>> + */
>> +vm_fault_t dax_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
>> +{
>> +	struct vm_area_struct *vma = vmf->vma;
>> +	unsigned long addr = vmf->address & PUD_MASK;
>> +	pud_t *pud = vmf->pud;
>> +	pgprot_t prot = vma->vm_page_prot;
>> +	struct mm_struct *mm = vma->vm_mm;
>> +	pud_t entry;
>> +	spinlock_t *ptl;
>> +	struct folio *folio;
>> +	struct page *page;
>> +
>> +	if (addr < vma->vm_start || addr >= vma->vm_end)
>> +		return VM_FAULT_SIGBUS;
>> +
>> +	track_pfn_insert(vma, &prot, pfn);
>> +
>> +	ptl = pud_lock(mm, pud);
>> +	if (!pud_none(*pud)) {
>> +		if (write) {
>> +			if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
>> +				WARN_ON_ONCE(!is_huge_zero_pud(*pud));
>> +				goto out_unlock;
>> +			}
>> +			entry = pud_mkyoung(*pud);
>> +			entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
>> +			if (pudp_set_access_flags(vma, addr, pud, entry, 1))
>> +				update_mmu_cache_pud(vma, addr, pud);
>> +		}
>> +		goto out_unlock;
>> +	}
>> +
>> +	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
>> +	if (pfn_t_devmap(pfn))
>> +		entry = pud_mkdevmap(entry);
>> +	if (write) {
>> +		entry = pud_mkyoung(pud_mkdirty(entry));
>> +		entry = maybe_pud_mkwrite(entry, vma);
>> +	}
>> +
>> +	page = pfn_t_to_page(pfn);
>> +	folio = page_folio(page);
>> +	folio_get(folio);
>> +	folio_add_file_rmap_pud(folio, page, vma);
>> +	add_mm_counter(mm, mm_counter_file(folio), HPAGE_PUD_NR);
>> +
>> +	set_pud_at(mm, addr, pud, entry);
>> +	update_mmu_cache_pud(vma, addr, pud);
>> +
>> +out_unlock:
>> +	spin_unlock(ptl);
>> +
>> +	return VM_FAULT_NOPAGE;
>> +}
>> +EXPORT_SYMBOL_GPL(dax_insert_pfn_pud);
>>   #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
>>     void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
>> @@ -1836,7 +1900,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
>>   			zap_deposited_table(tlb->mm, pmd);
>>   		spin_unlock(ptl);
>>   	} else if (is_huge_zero_pmd(orig_pmd)) {
>> -		zap_deposited_table(tlb->mm, pmd);
>> +		if (!vma_is_dax(vma) || arch_needs_pgtable_deposit())
>> +			zap_deposited_table(tlb->mm, pmd);
>>   		spin_unlock(ptl);
>>   	} else {
>>   		struct folio *folio = NULL;
>> @@ -2268,20 +2333,34 @@ spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
>>   int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
>>   		 pud_t *pud, unsigned long addr)
>>   {
>> +	pud_t orig_pud;
>>   	spinlock_t *ptl;
>>     	ptl = __pud_trans_huge_lock(pud, vma);
>>   	if (!ptl)
>>   		return 0;
>>   -	pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
>> +	orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
>>   	tlb_remove_pud_tlb_entry(tlb, pud, addr);
>> -	if (vma_is_special_huge(vma)) {
>> +	if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
>>   		spin_unlock(ptl);
>>   		/* No zero page support yet */
>>   	} else {
>> -		/* No support for anonymous PUD pages yet */
>> -		BUG();
>> +		struct page *page = NULL;
>> +		struct folio *folio;
>> +
>> +		/* No support for anonymous PUD pages or migration yet */
>> +		BUG_ON(vma_is_anonymous(vma) || !pud_present(orig_pud));
>> +
>> +		page = pud_page(orig_pud);
>> +		folio = page_folio(page);
>> +		folio_remove_rmap_pud(folio, page, vma);
>> +		VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
>> +		VM_BUG_ON_PAGE(!PageHead(page), page);
>> +		add_mm_counter(tlb->mm, mm_counter_file(folio), -HPAGE_PUD_NR);
>> +
>> +		spin_unlock(ptl);
>> +		tlb_remove_page_size(tlb, page, HPAGE_PUD_SIZE);
>>   	}
>>   	return 1;
>>   }
>> @@ -2289,6 +2368,8 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
>>   static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
>>   		unsigned long haddr)
>>   {
>> +	pud_t old_pud;
>> +
>>   	VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
>>   	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
>>   	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
>> @@ -2296,7 +2377,22 @@ static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
>>     	count_vm_event(THP_SPLIT_PUD);
>>   -	pudp_huge_clear_flush(vma, haddr, pud);
>> +	old_pud = pudp_huge_clear_flush(vma, haddr, pud);
>> +	if (is_huge_zero_pud(old_pud))
>> +		return;
>> +
>> +	if (vma_is_dax(vma)) {
>> +		struct page *page = pud_page(old_pud);
>> +		struct folio *folio = page_folio(page);
>> +
>> +		if (!folio_test_dirty(folio) && pud_dirty(old_pud))
>> +			folio_mark_dirty(folio);
>> +		if (!folio_test_referenced(folio) && pud_young(old_pud))
>> +			folio_set_referenced(folio);
>> +		folio_remove_rmap_pud(folio, page, vma);
>> +		folio_put(folio);
>> +		add_mm_counter(vma->vm_mm, mm_counter_file(folio), -HPAGE_PUD_NR);
>> +	}
>>   }
>>     void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
>> diff --git a/mm/rmap.c b/mm/rmap.c
>> index e8fc5ec..e949e4f 100644
>> --- a/mm/rmap.c
>> +++ b/mm/rmap.c
>> @@ -1165,6 +1165,7 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
>>   		atomic_add(orig_nr_pages, &folio->_large_mapcount);
>>   		break;
>>   	case RMAP_LEVEL_PMD:
>> +	case RMAP_LEVEL_PUD:
>>   		first = atomic_inc_and_test(&folio->_entire_mapcount);
>>   		if (first) {
>>   			nr = atomic_add_return_relaxed(ENTIRELY_MAPPED, mapped);
>> @@ -1306,6 +1307,12 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio,
>>   		case RMAP_LEVEL_PMD:
>>   			SetPageAnonExclusive(page);
>>   			break;
>> +		case RMAP_LEVEL_PUD:
>> +			/*
>> +			 * Keep the compiler happy, we don't support anonymous PUD mappings.
>> +			 */
>> +			WARN_ON_ONCE(1);
>> +			break;
>>   		}
>>   	}
>>   	for (i = 0; i < nr_pages; i++) {
>> @@ -1489,6 +1496,26 @@ void folio_add_file_rmap_pmd(struct folio *folio, struct page *page,
>>   #endif
>>   }
>>   +/**
>> + * folio_add_file_rmap_pud - add a PUD mapping to a page range of a folio
>> + * @folio:	The folio to add the mapping to
>> + * @page:	The first page to add
>> + * @vma:	The vm area in which the mapping is added
>> + *
>> + * The page range of the folio is defined by [page, page + HPAGE_PUD_NR)
>> + *
>> + * The caller needs to hold the page table lock.
>> + */
>> +void folio_add_file_rmap_pud(struct folio *folio, struct page *page,
>> +		struct vm_area_struct *vma)
>> +{
>> +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
>> +	__folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
>> +#else
>> +	WARN_ON_ONCE(true);
>> +#endif
>> +}
>> +
>>   static __always_inline void __folio_remove_rmap(struct folio *folio,
>>   		struct page *page, int nr_pages, struct vm_area_struct *vma,
>>   		enum rmap_level level)
>> @@ -1521,6 +1548,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
>>   		partially_mapped = nr && atomic_read(mapped);
>>   		break;
>>   	case RMAP_LEVEL_PMD:
>> +	case RMAP_LEVEL_PUD:
>>   		atomic_dec(&folio->_large_mapcount);
>>   		last = atomic_add_negative(-1, &folio->_entire_mapcount);
>>   		if (last) {
>> @@ -1615,6 +1643,26 @@ void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
>>   #endif
>>   }
>>   +/**
>> + * folio_remove_rmap_pud - remove a PUD mapping from a page range of a folio
>> + * @folio:	The folio to remove the mapping from
>> + * @page:	The first page to remove
>> + * @vma:	The vm area from which the mapping is removed
>> + *
>> + * The page range of the folio is defined by [page, page + HPAGE_PUD_NR)
>> + *
>> + * The caller needs to hold the page table lock.
>> + */
>> +void folio_remove_rmap_pud(struct folio *folio, struct page *page,
>> +		struct vm_area_struct *vma)
>> +{
>> +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
>> +	__folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
>> +#else
>> +	WARN_ON_ONCE(true);
>> +#endif
>> +}
>> +
>>   /*
>>    * @arg: enum ttu_flags will be passed to this argument
>>    */
David Hildenbrand July 2, 2024, 11:02 a.m. UTC | #4
On 02.07.24 12:19, Alistair Popple wrote:
> 
> David Hildenbrand <david@redhat.com> writes:
> 
>> On 27.06.24 02:54, Alistair Popple wrote:
>>> Currently DAX folio/page reference counts are managed differently to
>>> normal pages. To allow these to be managed the same as normal pages
>>> introduce dax_insert_pfn_pud. This will map the entire PUD-sized folio
>>> and take references as it would for a normally mapped page.
>>> This is distinct from the current mechanism, vmf_insert_pfn_pud,
>>> which
>>> simply inserts a special devmap PUD entry into the page table without
>>> holding a reference to the page for the mapping.
>>
>> Do we really have to involve mapcounts/rmap for daxfs pages at this
>> point? Or is this only "to make it look more like other pages" ?
> 
> The aim of the series is make FS DAX and other ZONE_DEVICE pages look
> like other pages, at least with regards to the way they are refcounted.
> 
> At the moment they are not refcounted - instead their refcounts are
> basically statically initialised to one and there are all these special
> cases and functions requiring magic PTE bits (pXX_devmap) to do the
> special DAX reference counting. This then adds some cruft to manage
> pgmap references and to catch the 2->1 page refcount transition. All
> this just goes away if we manage the page references the same as other
> pages (and indeed we already manage DEVICE_PRIVATE and COHERENT pages
> the same as normal pages).
> 
> So I think to make this work we at least need the mapcounts.
> 

We only really need the mapcounts if we intend to do something like 
folio_mapcount() == folio_ref_count() to detect unexpected folio 
references, and if we have to have things like folio_mapped() working. 
For now that was not required, that's why I am asking.

Background also being that in a distant future folios will be decoupled 
more from other compound pages, and only folios (or "struct anon_folio" 
/ "struct file_folio") would even have mapcounts.

For example, most stuff we map (and refcount!) via vm_insert_page() 
really must stop involving mapcounts. These won't be "ordinary" 
mapcount-tracked folios in the future, they are simply some refcounted 
pages some ordinary driver allocated.

For FS-DAX, if we'll be using the same "struct file_folio" approach as 
for ordinary pageache memory, then this is the right thing to do here.


>> I'm asking this because:
>>
>> (A) We don't support mixing PUD+PMD mappings yet. I have plans to change
>>      that in the future, but for now you can only map using a single PUD
>>      or by PTEs. I suspect that's good enoug for now for dax fs?
> 
> Yep, that's all we support.
> 
>> (B) As long as we have subpage mapcounts, this prevents vmemmap
>>      optimizations [1]. Is that only used for device-dax for now and are
>>      there no plans to make use of that for fs-dax?
> 
> I don't have any plans to. This is purely focussed on refcounting pages
> "like normal" so we can get rid of all the DAX special casing.
> 
>> (C) We managed without so far :)
> 
> Indeed, although Christoph has asked repeatedly ([1], [2] and likely
> others) that this gets fixed and I finally got sick of it coming up
> everytime I need to touch something with ZONE_DEVICE pages :)
> 
> Also it removes the need for people to understand the special DAX page
> recounting scheme and ends up removing a bunch of cruft as a bonus:
> 
>   59 files changed, 485 insertions(+), 869 deletions(-)

I'm not challenging the refcounting scheme. I'm purely asking about 
mapcount handling, which is something related but different.

> 
> And that's before I clean up all the pgmap reference handling. It also
> removes the pXX_trans_huge and pXX_leaf distinction. So we managed, but
> things could be better IMHO.
> 

Again, all nice things.

>> Having that said, with folio->_large_mapcount things like
>> folio_mapcount() are no longer terribly slow once we weould PTE-map a
>> PUD-sized folio.
>>
>> Also, all ZONE_DEVICE pages should currently be marked PG_reserved,
>> translating to "don't touch the memmap". I think we might want to
>> tackle that first.

Missed to add a pointer to [2].

> 
> Ok. I'm keen to get this series finished and I don't quite get the
> connection here, what needs to change there?

include/linux/page-flags.h

"PG_reserved is set for special pages. The "struct page" of such a page 
should in general not be touched (e.g. set dirty) except by its owner. 
Pages marked as PG_reserved include:

...

- Device memory (e.g. PMEM, DAX, HMM)
"

I think we already entered that domain with other ZONE_DEVICE pages 
being returned from vm_normal_folio(), unfortunately. But that really 
must be cleaned up for these pages to not look special anymore.

Agreed that it likely is something that is not blocking this series.
Alistair Popple July 2, 2024, 11:30 a.m. UTC | #5
David Hildenbrand <david@redhat.com> writes:

> On 02.07.24 12:19, Alistair Popple wrote:
>> David Hildenbrand <david@redhat.com> writes:
>> 
>>> On 27.06.24 02:54, Alistair Popple wrote:
>>>> Currently DAX folio/page reference counts are managed differently to
>>>> normal pages. To allow these to be managed the same as normal pages
>>>> introduce dax_insert_pfn_pud. This will map the entire PUD-sized folio
>>>> and take references as it would for a normally mapped page.
>>>> This is distinct from the current mechanism, vmf_insert_pfn_pud,
>>>> which
>>>> simply inserts a special devmap PUD entry into the page table without
>>>> holding a reference to the page for the mapping.
>>>
>>> Do we really have to involve mapcounts/rmap for daxfs pages at this
>>> point? Or is this only "to make it look more like other pages" ?
>> The aim of the series is make FS DAX and other ZONE_DEVICE pages
>> look
>> like other pages, at least with regards to the way they are refcounted.
>> At the moment they are not refcounted - instead their refcounts are
>> basically statically initialised to one and there are all these special
>> cases and functions requiring magic PTE bits (pXX_devmap) to do the
>> special DAX reference counting. This then adds some cruft to manage
>> pgmap references and to catch the 2->1 page refcount transition. All
>> this just goes away if we manage the page references the same as other
>> pages (and indeed we already manage DEVICE_PRIVATE and COHERENT pages
>> the same as normal pages).
>> So I think to make this work we at least need the mapcounts.
>> 
>
> We only really need the mapcounts if we intend to do something like
> folio_mapcount() == folio_ref_count() to detect unexpected folio
> references, and if we have to have things like folio_mapped()
> working. For now that was not required, that's why I am asking.

Oh I see, thanks for pointing that out. In that case I agree, we don't
need the mapcounts. As you say we don't currently need to detect
unexpected references for FS DAX and this series doesn't seek to introduce
any new behaviour/features.

> Background also being that in a distant future folios will be
> decoupled more from other compound pages, and only folios (or "struct
> anon_folio" / "struct file_folio") would even have mapcounts.
>
> For example, most stuff we map (and refcount!) via vm_insert_page()
> really must stop involving mapcounts. These won't be "ordinary"
> mapcount-tracked folios in the future, they are simply some refcounted
> pages some ordinary driver allocated.

Ok, so for FS DAX we should take a reference on the folio for the
mapping but not a mapcount?

> For FS-DAX, if we'll be using the same "struct file_folio" approach as
> for ordinary pageache memory, then this is the right thing to do here.
>
>
>>> I'm asking this because:
>>>
>>> (A) We don't support mixing PUD+PMD mappings yet. I have plans to change
>>>      that in the future, but for now you can only map using a single PUD
>>>      or by PTEs. I suspect that's good enoug for now for dax fs?
>> Yep, that's all we support.
>> 
>>> (B) As long as we have subpage mapcounts, this prevents vmemmap
>>>      optimizations [1]. Is that only used for device-dax for now and are
>>>      there no plans to make use of that for fs-dax?
>> I don't have any plans to. This is purely focussed on refcounting
>> pages
>> "like normal" so we can get rid of all the DAX special casing.
>> 
>>> (C) We managed without so far :)
>> Indeed, although Christoph has asked repeatedly ([1], [2] and likely
>> others) that this gets fixed and I finally got sick of it coming up
>> everytime I need to touch something with ZONE_DEVICE pages :)
>> Also it removes the need for people to understand the special DAX
>> page
>> recounting scheme and ends up removing a bunch of cruft as a bonus:
>>   59 files changed, 485 insertions(+), 869 deletions(-)
>
> I'm not challenging the refcounting scheme. I'm purely asking about
> mapcount handling, which is something related but different.

Got it, thanks. I hadn't quite picked up on the mapcount vs. refcount
distinction you were asking about.

>> And that's before I clean up all the pgmap reference handling. It
>> also
>> removes the pXX_trans_huge and pXX_leaf distinction. So we managed, but
>> things could be better IMHO.
>> 
>
> Again, all nice things.
>
>>> Having that said, with folio->_large_mapcount things like
>>> folio_mapcount() are no longer terribly slow once we weould PTE-map a
>>> PUD-sized folio.
>>>
>>> Also, all ZONE_DEVICE pages should currently be marked PG_reserved,
>>> translating to "don't touch the memmap". I think we might want to
>>> tackle that first.
>
> Missed to add a pointer to [2].
>
>> Ok. I'm keen to get this series finished and I don't quite get the
>> connection here, what needs to change there?
>
> include/linux/page-flags.h
>
> "PG_reserved is set for special pages. The "struct page" of such a
> page should in general not be touched (e.g. set dirty) except by its
> owner. Pages marked as PG_reserved include:
>
> ...
>
> - Device memory (e.g. PMEM, DAX, HMM)
> "
>
> I think we already entered that domain with other ZONE_DEVICE pages
> being returned from vm_normal_folio(), unfortunately. But that really
> must be cleaned up for these pages to not look special anymore.
>
> Agreed that it likely is something that is not blocking this series.

Great. I'd like to see that cleaned up a little too or at least made
more understandable. The first time I looked at this it took a
surprising amount of time to figure out what constituted a "normal"
page so I'd be happy to help. This series does, in a small way, clean
that up by removing the pte_devmap special case.
hch@lst.de July 2, 2024, 11:51 a.m. UTC | #6
On Tue, Jul 02, 2024 at 08:19:01PM +1000, Alistair Popple wrote:
> > (B) As long as we have subpage mapcounts, this prevents vmemmap
> >     optimizations [1]. Is that only used for device-dax for now and are
> >     there no plans to make use of that for fs-dax?
> 
> I don't have any plans to. This is purely focussed on refcounting pages
> "like normal" so we can get rid of all the DAX special casing.
> 
> > (C) We managed without so far :)
> 
> Indeed, although Christoph has asked repeatedly ([1], [2] and likely
> others) that this gets fixed and I finally got sick of it coming up
> everytime I need to touch something with ZONE_DEVICE pages :)
> 
> Also it removes the need for people to understand the special DAX page
> recounting scheme and ends up removing a bunch of cruft as a bonus:
> 
>  59 files changed, 485 insertions(+), 869 deletions(-)
> 
> And that's before I clean up all the pgmap reference handling. It also
> removes the pXX_trans_huge and pXX_leaf distinction. So we managed, but
> things could be better IMHO.

Yes.  I can't wait for this series making the finish line.  There might
be more chance for cleanups and optimizations around ZONE_DEVICE, but
this alone is a huge step forward.
David Hildenbrand July 2, 2024, 1:01 p.m. UTC | #7
On 02.07.24 13:30, Alistair Popple wrote:
> 
> David Hildenbrand <david@redhat.com> writes:
> 
>> On 02.07.24 12:19, Alistair Popple wrote:
>>> David Hildenbrand <david@redhat.com> writes:
>>>
>>>> On 27.06.24 02:54, Alistair Popple wrote:
>>>>> Currently DAX folio/page reference counts are managed differently to
>>>>> normal pages. To allow these to be managed the same as normal pages
>>>>> introduce dax_insert_pfn_pud. This will map the entire PUD-sized folio
>>>>> and take references as it would for a normally mapped page.
>>>>> This is distinct from the current mechanism, vmf_insert_pfn_pud,
>>>>> which
>>>>> simply inserts a special devmap PUD entry into the page table without
>>>>> holding a reference to the page for the mapping.
>>>>
>>>> Do we really have to involve mapcounts/rmap for daxfs pages at this
>>>> point? Or is this only "to make it look more like other pages" ?
>>> The aim of the series is make FS DAX and other ZONE_DEVICE pages
>>> look
>>> like other pages, at least with regards to the way they are refcounted.
>>> At the moment they are not refcounted - instead their refcounts are
>>> basically statically initialised to one and there are all these special
>>> cases and functions requiring magic PTE bits (pXX_devmap) to do the
>>> special DAX reference counting. This then adds some cruft to manage
>>> pgmap references and to catch the 2->1 page refcount transition. All
>>> this just goes away if we manage the page references the same as other
>>> pages (and indeed we already manage DEVICE_PRIVATE and COHERENT pages
>>> the same as normal pages).
>>> So I think to make this work we at least need the mapcounts.
>>>
>>
>> We only really need the mapcounts if we intend to do something like
>> folio_mapcount() == folio_ref_count() to detect unexpected folio
>> references, and if we have to have things like folio_mapped()
>> working. For now that was not required, that's why I am asking.
> 
> Oh I see, thanks for pointing that out. In that case I agree, we don't
> need the mapcounts. As you say we don't currently need to detect
> unexpect references for FS DAX and this series doesn't seek to introduce
> any new behviour/features.
> 
>> Background also being that in a distant future folios will be
>> decoupled more from other compound pages, and only folios (or "struct
>> anon_folio" / "struct file_folio") would even have mapcounts.
>>
>> For example, most stuff we map (and refcount!) via vm_insert_page()
>> really must stop involving mapcounts. These won't be "ordinary"
>> mapcount-tracked folios in the future, they are simply some refcounted
>> pages some ordinary driver allocated.
> 
> Ok, so for FS DAX we should take a reference on the folio for the
> mapping but not a mapcount?

That's what we could do, yes. But if we really just want to track page 
table mappings like we do with other folios (anon/pagecache/shmem), and 
want to keep doing that in the future, then maybe we should just do it 
right away. The rmap code you add will be required in the future either way.

Using that code for device dax would likely be impossible right now due 
to the vmemmap optimizations I guess, if we would end up writing subpage 
mapcounts. But for FSDAX purposes (no vmemmap optimization) it would work.

Patch

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 2aa986a..b98a3cc 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -39,6 +39,7 @@  int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 
 vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write);
 vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
+vm_fault_t dax_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
 
 enum transparent_hugepage_flag {
 	TRANSPARENT_HUGEPAGE_UNSUPPORTED,
@@ -106,6 +107,9 @@  extern struct kobj_attribute shmem_enabled_attr;
 #define HPAGE_PUD_MASK	(~(HPAGE_PUD_SIZE - 1))
 #define HPAGE_PUD_SIZE	((1UL) << HPAGE_PUD_SHIFT)
 
+#define HPAGE_PUD_ORDER (HPAGE_PUD_SHIFT-PAGE_SHIFT)
+#define HPAGE_PUD_NR (1<<HPAGE_PUD_ORDER)
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 
 extern unsigned long transparent_hugepage_flags;
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 7229b9b..c5a0205 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -192,6 +192,7 @@  typedef int __bitwise rmap_t;
 enum rmap_level {
 	RMAP_LEVEL_PTE = 0,
 	RMAP_LEVEL_PMD,
+	RMAP_LEVEL_PUD,
 };
 
 static inline void __folio_rmap_sanity_checks(struct folio *folio,
@@ -225,6 +226,13 @@  static inline void __folio_rmap_sanity_checks(struct folio *folio,
 		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
 		VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
 		break;
+	case RMAP_LEVEL_PUD:
+		/*
+		 * Assume that we are creating a single "entire" mapping of the folio.
+		 */
+		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PUD_NR, folio);
+		VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio);
+		break;
 	default:
 		VM_WARN_ON_ONCE(true);
 	}
@@ -248,12 +256,16 @@  void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages,
 	folio_add_file_rmap_ptes(folio, page, 1, vma)
 void folio_add_file_rmap_pmd(struct folio *, struct page *,
 		struct vm_area_struct *);
+void folio_add_file_rmap_pud(struct folio *, struct page *,
+		struct vm_area_struct *);
 void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages,
 		struct vm_area_struct *);
 #define folio_remove_rmap_pte(folio, page, vma) \
 	folio_remove_rmap_ptes(folio, page, 1, vma)
 void folio_remove_rmap_pmd(struct folio *, struct page *,
 		struct vm_area_struct *);
+void folio_remove_rmap_pud(struct folio *, struct page *,
+		struct vm_area_struct *);
 
 void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
 		unsigned long address, rmap_t flags);
@@ -338,6 +350,7 @@  static __always_inline void __folio_dup_file_rmap(struct folio *folio,
 		atomic_add(orig_nr_pages, &folio->_large_mapcount);
 		break;
 	case RMAP_LEVEL_PMD:
+	case RMAP_LEVEL_PUD:
 		atomic_inc(&folio->_entire_mapcount);
 		atomic_inc(&folio->_large_mapcount);
 		break;
@@ -434,6 +447,7 @@  static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
 		atomic_add(orig_nr_pages, &folio->_large_mapcount);
 		break;
 	case RMAP_LEVEL_PMD:
+	case RMAP_LEVEL_PUD:
 		if (PageAnonExclusive(page)) {
 			if (unlikely(maybe_pinned))
 				return -EBUSY;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index db7946a..e1f053e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1283,6 +1283,70 @@  vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
 	return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
+
+/**
+ * dax_insert_pfn_pud - insert a pud size pfn backed by a normal page
+ * @vmf: Structure describing the fault
+ * @pfn: pfn of the page to insert
+ * @write: whether it's a write fault
+ *
+ * Return: vm_fault_t value.
+ */
+vm_fault_t dax_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	unsigned long addr = vmf->address & PUD_MASK;
+	pud_t *pud = vmf->pud;
+	pgprot_t prot = vma->vm_page_prot;
+	struct mm_struct *mm = vma->vm_mm;
+	pud_t entry;
+	spinlock_t *ptl;
+	struct folio *folio;
+	struct page *page;
+
+	if (addr < vma->vm_start || addr >= vma->vm_end)
+		return VM_FAULT_SIGBUS;
+
+	track_pfn_insert(vma, &prot, pfn);
+
+	ptl = pud_lock(mm, pud);
+	if (!pud_none(*pud)) {
+		if (write) {
+			if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
+				WARN_ON_ONCE(!is_huge_zero_pud(*pud));
+				goto out_unlock;
+			}
+			entry = pud_mkyoung(*pud);
+			entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
+			if (pudp_set_access_flags(vma, addr, pud, entry, 1))
+				update_mmu_cache_pud(vma, addr, pud);
+		}
+		goto out_unlock;
+	}
+
+	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
+	if (pfn_t_devmap(pfn))
+		entry = pud_mkdevmap(entry);
+	if (write) {
+		entry = pud_mkyoung(pud_mkdirty(entry));
+		entry = maybe_pud_mkwrite(entry, vma);
+	}
+
+	page = pfn_t_to_page(pfn);
+	folio = page_folio(page);
+	folio_get(folio);
+	folio_add_file_rmap_pud(folio, page, vma);
+	add_mm_counter(mm, mm_counter_file(folio), HPAGE_PUD_NR);
+
+	set_pud_at(mm, addr, pud, entry);
+	update_mmu_cache_pud(vma, addr, pud);
+
+out_unlock:
+	spin_unlock(ptl);
+
+	return VM_FAULT_NOPAGE;
+}
+EXPORT_SYMBOL_GPL(dax_insert_pfn_pud);
 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
 void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -1836,7 +1900,8 @@  int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			zap_deposited_table(tlb->mm, pmd);
 		spin_unlock(ptl);
 	} else if (is_huge_zero_pmd(orig_pmd)) {
-		zap_deposited_table(tlb->mm, pmd);
+		if (!vma_is_dax(vma) || arch_needs_pgtable_deposit())
+			zap_deposited_table(tlb->mm, pmd);
 		spin_unlock(ptl);
 	} else {
 		struct folio *folio = NULL;
@@ -2268,20 +2333,34 @@  spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
 int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		 pud_t *pud, unsigned long addr)
 {
+	pud_t orig_pud;
 	spinlock_t *ptl;
 
 	ptl = __pud_trans_huge_lock(pud, vma);
 	if (!ptl)
 		return 0;
 
-	pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
+	orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
 	tlb_remove_pud_tlb_entry(tlb, pud, addr);
-	if (vma_is_special_huge(vma)) {
+	if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
 		spin_unlock(ptl);
 		/* No zero page support yet */
 	} else {
-		/* No support for anonymous PUD pages yet */
-		BUG();
+		struct page *page = NULL;
+		struct folio *folio;
+
+		/* No support for anonymous PUD pages or migration yet */
+		BUG_ON(vma_is_anonymous(vma) || !pud_present(orig_pud));
+
+		page = pud_page(orig_pud);
+		folio = page_folio(page);
+		folio_remove_rmap_pud(folio, page, vma);
+		VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
+		VM_BUG_ON_PAGE(!PageHead(page), page);
+		add_mm_counter(tlb->mm, mm_counter_file(folio), -HPAGE_PUD_NR);
+
+		spin_unlock(ptl);
+		tlb_remove_page_size(tlb, page, HPAGE_PUD_SIZE);
 	}
 	return 1;
 }
@@ -2289,6 +2368,8 @@  int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
 static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
 		unsigned long haddr)
 {
+	pud_t old_pud;
+
 	VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
 	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
 	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
@@ -2296,7 +2377,22 @@  static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
 
 	count_vm_event(THP_SPLIT_PUD);
 
-	pudp_huge_clear_flush(vma, haddr, pud);
+	old_pud = pudp_huge_clear_flush(vma, haddr, pud);
+	if (is_huge_zero_pud(old_pud))
+		return;
+
+	if (vma_is_dax(vma)) {
+		struct page *page = pud_page(old_pud);
+		struct folio *folio = page_folio(page);
+
+		if (!folio_test_dirty(folio) && pud_dirty(old_pud))
+			folio_mark_dirty(folio);
+		if (!folio_test_referenced(folio) && pud_young(old_pud))
+			folio_set_referenced(folio);
+		folio_remove_rmap_pud(folio, page, vma);
+		folio_put(folio);
+		add_mm_counter(vma->vm_mm, mm_counter_file(folio), -HPAGE_PUD_NR);
+	}
 }
 
 void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
diff --git a/mm/rmap.c b/mm/rmap.c
index e8fc5ec..e949e4f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1165,6 +1165,7 @@  static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
 		atomic_add(orig_nr_pages, &folio->_large_mapcount);
 		break;
 	case RMAP_LEVEL_PMD:
+	case RMAP_LEVEL_PUD:
 		first = atomic_inc_and_test(&folio->_entire_mapcount);
 		if (first) {
 			nr = atomic_add_return_relaxed(ENTIRELY_MAPPED, mapped);
@@ -1306,6 +1307,12 @@  static __always_inline void __folio_add_anon_rmap(struct folio *folio,
 		case RMAP_LEVEL_PMD:
 			SetPageAnonExclusive(page);
 			break;
+		case RMAP_LEVEL_PUD:
+			/*
+			 * Keep the compiler happy, we don't support anonymous PUD mappings.
+			 */
+			WARN_ON_ONCE(1);
+			break;
 		}
 	}
 	for (i = 0; i < nr_pages; i++) {
@@ -1489,6 +1496,26 @@  void folio_add_file_rmap_pmd(struct folio *folio, struct page *page,
 #endif
 }
 
+/**
+ * folio_add_file_rmap_pud - add a PUD mapping to a page range of a folio
+ * @folio:	The folio to add the mapping to
+ * @page:	The first page to add
+ * @vma:	The vm area in which the mapping is added
+ *
+ * The page range of the folio is defined by [page, page + HPAGE_PUD_NR)
+ *
+ * The caller needs to hold the page table lock.
+ */
+void folio_add_file_rmap_pud(struct folio *folio, struct page *page,
+		struct vm_area_struct *vma)
+{
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+	__folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
+#else
+	WARN_ON_ONCE(true);
+#endif
+}
+
 static __always_inline void __folio_remove_rmap(struct folio *folio,
 		struct page *page, int nr_pages, struct vm_area_struct *vma,
 		enum rmap_level level)
@@ -1521,6 +1548,7 @@  static __always_inline void __folio_remove_rmap(struct folio *folio,
 		partially_mapped = nr && atomic_read(mapped);
 		break;
 	case RMAP_LEVEL_PMD:
+	case RMAP_LEVEL_PUD:
 		atomic_dec(&folio->_large_mapcount);
 		last = atomic_add_negative(-1, &folio->_entire_mapcount);
 		if (last) {
@@ -1615,6 +1643,26 @@  void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
 #endif
 }
 
+/**
+ * folio_remove_rmap_pud - remove a PUD mapping from a page range of a folio
+ * @folio:	The folio to remove the mapping from
+ * @page:	The first page to remove
+ * @vma:	The vm area from which the mapping is removed
+ *
+ * The page range of the folio is defined by [page, page + HPAGE_PUD_NR)
+ *
+ * The caller needs to hold the page table lock.
+ */
+void folio_remove_rmap_pud(struct folio *folio, struct page *page,
+		struct vm_area_struct *vma)
+{
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+	__folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD);
+#else
+	WARN_ON_ONCE(true);
+#endif
+}
+
 /*
  * @arg: enum ttu_flags will be passed to this argument
  */