
hugetlb: simplify hugetlb handling in follow_page_mask

Message ID 20220829234053.159158-1-mike.kravetz@oracle.com (mailing list archive)
State New
Series hugetlb: simplify hugetlb handling in follow_page_mask

Commit Message

Mike Kravetz Aug. 29, 2022, 11:40 p.m. UTC
During discussions of this series [1], it was suggested that hugetlb
handling code in follow_page_mask could be simplified.  At the beginning
of follow_page_mask, there currently is a call to follow_huge_addr which
'may' handle hugetlb pages.  ia64 is the only architecture which provides
a follow_huge_addr routine that does not return error.  Instead, at each
level of the page table a check is made for a hugetlb entry.  If a hugetlb
entry is found, a call to a routine associated with that entry is made.

Currently, there are two checks for hugetlb entries at each page table
level.  The first check is of the form:
	if (p?d_huge())
		page = follow_huge_p?d();
the second check is of the form:
	if (is_hugepd())
		page = follow_huge_pd().

We can replace these checks, as well as the special handling routines
such as follow_huge_p?d() and follow_huge_pd() with a single routine to
handle hugetlb vmas.

A new routine hugetlb_follow_page_mask is called for hugetlb vmas at the
beginning of follow_page_mask.  hugetlb_follow_page_mask will use the
existing routine huge_pte_offset to walk page tables looking for hugetlb
entries.  huge_pte_offset can be overwritten by architectures, and already
handles special cases such as hugepd entries.

[1] https://lore.kernel.org/linux-mm/cover.1661240170.git.baolin.wang@linux.alibaba.com/
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 arch/ia64/mm/hugetlbpage.c    |  15 ---
 arch/powerpc/mm/hugetlbpage.c |  37 --------
 include/linux/hugetlb.h       |  51 ++--------
 mm/gup.c                      |  65 ++-----------
 mm/hugetlb.c                  | 173 +++++++++++-----------------------
 5 files changed, 74 insertions(+), 267 deletions(-)
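
Condensed from the diff below, the resulting flow is roughly the following
(a sketch of the new code paths, not the literal hunks):

	/* mm/gup.c: follow_page_mask() gains a single early hugetlb dispatch */
	if (is_vm_hugetlb_page(vma)) {
		page = hugetlb_follow_page_mask(vma, address, flags);
		if (!page)
			page = no_page_table(vma, flags);
		return page;
	}

	/* mm/hugetlb.c: hugetlb_follow_page_mask() walks via huge_pte_offset() */
	pte = huge_pte_offset(mm, haddr, huge_page_size(h));
	if (!pte)
		return NULL;
	ptl = huge_pte_lock(h, mm, pte);
	entry = huge_ptep_get(pte);
	/* present: try_grab_page(); migration entry: drop the ptl, wait, retry */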

Comments

Baolin Wang Aug. 30, 2022, 1:06 a.m. UTC | #1
Hi Mike,

On 8/30/2022 7:40 AM, Mike Kravetz wrote:
> During discussions of this series [1], it was suggested that hugetlb
> handling code in follow_page_mask could be simplified.  At the beginning
> of follow_page_mask, there currently is a call to follow_huge_addr which
> 'may' handle hugetlb pages.  ia64 is the only architecture which provides
> a follow_huge_addr routine that does not return error.  Instead, at each
> level of the page table a check is made for a hugetlb entry.  If a hugetlb
> entry is found, a call to a routine associated with that entry is made.
> 
> Currently, there are two checks for hugetlb entries at each page table
> level.  The first check is of the form:
> 	if (p?d_huge())
> 		page = follow_huge_p?d();
> the second check is of the form:
> 	if (is_hugepd())
> 		page = follow_huge_pd().
> 
> We can replace these checks, as well as the special handling routines
> such as follow_huge_p?d() and follow_huge_pd() with a single routine to
> handle hugetlb vmas.
> 
> A new routine hugetlb_follow_page_mask is called for hugetlb vmas at the
> beginning of follow_page_mask.  hugetlb_follow_page_mask will use the
> existing routine huge_pte_offset to walk page tables looking for hugetlb
> entries.  huge_pte_offset can be overwritten by architectures, and already
> handles special cases such as hugepd entries.

Could you also mention that this patch will fix the lock issue for 
CONT-PTE/PMD hugetlb by changing to use huge_pte_lock()? That will help 
people understand the issue.
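
(For reference, the lock issue comes down to lock sizing: huge_pte_lock() 
picks the page table lock from the hstate, roughly via the helper below as 
of this kernel, so a CONT-PTE/PMD entry takes mm->page_table_lock instead 
of a mismatched split PMD lock:)

	static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
					struct mm_struct *mm, pte_t *pte)
	{
		if (huge_page_size(h) == PMD_SIZE)
			return pmd_lockptr(mm, (pmd_t *) pte);
		VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
		return &mm->page_table_lock;
	}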

Otherwise the changes look good to me.
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>

> 
> [1] https://lore.kernel.org/linux-mm/cover.1661240170.git.baolin.wang@linux.alibaba.com/
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> ---
>   arch/ia64/mm/hugetlbpage.c    |  15 ---
>   arch/powerpc/mm/hugetlbpage.c |  37 --------
>   include/linux/hugetlb.h       |  51 ++--------
>   mm/gup.c                      |  65 ++-----------
>   mm/hugetlb.c                  | 173 +++++++++++-----------------------
>   5 files changed, 74 insertions(+), 267 deletions(-)
> 
> diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
> index f993cb36c062..380d2f3966c9 100644
> --- a/arch/ia64/mm/hugetlbpage.c
> +++ b/arch/ia64/mm/hugetlbpage.c
> @@ -91,21 +91,6 @@ int prepare_hugepage_range(struct file *file,
>   	return 0;
>   }
>   
> -struct page *follow_huge_addr(struct mm_struct *mm, unsigned long addr, int write)
> -{
> -	struct page *page;
> -	pte_t *ptep;
> -
> -	if (REGION_NUMBER(addr) != RGN_HPAGE)
> -		return ERR_PTR(-EINVAL);
> -
> -	ptep = huge_pte_offset(mm, addr, HPAGE_SIZE);
> -	if (!ptep || pte_none(*ptep))
> -		return NULL;
> -	page = pte_page(*ptep);
> -	page += ((addr & ~HPAGE_MASK) >> PAGE_SHIFT);
> -	return page;
> -}
>   int pmd_huge(pmd_t pmd)
>   {
>   	return 0;
> diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
> index bc84a594ca62..b0e037c75c12 100644
> --- a/arch/powerpc/mm/hugetlbpage.c
> +++ b/arch/powerpc/mm/hugetlbpage.c
> @@ -506,43 +506,6 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
>   	} while (addr = next, addr != end);
>   }
>   
> -struct page *follow_huge_pd(struct vm_area_struct *vma,
> -			    unsigned long address, hugepd_t hpd,
> -			    int flags, int pdshift)
> -{
> -	pte_t *ptep;
> -	spinlock_t *ptl;
> -	struct page *page = NULL;
> -	unsigned long mask;
> -	int shift = hugepd_shift(hpd);
> -	struct mm_struct *mm = vma->vm_mm;
> -
> -retry:
> -	/*
> -	 * hugepage directory entries are protected by mm->page_table_lock
> -	 * Use this instead of huge_pte_lockptr
> -	 */
> -	ptl = &mm->page_table_lock;
> -	spin_lock(ptl);
> -
> -	ptep = hugepte_offset(hpd, address, pdshift);
> -	if (pte_present(*ptep)) {
> -		mask = (1UL << shift) - 1;
> -		page = pte_page(*ptep);
> -		page += ((address & mask) >> PAGE_SHIFT);
> -		if (flags & FOLL_GET)
> -			get_page(page);
> -	} else {
> -		if (is_hugetlb_entry_migration(*ptep)) {
> -			spin_unlock(ptl);
> -			__migration_entry_wait(mm, ptep, ptl);
> -			goto retry;
> -		}
> -	}
> -	spin_unlock(ptl);
> -	return page;
> -}
> -
>   bool __init arch_hugetlb_valid_size(unsigned long size)
>   {
>   	int shift = __ffs(size);
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 852f911d676e..8ea3e5e726e4 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -142,6 +142,8 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
>   			     unsigned long len);
>   int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *,
>   			    struct vm_area_struct *, struct vm_area_struct *);
> +struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
> +				unsigned long address, unsigned int flags);
>   long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
>   			 struct page **, struct vm_area_struct **,
>   			 unsigned long *, unsigned long *, long, unsigned int,
> @@ -202,17 +204,6 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
>   				unsigned long addr, pte_t *ptep);
>   void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
>   				unsigned long *start, unsigned long *end);
> -struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
> -			      int write);
> -struct page *follow_huge_pd(struct vm_area_struct *vma,
> -			    unsigned long address, hugepd_t hpd,
> -			    int flags, int pdshift);
> -struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
> -				pmd_t *pmd, int flags);
> -struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
> -				pud_t *pud, int flags);
> -struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address,
> -			     pgd_t *pgd, int flags);
>   
>   void hugetlb_vma_lock_read(struct vm_area_struct *vma);
>   void hugetlb_vma_unlock_read(struct vm_area_struct *vma);
> @@ -264,6 +255,13 @@ static inline void adjust_range_if_pmd_sharing_possible(
>   {
>   }
>   
> +static struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
> +				unsigned long address, unsigned int flags)
> +{
> +	/* should never happen, but do not want to BUG */
> +	return ERR_PTR(-EINVAL);
> +}
> +
>   static inline long follow_hugetlb_page(struct mm_struct *mm,
>   			struct vm_area_struct *vma, struct page **pages,
>   			struct vm_area_struct **vmas, unsigned long *position,
> @@ -274,12 +272,6 @@ static inline long follow_hugetlb_page(struct mm_struct *mm,
>   	return 0;
>   }
>   
> -static inline struct page *follow_huge_addr(struct mm_struct *mm,
> -					unsigned long address, int write)
> -{
> -	return ERR_PTR(-EINVAL);
> -}
> -
>   static inline int copy_hugetlb_page_range(struct mm_struct *dst,
>   					  struct mm_struct *src,
>   					  struct vm_area_struct *dst_vma,
> @@ -312,31 +304,6 @@ static inline void hugetlb_show_meminfo_node(int nid)
>   {
>   }
>   
> -static inline struct page *follow_huge_pd(struct vm_area_struct *vma,
> -				unsigned long address, hugepd_t hpd, int flags,
> -				int pdshift)
> -{
> -	return NULL;
> -}
> -
> -static inline struct page *follow_huge_pmd(struct mm_struct *mm,
> -				unsigned long address, pmd_t *pmd, int flags)
> -{
> -	return NULL;
> -}
> -
> -static inline struct page *follow_huge_pud(struct mm_struct *mm,
> -				unsigned long address, pud_t *pud, int flags)
> -{
> -	return NULL;
> -}
> -
> -static inline struct page *follow_huge_pgd(struct mm_struct *mm,
> -				unsigned long address, pgd_t *pgd, int flags)
> -{
> -	return NULL;
> -}
> -
>   static inline int prepare_hugepage_range(struct file *file,
>   				unsigned long addr, unsigned long len)
>   {
> diff --git a/mm/gup.c b/mm/gup.c
> index 66d8619e02ad..80ce04a5bae5 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -661,20 +661,6 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
>   	pmdval = READ_ONCE(*pmd);
>   	if (pmd_none(pmdval))
>   		return no_page_table(vma, flags);
> -	if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
> -		page = follow_huge_pmd(mm, address, pmd, flags);
> -		if (page)
> -			return page;
> -		return no_page_table(vma, flags);
> -	}
> -	if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
> -		page = follow_huge_pd(vma, address,
> -				      __hugepd(pmd_val(pmdval)), flags,
> -				      PMD_SHIFT);
> -		if (page)
> -			return page;
> -		return no_page_table(vma, flags);
> -	}
>   retry:
>   	if (!pmd_present(pmdval)) {
>   		/*
> @@ -764,20 +750,6 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
>   	pud = pud_offset(p4dp, address);
>   	if (pud_none(*pud))
>   		return no_page_table(vma, flags);
> -	if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
> -		page = follow_huge_pud(mm, address, pud, flags);
> -		if (page)
> -			return page;
> -		return no_page_table(vma, flags);
> -	}
> -	if (is_hugepd(__hugepd(pud_val(*pud)))) {
> -		page = follow_huge_pd(vma, address,
> -				      __hugepd(pud_val(*pud)), flags,
> -				      PUD_SHIFT);
> -		if (page)
> -			return page;
> -		return no_page_table(vma, flags);
> -	}
>   	if (pud_devmap(*pud)) {
>   		ptl = pud_lock(mm, pud);
>   		page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
> @@ -797,7 +769,6 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
>   				    struct follow_page_context *ctx)
>   {
>   	p4d_t *p4d;
> -	struct page *page;
>   
>   	p4d = p4d_offset(pgdp, address);
>   	if (p4d_none(*p4d))
> @@ -806,14 +777,6 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
>   	if (unlikely(p4d_bad(*p4d)))
>   		return no_page_table(vma, flags);
>   
> -	if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
> -		page = follow_huge_pd(vma, address,
> -				      __hugepd(p4d_val(*p4d)), flags,
> -				      P4D_SHIFT);
> -		if (page)
> -			return page;
> -		return no_page_table(vma, flags);
> -	}
>   	return follow_pud_mask(vma, address, p4d, flags, ctx);
>   }
>   
> @@ -851,10 +814,15 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
>   
>   	ctx->page_mask = 0;
>   
> -	/* make this handle hugepd */
> -	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
> -	if (!IS_ERR(page)) {
> -		WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
> +	/*
> +	 * Call hugetlb_follow_page_mask for hugetlb vmas as it will use
> +	 * special hugetlb page table walking code.  This eliminates the
> +	 * need to check for hugetlb entries in the general walking code.
> +	 */
> +	if (is_vm_hugetlb_page(vma)) {
> +		page = hugetlb_follow_page_mask(vma, address, flags);
> +		if (!page)
> +			page = no_page_table(vma, flags);
>   		return page;
>   	}
>   
> @@ -863,21 +831,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
>   	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
>   		return no_page_table(vma, flags);
>   
> -	if (pgd_huge(*pgd)) {
> -		page = follow_huge_pgd(mm, address, pgd, flags);
> -		if (page)
> -			return page;
> -		return no_page_table(vma, flags);
> -	}
> -	if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
> -		page = follow_huge_pd(vma, address,
> -				      __hugepd(pgd_val(*pgd)), flags,
> -				      PGDIR_SHIFT);
> -		if (page)
> -			return page;
> -		return no_page_table(vma, flags);
> -	}
> -
>   	return follow_p4d_mask(vma, address, pgd, flags, ctx);
>   }
>   
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index d0617d64d718..b3da421ba5be 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -6190,6 +6190,62 @@ static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
>   	return false;
>   }
>   
> +struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
> +				unsigned long address, unsigned int flags)
> +{
> +	struct hstate *h = hstate_vma(vma);
> +	struct mm_struct *mm = vma->vm_mm;
> +	unsigned long haddr = address & huge_page_mask(h);
> +	struct page *page = NULL;
> +	spinlock_t *ptl;
> +	pte_t *pte, entry;
> +
> +	/*
> +	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
> +	 * follow_hugetlb_page().
> +	 */
> +	if (WARN_ON_ONCE(flags & FOLL_PIN))
> +		return NULL;
> +
> +	pte = huge_pte_offset(mm, haddr, huge_page_size(h));
> +	if (!pte)
> +		return NULL;
> +
> +retry:
> +	ptl = huge_pte_lock(h, mm, pte);
> +	entry = huge_ptep_get(pte);
> +	if (pte_present(entry)) {
> +		page = pte_page(entry) +
> +				((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
> +		/*
> +		 * Note that page may be a sub-page, and with vmemmap
> +		 * optimizations the page struct may be read only.
> +		 * try_grab_page() will increase the ref count on the
> +		 * head page, so this will be OK.
> +		 *
> +		 * try_grab_page() should always succeed here, because we hold
> +		 * the ptl lock and have verified pte_present().
> +		 */
> +		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
> +			page = NULL;
> +			goto out;
> +		}
> +	} else {
> +		if (is_hugetlb_entry_migration(entry)) {
> +			spin_unlock(ptl);
> +			__migration_entry_wait_huge(pte, ptl);
> +			goto retry;
> +		}
> +		/*
> +		 * hwpoisoned entry is treated as no_page_table in
> +		 * follow_page_mask().
> +		 */
> +	}
> +out:
> +	spin_unlock(ptl);
> +	return page;
> +}
> +
>   long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>   			 struct page **pages, struct vm_area_struct **vmas,
>   			 unsigned long *position, unsigned long *nr_pages,
> @@ -7140,123 +7196,6 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h)
>    * These functions are overwritable if your architecture needs its own
>    * behavior.
>    */
> -struct page * __weak
> -follow_huge_addr(struct mm_struct *mm, unsigned long address,
> -			      int write)
> -{
> -	return ERR_PTR(-EINVAL);
> -}
> -
> -struct page * __weak
> -follow_huge_pd(struct vm_area_struct *vma,
> -	       unsigned long address, hugepd_t hpd, int flags, int pdshift)
> -{
> -	WARN(1, "hugepd follow called with no support for hugepage directory format\n");
> -	return NULL;
> -}
> -
> -struct page * __weak
> -follow_huge_pmd(struct mm_struct *mm, unsigned long address,
> -		pmd_t *pmd, int flags)
> -{
> -	struct page *page = NULL;
> -	spinlock_t *ptl;
> -	pte_t pte;
> -
> -	/*
> -	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
> -	 * follow_hugetlb_page().
> -	 */
> -	if (WARN_ON_ONCE(flags & FOLL_PIN))
> -		return NULL;
> -
> -retry:
> -	ptl = pmd_lockptr(mm, pmd);
> -	spin_lock(ptl);
> -	/*
> -	 * make sure that the address range covered by this pmd is not
> -	 * unmapped from other threads.
> -	 */
> -	if (!pmd_huge(*pmd))
> -		goto out;
> -	pte = huge_ptep_get((pte_t *)pmd);
> -	if (pte_present(pte)) {
> -		page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
> -		/*
> -		 * try_grab_page() should always succeed here, because: a) we
> -		 * hold the pmd (ptl) lock, and b) we've just checked that the
> -		 * huge pmd (head) page is present in the page tables. The ptl
> -		 * prevents the head page and tail pages from being rearranged
> -		 * in any way. So this page must be available at this point,
> -		 * unless the page refcount overflowed:
> -		 */
> -		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
> -			page = NULL;
> -			goto out;
> -		}
> -	} else {
> -		if (is_hugetlb_entry_migration(pte)) {
> -			spin_unlock(ptl);
> -			__migration_entry_wait_huge((pte_t *)pmd, ptl);
> -			goto retry;
> -		}
> -		/*
> -		 * hwpoisoned entry is treated as no_page_table in
> -		 * follow_page_mask().
> -		 */
> -	}
> -out:
> -	spin_unlock(ptl);
> -	return page;
> -}
> -
> -struct page * __weak
> -follow_huge_pud(struct mm_struct *mm, unsigned long address,
> -		pud_t *pud, int flags)
> -{
> -	struct page *page = NULL;
> -	spinlock_t *ptl;
> -	pte_t pte;
> -
> -	if (WARN_ON_ONCE(flags & FOLL_PIN))
> -		return NULL;
> -
> -retry:
> -	ptl = huge_pte_lock(hstate_sizelog(PUD_SHIFT), mm, (pte_t *)pud);
> -	if (!pud_huge(*pud))
> -		goto out;
> -	pte = huge_ptep_get((pte_t *)pud);
> -	if (pte_present(pte)) {
> -		page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
> -		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
> -			page = NULL;
> -			goto out;
> -		}
> -	} else {
> -		if (is_hugetlb_entry_migration(pte)) {
> -			spin_unlock(ptl);
> -			__migration_entry_wait(mm, (pte_t *)pud, ptl);
> -			goto retry;
> -		}
> -		/*
> -		 * hwpoisoned entry is treated as no_page_table in
> -		 * follow_page_mask().
> -		 */
> -	}
> -out:
> -	spin_unlock(ptl);
> -	return page;
> -}
> -
> -struct page * __weak
> -follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
> -{
> -	if (flags & (FOLL_GET | FOLL_PIN))
> -		return NULL;
> -
> -	return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
> -}
> -
>   int isolate_hugetlb(struct page *page, struct list_head *list)
>   {
>   	int ret = 0;
David Hildenbrand Aug. 30, 2022, 8:11 a.m. UTC | #2
On 30.08.22 01:40, Mike Kravetz wrote:
> During discussions of this series [1], it was suggested that hugetlb
> handling code in follow_page_mask could be simplified.  At the beginning

Feel free to use a Suggested-by if you consider it appropriate.

> of follow_page_mask, there currently is a call to follow_huge_addr which
> 'may' handle hugetlb pages.  ia64 is the only architecture which provides
> a follow_huge_addr routine that does not return error.  Instead, at each
> level of the page table a check is made for a hugetlb entry.  If a hugetlb
> entry is found, a call to a routine associated with that entry is made.
> 
> Currently, there are two checks for hugetlb entries at each page table
> level.  The first check is of the form:
> 	if (p?d_huge())
> 		page = follow_huge_p?d();
> the second check is of the form:
> 	if (is_hugepd())
> 		page = follow_huge_pd().

BTW, what about all this hugepd stuff in mm/pagewalk.c?

Isn't this all dead code as we're essentially routing all hugetlb VMAs
via walk_hugetlb_range? [yes, all that hugepd stuff in generic code that
overcomplicates stuff has been annoying me for a long time]

> 
> We can replace these checks, as well as the special handling routines
> such as follow_huge_p?d() and follow_huge_pd() with a single routine to
> handle hugetlb vmas.
> 
> A new routine hugetlb_follow_page_mask is called for hugetlb vmas at the
> beginning of follow_page_mask.  hugetlb_follow_page_mask will use the
> existing routine huge_pte_offset to walk page tables looking for hugetlb
> entries.  huge_pte_offset can be overwritten by architectures, and already
> handles special cases such as hugepd entries.
> 
> [1] https://lore.kernel.org/linux-mm/cover.1661240170.git.baolin.wang@linux.alibaba.com/
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>

[...]

> +static struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
> +				unsigned long address, unsigned int flags)
> +{
> +	/* should never happen, but do not want to BUG */
> +	return ERR_PTR(-EINVAL);

Should there be a WARN_ON_ONCE() instead or could we use a BUILD_BUG_ON()?

> +}


[...]

> @@ -851,10 +814,15 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
>  
>  	ctx->page_mask = 0;
>  
> -	/* make this handle hugepd */
> -	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
> -	if (!IS_ERR(page)) {
> -		WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
> +	/*
> +	 * Call hugetlb_follow_page_mask for hugetlb vmas as it will use
> +	 * special hugetlb page table walking code.  This eliminates the
> +	 * need to check for hugetlb entries in the general walking code.
> +	 */

Maybe also comment that ordinary GUP never ends up in here and instead
directly uses follow_hugetlb_page(). This is for follow_page() handling
only.

[my suggestion to rename follow_hugetlb_page() still stands ;) ]

> +	if (is_vm_hugetlb_page(vma)) {
> +		page = hugetlb_follow_page_mask(vma, address, flags);
> +		if (!page)
> +			page = no_page_table(vma, flags);
>  		return page;
>  	}
>  
> @@ -863,21 +831,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
>  	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
>  		return no_page_table(vma, flags);
>  
> -	if (pgd_huge(*pgd)) {
> -		page = follow_huge_pgd(mm, address, pgd, flags);
> -		if (page)
> -			return page;
> -		return no_page_table(vma, flags);
> -	}
> -	if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
> -		page = follow_huge_pd(vma, address,
> -				      __hugepd(pgd_val(*pgd)), flags,
> -				      PGDIR_SHIFT);
> -		if (page)
> -			return page;
> -		return no_page_table(vma, flags);
> -	}
> -
>  	return follow_p4d_mask(vma, address, pgd, flags, ctx);
>  }
>  
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index d0617d64d718..b3da421ba5be 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -6190,6 +6190,62 @@ static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
>  	return false;
>  }
>  
> +struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
> +				unsigned long address, unsigned int flags)
> +{
> +	struct hstate *h = hstate_vma(vma);
> +	struct mm_struct *mm = vma->vm_mm;
> +	unsigned long haddr = address & huge_page_mask(h);
> +	struct page *page = NULL;
> +	spinlock_t *ptl;
> +	pte_t *pte, entry;
> +
> +	/*
> +	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
> +	 * follow_hugetlb_page().
> +	 */
> +	if (WARN_ON_ONCE(flags & FOLL_PIN))
> +		return NULL;
> +
> +	pte = huge_pte_offset(mm, haddr, huge_page_size(h));
> +	if (!pte)
> +		return NULL;
> +
> +retry:
> +	ptl = huge_pte_lock(h, mm, pte);
> +	entry = huge_ptep_get(pte);
> +	if (pte_present(entry)) {
> +		page = pte_page(entry) +
> +				((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
> +		/*
> +		 * Note that page may be a sub-page, and with vmemmap
> +		 * optimizations the page struct may be read only.
> +		 * try_grab_page() will increase the ref count on the
> +		 * head page, so this will be OK.
> +		 *
> +		 * try_grab_page() should always succeed here, because we hold
> +		 * the ptl lock and have verified pte_present().
> +		 */
> +		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
> +			page = NULL;
> +			goto out;
> +		}
> +	} else {
> +		if (is_hugetlb_entry_migration(entry)) {
> +			spin_unlock(ptl);
> +			__migration_entry_wait_huge(pte, ptl);
> +			goto retry;
> +		}
> +		/*
> +		 * hwpoisoned entry is treated as no_page_table in
> +		 * follow_page_mask().
> +		 */
> +	}
> +out:
> +	spin_unlock(ptl);
> +	return page;
> +}
> +
>  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>  			 struct page **pages, struct vm_area_struct **vmas,
>  			 unsigned long *position, unsigned long *nr_pages,
> @@ -7140,123 +7196,6 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h)
>   * These functions are overwritable if your architecture needs its own
>   * behavior.
>   */

[...]

Numbers speak for themselves.

Acked-by: David Hildenbrand <david@redhat.com>
Mike Kravetz Aug. 30, 2022, 4:44 p.m. UTC | #3
On 08/30/22 09:06, Baolin Wang wrote:
> Hi Mike,
> 
> On 8/30/2022 7:40 AM, Mike Kravetz wrote:
> > During discussions of this series [1], it was suggested that hugetlb
> > handling code in follow_page_mask could be simplified.  At the beginning
> > of follow_page_mask, there currently is a call to follow_huge_addr which
> > 'may' handle hugetlb pages.  ia64 is the only architecture which provides
> > a follow_huge_addr routine that does not return error.  Instead, at each
> > level of the page table a check is made for a hugetlb entry.  If a hugetlb
> > entry is found, a call to a routine associated with that entry is made.
> > 
> > Currently, there are two checks for hugetlb entries at each page table
> > level.  The first check is of the form:
> > 	if (p?d_huge())
> > 		page = follow_huge_p?d();
> > the second check is of the form:
> > 	if (is_hugepd())
> > 		page = follow_huge_pd().
> > 
> > We can replace these checks, as well as the special handling routines
> > such as follow_huge_p?d() and follow_huge_pd() with a single routine to
> > handle hugetlb vmas.
> > 
> > A new routine hugetlb_follow_page_mask is called for hugetlb vmas at the
> > beginning of follow_page_mask.  hugetlb_follow_page_mask will use the
> > existing routine huge_pte_offset to walk page tables looking for hugetlb
> > entries.  huge_pte_offset can be overwritten by architectures, and already
> > handles special cases such as hugepd entries.
> 
> Could you also mention that this patch will fix the lock issue for
> CONT-PTE/PMD hugetlb by changing to use huge_pte_lock()? which will help
> people to understand the issue.

Will update message in v2.  Thanks for taking a look!

> 
> Otherwise the changes look good to me.
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Mike Kravetz Aug. 30, 2022, 4:52 p.m. UTC | #4
On 08/30/22 10:11, David Hildenbrand wrote:
> On 30.08.22 01:40, Mike Kravetz wrote:
> > During discussions of this series [1], it was suggested that hugetlb
> > handling code in follow_page_mask could be simplified.  At the beginning
> 
> Feel free to use a Suggested-by if you consider it appropriate.
> 
> > of follow_page_mask, there currently is a call to follow_huge_addr which
> > 'may' handle hugetlb pages.  ia64 is the only architecture which provides
> > a follow_huge_addr routine that does not return error.  Instead, at each
> > level of the page table a check is made for a hugetlb entry.  If a hugetlb
> > entry is found, a call to a routine associated with that entry is made.
> > 
> > Currently, there are two checks for hugetlb entries at each page table
> > level.  The first check is of the form:
> > 	if (p?d_huge())
> > 		page = follow_huge_p?d();
> > the second check is of the form:
> > 	if (is_hugepd())
> > 		page = follow_huge_pd().
> 
> BTW, what about all this hugepd stuff in mm/pagewalk.c?
> 
> Isn't this all dead code as we're essentially routing all hugetlb VMAs
> via walk_hugetlb_range? [yes, all that hugepd stuff in generic code that
> overcomplicates stuff has been annoying me for a long time]

I am 'happy' to look at cleaning up that code next.  Perhaps I will just
create a cleanup series.

I just wanted to focus on eliminating the two callouts in generic code mentioned
above: follow_huge_p?d() and follow_huge_pd().

Really looking for input from Aneesh and Naoya as they added much of the
code that is being removed here.

> > 
> > We can replace these checks, as well as the special handling routines
> > such as follow_huge_p?d() and follow_huge_pd() with a single routine to
> > handle hugetlb vmas.
> > 
> > A new routine hugetlb_follow_page_mask is called for hugetlb vmas at the
> > beginning of follow_page_mask.  hugetlb_follow_page_mask will use the
> > existing routine huge_pte_offset to walk page tables looking for hugetlb
> > entries.  huge_pte_offset can be overwritten by architectures, and already
> > handles special cases such as hugepd entries.
> > 
> > [1] https://lore.kernel.org/linux-mm/cover.1661240170.git.baolin.wang@linux.alibaba.com/
> > Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> 
> [...]
> 
> > +static struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
> > +				unsigned long address, unsigned int flags)
> > +{
> > +	/* should never happen, but do not want to BUG */
> > +	return ERR_PTR(-EINVAL);
> 
> Should there be a WARN_ON_ONCE() instead or could we use a BUILD_BUG_ON()?
> 

Ok, I will look into adding one of these.  Prefer a BUILD_BUG_ON().

> > +}
> 
> 
> [...]
> 
> > @@ -851,10 +814,15 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
> >  
> >  	ctx->page_mask = 0;
> >  
> > -	/* make this handle hugepd */
> > -	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
> > -	if (!IS_ERR(page)) {
> > -		WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
> > +	/*
> > +	 * Call hugetlb_follow_page_mask for hugetlb vmas as it will use
> > +	 * special hugetlb page table walking code.  This eliminates the
> > +	 * need to check for hugetlb entries in the general walking code.
> > +	 */
> 
> Maybe also comment that ordinary GUP never ends up in here and instead
> directly uses follow_hugetlb_page(). This is for follow_page() handling
> only.
> 
> [me suggestion to rename follow_hugetlb_page() still stands ;) ]

Will update the comment in v2.

I think renaming follow_hugetlb_page() would be in a separate patch.  Perhaps,
included in a larger cleanup series.  I will not forget. :)

> 
> Numbers speak for themselves.
> 
> Acked-by: David Hildenbrand <david@redhat.com>
> 

Thanks,
Mike Kravetz Aug. 30, 2022, 6:39 p.m. UTC | #5
On 08/30/22 09:44, Mike Kravetz wrote:
> On 08/30/22 09:06, Baolin Wang wrote:
> > Hi Mike,
> > 
> > On 8/30/2022 7:40 AM, Mike Kravetz wrote:
> > > During discussions of this series [1], it was suggested that hugetlb
> > > handling code in follow_page_mask could be simplified.  At the beginning
> > > of follow_page_mask, there currently is a call to follow_huge_addr which
> > > 'may' handle hugetlb pages.  ia64 is the only architecture which provides
> > > a follow_huge_addr routine that does not return error.  Instead, at each
> > > level of the page table a check is made for a hugetlb entry.  If a hugetlb
> > > entry is found, a call to a routine associated with that entry is made.
> > > 
> > > Currently, there are two checks for hugetlb entries at each page table
> > > level.  The first check is of the form:
> > > 	if (p?d_huge())
> > > 		page = follow_huge_p?d();
> > > the second check is of the form:
> > > 	if (is_hugepd())
> > > 		page = follow_huge_pd().
> > > 
> > > We can replace these checks, as well as the special handling routines
> > > such as follow_huge_p?d() and follow_huge_pd() with a single routine to
> > > handle hugetlb vmas.
> > > 
> > > A new routine hugetlb_follow_page_mask is called for hugetlb vmas at the
> > > beginning of follow_page_mask.  hugetlb_follow_page_mask will use the
> > > existing routine huge_pte_offset to walk page tables looking for hugetlb
> > > entries.  huge_pte_offset can be overwritten by architectures, and already
> > > handles special cases such as hugepd entries.
> > 
> > Could you also mention that this patch will fix the lock issue for
> > CONT-PTE/PMD hugetlb by changing to use huge_pte_lock()? which will help
> > people to understand the issue.
> 
> Will update message in v2.  Thanks for taking a look!
> 

One additional thought: we 'may' need a separate, easily backported patch to
fix the locking issues.  Not sure this 'simplification' is a good backport
candidate.
Mike Kravetz Aug. 30, 2022, 9:31 p.m. UTC | #6
On 08/30/22 09:52, Mike Kravetz wrote:
> On 08/30/22 10:11, David Hildenbrand wrote:
> > On 30.08.22 01:40, Mike Kravetz wrote:
> > > During discussions of this series [1], it was suggested that hugetlb
> > > handling code in follow_page_mask could be simplified.  At the beginning
> > 
> > Feel free to use a Suggested-by if you consider it appropriate.
> > 
> > > of follow_page_mask, there currently is a call to follow_huge_addr which
> > > 'may' handle hugetlb pages.  ia64 is the only architecture which provides
> > > a follow_huge_addr routine that does not return error.  Instead, at each
> > > level of the page table a check is made for a hugetlb entry.  If a hugetlb
> > > entry is found, a call to a routine associated with that entry is made.
> > > 
> > > Currently, there are two checks for hugetlb entries at each page table
> > > level.  The first check is of the form:
> > > 	if (p?d_huge())
> > > 		page = follow_huge_p?d();
> > > the second check is of the form:
> > > 	if (is_hugepd())
> > > 		page = follow_huge_pd().
> > 
> > BTW, what about all this hugepd stuff in mm/pagewalk.c?
> > 
> > Isn't this all dead code as we're essentially routing all hugetlb VMAs
> > via walk_hugetlb_range? [yes, all that hugepd stuff in generic code that
> > overcomplicates stuff has been annoying me for a long time]
> 
> I am 'happy' to look at cleaning up that code next.  Perhaps I will just
> create a cleanup series.
> 

Technically, that code is not dead IIUC.  The call to walk_hugetlb_range in
__walk_page_range is as follows:

	if (vma && is_vm_hugetlb_page(vma)) {
		if (ops->hugetlb_entry)
			err = walk_hugetlb_range(start, end, walk);
	} else
		err = walk_pgd_range(start, end, walk);

We also have the interface walk_page_range_novma() that will call
__walk_page_range without a value for vma.  So, in that case we would
end up calling walk_pgd_range, etc.  walk_pgd_range and related routines
do have those checks such as:

		if (is_hugepd(__hugepd(pmd_val(*pmd))))
			err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT);

So, it looks like in this case we would process 'hugepd' entries but not
'normal' hugetlb entries.  That does not seem right.
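
For comparison, a vma-based walk only reaches hugetlb entries through the
hugetlb_entry callback (a minimal sketch assuming the mm_walk_ops interface
in this kernel; the sketch_* names are made up):

	static int sketch_hugetlb_entry(pte_t *pte, unsigned long hmask,
					unsigned long addr, unsigned long next,
					struct mm_walk *walk)
	{
		/* called once per hugetlb entry covering [addr, next) */
		if (!pte_present(huge_ptep_get(pte)))
			return 0;
		/* ... act on the present hugetlb entry ... */
		return 0;
	}

	static const struct mm_walk_ops sketch_ops = {
		.hugetlb_entry	= sketch_hugetlb_entry,
	};

	/*
	 * From some caller, under mmap_lock: hugetlb VMAs are then routed
	 * to walk_hugetlb_range(), never to walk_pgd_range().
	 */
	walk_page_range(mm, start, end, &sketch_ops, NULL);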

Christophe Leroy added this code with commit e17eae2b8399 "mm: pagewalk: fix
walk for hugepage tables".  This was part of the series "Convert powerpc to
GENERIC_PTDUMP".  And, the ptdump code uses the walk_page_range_novma
interface.  So, this code is certainly not dead.

Adding Christophe on Cc:

Christophe, do you know if is_hugepd() is true for all hugetlb entries, not
just hugepd entries?

On systems without hugepd entries, I guess ptdump skips all hugetlb entries.
Sigh!
Baolin Wang Aug. 31, 2022, 1:07 a.m. UTC | #7
On 8/31/2022 2:39 AM, Mike Kravetz wrote:
> On 08/30/22 09:44, Mike Kravetz wrote:
>> On 08/30/22 09:06, Baolin Wang wrote:
>>> Hi Mike,
>>>
>>> On 8/30/2022 7:40 AM, Mike Kravetz wrote:
>>>> [...]
>>>
>>> Could you also mention that this patch will fix the lock issue for
>>> CONT-PTE/PMD hugetlb by changing to use huge_pte_lock()? which will help
>>> people to understand the issue.
>>
>> Will update message in v2.  Thanks for taking a look!
>>
> 
> One additional thought, we 'may' need a separate patch to fix the locking
> issues that can be easily backported.  Not sure this 'simplification' is
> a good backport candidate.

Yes, that was my thought before, but David did not like adding more 
make-legacy-cruft-happy code.

So how about creating a series that contains three patches: picking up patch 
1 and patch 3 of my previous series [1], plus your current patch? That way 
patch 1 and patch 2 in this series can fix the lock issue explicitly and be 
suitable to backport, while patch 3 (which is your current patch) cleans up 
the legacy code.

[1] 
https://lore.kernel.org/all/cover.1661240170.git.baolin.wang@linux.alibaba.com/
kernel test robot Aug. 31, 2022, 5:08 a.m. UTC | #8
Hi Mike,

I love your patch! Yet something to improve:

[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on linus/master v6.0-rc3 next-20220830]
[cannot apply to powerpc/next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Mike-Kravetz/hugetlb-simplify-hugetlb-handling-in-follow_page_mask/20220830-074147
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
config: powerpc-randconfig-r001-20220830 (https://download.01.org/0day-ci/archive/20220831/202208311341.ybNgt0Kz-lkp@intel.com/config)
compiler: powerpc-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/f7dc41c1552ecd1e483a100c8b0921df62980f38
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Mike-Kravetz/hugetlb-simplify-hugetlb-handling-in-follow_page_mask/20220830-074147
        git checkout f7dc41c1552ecd1e483a100c8b0921df62980f38
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=powerpc SHELL=/bin/bash arch/powerpc/kernel/

If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   In file included from arch/powerpc/kernel/setup-common.c:35:
>> include/linux/hugetlb.h:258:21: error: 'hugetlb_follow_page_mask' defined but not used [-Werror=unused-function]
     258 | static struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
         |                     ^~~~~~~~~~~~~~~~~~~~~~~~
   cc1: all warnings being treated as errors


vim +/hugetlb_follow_page_mask +258 include/linux/hugetlb.h

   257	
 > 258	static struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
   259					unsigned long address, unsigned int flags)
   260	{
   261		/* should never happen, but do not want to BUG */
   262		return ERR_PTR(-EINVAL);
   263	}
   264
David Hildenbrand Aug. 31, 2022, 8:07 a.m. UTC | #9
On 30.08.22 23:31, Mike Kravetz wrote:
> On 08/30/22 09:52, Mike Kravetz wrote:
>> On 08/30/22 10:11, David Hildenbrand wrote:
>>> On 30.08.22 01:40, Mike Kravetz wrote:
>>>> During discussions of this series [1], it was suggested that hugetlb
>>>> handling code in follow_page_mask could be simplified.  At the beginning
>>>
>>> Feel free to use a Suggested-by if you consider it appropriate.
>>>
>>>> of follow_page_mask, there currently is a call to follow_huge_addr which
>>>> 'may' handle hugetlb pages.  ia64 is the only architecture which provides
>>>> a follow_huge_addr routine that does not return error.  Instead, at each
>>>> level of the page table a check is made for a hugetlb entry.  If a hugetlb
>>>> entry is found, a call to a routine associated with that entry is made.
>>>>
>>>> Currently, there are two checks for hugetlb entries at each page table
>>>> level.  The first check is of the form:
>>>> 	if (p?d_huge())
>>>> 		page = follow_huge_p?d();
>>>> the second check is of the form:
>>>> 	if (is_hugepd())
>>>> 		page = follow_huge_pd().
>>>
>>> BTW, what about all this hugepd stuff in mm/pagewalk.c?
>>>
>>> Isn't this all dead code as we're essentially routing all hugetlb VMAs
>>> via walk_hugetlb_range? [yes, all that hugepd stuff in generic code that
>>> overcomplicates stuff has been annoying me for a long time]
>>
>> I am 'happy' to look at cleaning up that code next.  Perhaps I will just
>> create a cleanup series.
>>
> 
> Technically, that code is not dead IIUC.  The call to walk_hugetlb_range in
> __walk_page_range is as follows:
> 
> 	if (vma && is_vm_hugetlb_page(vma)) {
> 		if (ops->hugetlb_entry)
> 			err = walk_hugetlb_range(start, end, walk);
> 	} else
> 		err = walk_pgd_range(start, end, walk);
> 
> We also have the interface walk_page_range_novma() that will call
> __walk_page_range without a value for vma.  So, in that case we would
> end up calling walk_pgd_range, etc.  walk_pgd_range and related routines
> do have those checks such as:
> 
> 		if (is_hugepd(__hugepd(pmd_val(*pmd))))
> 			err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT);
> 
> So, it looks like in this case we would process 'hugepd' entries but not
> 'normal' hugetlb entries.  That does not seem right.

:/ walking a hugetlb range without knowing whether it's a hugetlb range
is certainly questionable.


> 
> Christophe Leroy added this code with commit e17eae2b8399 "mm: pagewalk: fix
> walk for hugepage tables".  This was part of the series "Convert powerpc to
> GENERIC_PTDUMP".  And, the ptdump code uses the walk_page_range_novma
> interface.  So, this code is certainly not dead.

Hm, that commit doesn't actually mention how it can happen, what exactly
will happen ("crazy result") and if it ever happened.

> 
> Adding Christophe on Cc:
> 
> Christophe do you know if is_hugepd is true for all hugetlb entries, not
> just hugepd?
> 
> On systems without hugepd entries, I guess ptdump skips all hugetlb entries.
> Sigh!

IIUC, the idea of ptdump_walk_pgd() is to dump page tables even outside
VMAs (for debugging purposes?).

I cannot convince myself that that's a good idea when only holding the
mmap lock in read mode, because we can just see page tables getting
freed concurrently e.g., during concurrent munmap() ... while holding
the mmap lock in read we may only walk inside VMA boundaries.

That then raises the questions if we're only calling this on special MMs
(e.g., init_mm) whereby we cannot really see concurrent munmap() and
where we shouldn't have hugetlb mappings or hugepd entries.
Mike Kravetz Aug. 31, 2022, 8:42 p.m. UTC | #10
On 08/31/22 13:08, kernel test robot wrote:
> Hi Mike,
> 
> I love your patch! Yet something to improve:
> 
> [auto build test ERROR on akpm-mm/mm-everything]
> [also build test ERROR on linus/master v6.0-rc3 next-20220830]
> [cannot apply to powerpc/next]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
> 
> url:    https://github.com/intel-lab-lkp/linux/commits/Mike-Kravetz/hugetlb-simplify-hugetlb-handling-in-follow_page_mask/20220830-074147
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
> config: powerpc-randconfig-r001-20220830 (https://download.01.org/0day-ci/archive/20220831/202208311341.ybNgt0Kz-lkp@intel.com/config)
> compiler: powerpc-linux-gcc (GCC) 12.1.0
> reproduce (this is a W=1 build):
>         wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
>         chmod +x ~/bin/make.cross
>         # https://github.com/intel-lab-lkp/linux/commit/f7dc41c1552ecd1e483a100c8b0921df62980f38
>         git remote add linux-review https://github.com/intel-lab-lkp/linux
>         git fetch --no-tags linux-review Mike-Kravetz/hugetlb-simplify-hugetlb-handling-in-follow_page_mask/20220830-074147
>         git checkout f7dc41c1552ecd1e483a100c8b0921df62980f38
>         # save the config file
>         mkdir build_dir && cp config build_dir/.config
>         COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=powerpc SHELL=/bin/bash arch/powerpc/kernel/
> 
> If you fix the issue, kindly add following tag where applicable
> Reported-by: kernel test robot <lkp@intel.com>
> 
> All errors (new ones prefixed by >>):
> 
>    In file included from arch/powerpc/kernel/setup-common.c:35:
> >> include/linux/hugetlb.h:258:21: error: 'hugetlb_follow_page_mask' defined but not used [-Werror=unused-function]
>      258 | static struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
>          |                     ^~~~~~~~~~~~~~~~~~~~~~~~
>    cc1: all warnings being treated as errors
> 
> 
> vim +/hugetlb_follow_page_mask +258 include/linux/hugetlb.h
> 
>    257	
>  > 258	static struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,

Thanks! That should be,

		static inline  struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
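
That is, the !CONFIG_HUGETLB_PAGE stub from the patch with only the missing
inline keyword added:

	static inline struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
					unsigned long address, unsigned int flags)
	{
		/* should never happen, but do not want to BUG */
		return ERR_PTR(-EINVAL);
	}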
Mike Kravetz Sept. 1, 2022, midnight UTC | #11
On 08/31/22 09:07, Baolin Wang wrote:
> 
> 
> On 8/31/2022 2:39 AM, Mike Kravetz wrote:
> > On 08/30/22 09:44, Mike Kravetz wrote:
> > > On 08/30/22 09:06, Baolin Wang wrote:
> > > > Hi Mike,
> > > > 
> > > > On 8/30/2022 7:40 AM, Mike Kravetz wrote:
> > > > > [...]
> > > > 
> > > > Could you also mention that this patch will fix the lock issue for
> > > > CONT-PTE/PMD hugetlb by changing to use huge_pte_lock()? which will help
> > > > people to understand the issue.
> > > 
> > > Will update message in v2.  Thanks for taking a look!
> > > 
> > 
> > One additional thought, we 'may' need a separate patch to fix the locking
> > issues that can be easily backported.  Not sure this 'simplification' is
> > a good backport candidate.
> 
> Yes, that was my thought before, but David did not like adding more
> make-legacy-cruft-happy code.
> 
> So how about creating a series that contains 3 patches: picking up patch 1
> and patch 3 of my previous series [1], and your current patch? That means
> patch 1 and patch 2 in this series can fix the lock issue explicitly and be
> suitable to backport, meanwhile patch 3 (which is your current patch) will
> cleanup the legacy code.
> 

When I looked at patch 3, I was thinking the updated follow_huge_pmd routine
would work for the PTE level with a few more modifications.  Perhaps this is
too ugly, but it is a smaller set of changes for backport.

Of course, this would be followed up with the simplification patch which
removes all this code.
Baolin Wang Sept. 1, 2022, 1:24 a.m. UTC | #12
On 9/1/2022 8:00 AM, Mike Kravetz wrote:
> On 08/31/22 09:07, Baolin Wang wrote:
>>
>>
>> On 8/31/2022 2:39 AM, Mike Kravetz wrote:
>>> On 08/30/22 09:44, Mike Kravetz wrote:
>>>> On 08/30/22 09:06, Baolin Wang wrote:
>>>>> Hi Mike,
>>>>>
>>>>> On 8/30/2022 7:40 AM, Mike Kravetz wrote:
>>>>>> [...]
>>>>>
>>>>> Could you also mention that this patch will fix the lock issue for
>>>>> CONT-PTE/PMD hugetlb by changing to use huge_pte_lock()? which will help
>>>>> people to understand the issue.
>>>>
>>>> Will update message in v2.  Thanks for taking a look!
>>>>
>>>
>>> One additional thought, we 'may' need a separate patch to fix the locking
>>> issues that can be easily backported.  Not sure this 'simplification' is
>>> a good backport candidate.
>>
>> Yes, that was my thought before, but David did not like adding more
>> make-legacy-cruft-happy code.
>>
>> So how about creating a series that contains 3 patches: picking up patch 1
>> and patch 3 of my previous series [1], and your current patch? That means
>> patch 1 and patch 2 in this series can fix the lock issue explicitly and be
>> suitable to backport, meanwhile patch 3 (which is your current patch) will
>> cleanup the legacy code.
>>
> 
> When I looked at patch 3, I was thinking the update follow_huge_pmd routine
> would work for the PTE level with a few more modifications.  Perhaps, this is
> too ugly but it is a smaller set of changes for backport.
> 
> Of course, this would be followed up with the simplification patch which
> removes all this code.

Yes, that looks simpler. I can send you a formal patch with your 
suggestion, which can be added to your cleanup series. Thanks.
David Hildenbrand Sept. 1, 2022, 6:59 a.m. UTC | #13
On 01.09.22 03:24, Baolin Wang wrote:
> 
> 
> On 9/1/2022 8:00 AM, Mike Kravetz wrote:
>> On 08/31/22 09:07, Baolin Wang wrote:
>>>
>>>
>>> On 8/31/2022 2:39 AM, Mike Kravetz wrote:
>>>> On 08/30/22 09:44, Mike Kravetz wrote:
>>>>> On 08/30/22 09:06, Baolin Wang wrote:
>>>>>> Hi Mike,
>>>>>>
>>>>>> On 8/30/2022 7:40 AM, Mike Kravetz wrote:
>>>>>>> [...]
>>>>>>
>>>>>> Could you also mention that this patch will fix the lock issue for
>>>>>> CONT-PTE/PMD hugetlb by changing to use huge_pte_lock()? which will help
>>>>>> people to understand the issue.
>>>>>
>>>>> Will update message in v2.  Thanks for taking a look!
>>>>>
>>>>
>>>> One additional thought, we 'may' need a separate patch to fix the locking
>>>> issues that can be easily backported.  Not sure this 'simplification' is
>>>> a good backport candidate.
>>>
>>> Yes, that was my thought before, but David did not like adding more
>>> make-legacy-cruft-happy code.
>>>
>>> So how about creating a series that contains 3 patches: picking up patch 1
>>> and patch 3 of my previous series [1], and your current patch? That means
>>> patch 1 and patch 2 in this series can fix the lock issue explicitly and be
>>> suitable to backport, meanwhile patch 3 (which is your current patch) will
>>> cleanup the legacy code.
>>>
>>
>> When I looked at patch 3, I was thinking the update follow_huge_pmd routine
>> would work for the PTE level with a few more modifications.  Perhaps, this is
>> too ugly but it is a smaller set of changes for backport.
>>
>> Of course, this would be followed up with the simplification patch which
>> removes all this code.
> 
> Yes, looks more simple. I can send you a formal patch with your 
> suggestion, which can be added into your cleanup series. Thanks.

As an alternative, we can have a stable-only version that does that.
Baolin Wang Sept. 1, 2022, 10:40 a.m. UTC | #14
On 9/1/2022 2:59 PM, David Hildenbrand wrote:
> On 01.09.22 03:24, Baolin Wang wrote:
>>
>>
>> On 9/1/2022 8:00 AM, Mike Kravetz wrote:
>>> On 08/31/22 09:07, Baolin Wang wrote:
>>>>
>>>>
>>>> On 8/31/2022 2:39 AM, Mike Kravetz wrote:
>>>>> On 08/30/22 09:44, Mike Kravetz wrote:
>>>>>> On 08/30/22 09:06, Baolin Wang wrote:
>>>>>>> Hi Mike,
>>>>>>>
>>>>>>> On 8/30/2022 7:40 AM, Mike Kravetz wrote:
>>>>>>>> [...]
>>>>>>>
>>>>>>> Could you also mention that this patch will fix the lock issue for
>>>>>>> CONT-PTE/PMD hugetlb by changing to use huge_pte_lock()? which will help
>>>>>>> people to understand the issue.
>>>>>>
>>>>>> Will update message in v2.  Thanks for taking a look!
>>>>>>
>>>>>
>>>>> One additional thought, we 'may' need a separate patch to fix the locking
>>>>> issues that can be easily backported.  Not sure this 'simplification' is
>>>>> a good backport candidate.
>>>>
>>>> Yes, that was my thought before, but David did not like adding more
>>>> make-legacy-cruft-happy code.
>>>>
>>>> So how about creating a series that contains 3 patches: picking up patch 1
>>>> and patch 3 of my previous series [1], and your current patch? That means
>>>> patch 1 and patch 2 in this series can fix the lock issue explicitly and be
>>>> suitable to backport, meanwhile patch 3 (which is your current patch) will
>>>> cleanup the legacy code.
>>>>
>>>
>>> When I looked at patch 3, I was thinking the update follow_huge_pmd routine
>>> would work for the PTE level with a few more modifications.  Perhaps, this is
>>> too ugly but it is a smaller set of changes for backport.
>>>
>>> Of course, this would be followed up with the simplification patch which
>>> removes all this code.
>>
>> Yes, looks more simple. I can send you a formal patch with your
>> suggestion, which can be added into your cleanup series. Thanks.
> 
> As an alternative, we can have a stable-only version that does that.

But per stable-kernel-rules, we should follow "It or an equivalent fix
must already exist in Linus' tree (upstream)."
Mike Kravetz Sept. 1, 2022, 4:19 p.m. UTC | #15
On 08/29/22 16:40, Mike Kravetz wrote:
> A new routine hugetlb_follow_page_mask is called for hugetlb vmas at the
> beginning of follow_page_mask.  hugetlb_follow_page_mask will use the
> existing routine huge_pte_offset to walk page tables looking for hugetlb
> entries.  huge_pte_offset can be overwritten by architectures, and already
> handles special cases such as hugepd entries.
> 
<snip>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index d0617d64d718..b3da421ba5be 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -6190,6 +6190,62 @@ static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
>  	return false;
>  }
>  
> +struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
> +				unsigned long address, unsigned int flags)
> +{
> +	struct hstate *h = hstate_vma(vma);
> +	struct mm_struct *mm = vma->vm_mm;
> +	unsigned long haddr = address & huge_page_mask(h);
> +	struct page *page = NULL;
> +	spinlock_t *ptl;
> +	pte_t *pte, entry;
> +
> +	/*
> +	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
> +	 * follow_hugetlb_page().
> +	 */
> +	if (WARN_ON_ONCE(flags & FOLL_PIN))
> +		return NULL;
> +
> +	pte = huge_pte_offset(mm, haddr, huge_page_size(h));
> +	if (!pte)
> +		return NULL;
> +
> +retry:
> +	ptl = huge_pte_lock(h, mm, pte);

I can't believe I forgot about huge pmd sharing as described here!!!
https://lore.kernel.org/linux-mm/20220824175757.20590-1-mike.kravetz@oracle.com/

The above series is in Andrew's tree, and we should add 'vma locking' calls
to this routine.

Do note that the existing page walking code can race with pmd unsharing.
I would NOT suggest trying to address this in stable releases.  To date,
I am unaware of any issues caused by races with pmd unsharing.  Trying
to take this into account in 'generic page walking code' could get ugly.
Since hugetlb_follow_page_mask will be a special callout for hugetlb page
table walking, we can easily add the required locking and address the
potential race issue.  This will be in v2.
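
A rough sketch of what the added locking could look like on top of this
patch (reusing the hugetlb_vma_lock_read()/hugetlb_vma_unlock_read()
helpers from that series; illustrative only, not the actual v2):

@@ hugetlb_follow_page_mask() @@
 	if (WARN_ON_ONCE(flags & FOLL_PIN))
 		return NULL;
 
+	/* block concurrent huge_pmd_unshare() from freeing the page table */
+	hugetlb_vma_lock_read(vma);
 	pte = huge_pte_offset(mm, haddr, huge_page_size(h));
-	if (!pte)
+	if (!pte) {
+		hugetlb_vma_unlock_read(vma);
 		return NULL;
+	}
@@
 out:
 	spin_unlock(ptl);
+	hugetlb_vma_unlock_read(vma);
 	return page;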

Still hoping to get some feedback from Aneesh and Naoya about this approach.
Mike Kravetz Sept. 2, 2022, 6:50 p.m. UTC | #16
On 08/31/22 10:07, David Hildenbrand wrote:
> On 30.08.22 23:31, Mike Kravetz wrote:
> > On 08/30/22 09:52, Mike Kravetz wrote:
> >> On 08/30/22 10:11, David Hildenbrand wrote:
> >>> On 30.08.22 01:40, Mike Kravetz wrote:
> >>>> During discussions of this series [1], it was suggested that hugetlb
> >>>> handling code in follow_page_mask could be simplified.  At the beginning
> >>>
> >>> Feel free to use a Suggested-by if you consider it appropriate.
> >>>
> >>>> of follow_page_mask, there currently is a call to follow_huge_addr which
> >>>> 'may' handle hugetlb pages.  ia64 is the only architecture which provides
> >>>> a follow_huge_addr routine that does not return error.  Instead, at each
> >>>> level of the page table a check is made for a hugetlb entry.  If a hugetlb
> >>>> entry is found, a call to a routine associated with that entry is made.
> >>>>
> >>>> Currently, there are two checks for hugetlb entries at each page table
> >>>> level.  The first check is of the form:
> >>>> 	if (p?d_huge())
> >>>> 		page = follow_huge_p?d();
> >>>> the second check is of the form:
> >>>> 	if (is_hugepd())
> >>>> 		page = follow_huge_pd().
> >>>
> >>> BTW, what about all this hugepd stuff in mm/pagewalk.c?
> >>>
> >>> Isn't this all dead code as we're essentially routing all hugetlb VMAs
> >>> via walk_hugetlb_range? [yes, all that hugepd stuff in generic code that
> >>> overcomplicates stuff has been annoying me for a long time]
> >>
> >> I am 'happy' to look at cleaning up that code next.  Perhaps I will just
> >> create a cleanup series.
> >>
> > 
> > Technically, that code is not dead IIUC.  The call to walk_hugetlb_range in
> > __walk_page_range is as follows:
> > 
> > 	if (vma && is_vm_hugetlb_page(vma)) {
> > 		if (ops->hugetlb_entry)
> > 			err = walk_hugetlb_range(start, end, walk);
> > 	} else
> > 		err = walk_pgd_range(start, end, walk);
> > 
> > We also have the interface walk_page_range_novma() that will call
> > __walk_page_range without a value for vma.  So, in that case we would
> > end up calling walk_pgd_range, etc.  walk_pgd_range and related routines
> > do have those checks such as:
> > 
> > 		if (is_hugepd(__hugepd(pmd_val(*pmd))))
> > 			err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT);
> > 
> > So, it looks like in this case we would process 'hugepd' entries but not
> > 'normal' hugetlb entries.  That does not seem right.
> 
> :/ walking a hugetlb range without knowing whether it's a hugetlb range
> is certainly questionable.
> 
> 
> > 
> > Christophe Leroy added this code with commit e17eae2b8399 "mm: pagewalk: fix
> > walk for hugepage tables".  This was part of the series "Convert powerpc to
> > GENERIC_PTDUMP".  And, the ptdump code uses the walk_page_range_novma
> > interface.  So, this code is certainly not dead.
> 
> Hm, that commit doesn't actually mention how it can happen, what exactly
> will happen ("crazy result") and if it ever happened.
> 
> > 
> > Adding Christophe on Cc:
> > 
> > Christophe do you know if is_hugepd is true for all hugetlb entries, not
> > just hugepd?
> > 
> > On systems without hugepd entries, I guess ptdump skips all hugetlb entries.
> > Sigh!
> 
> IIUC, the idea of ptdump_walk_pgd() is to dump page tables even outside
> VMAs (for debugging purposes?).
> 
> I cannot convince myself that that's a good idea when only holding the
> mmap lock in read mode, because we can just see page tables getting
> freed concurrently e.g., during concurrent munmap() ... while holding
> the mmap lock in read we may only walk inside VMA boundaries.
> 
> That then raises the questions if we're only calling this on special MMs
> (e.g., init_mm) whereby we cannot really see concurrent munmap() and
> where we shouldn't have hugetlb mappings or hugepd entries.
> 

This is going to require a little more thought.

Since Baolin's patch for stable releases is moving forward, I want to
get the cleanup provided by this patch in ASAP.  So, I am going to rebase
this patch on Baolin's with the other fixups.

Will come back to this cleanup later.
David Hildenbrand Sept. 2, 2022, 6:52 p.m. UTC | #17
>>> Adding Christophe on Cc:
>>>
>>> Christophe do you know if is_hugepd is true for all hugetlb entries, not
>>> just hugepd?
>>>
>>> On systems without hugepd entries, I guess ptdump skips all hugetlb entries.
>>> Sigh!
>>
>> IIUC, the idea of ptdump_walk_pgd() is to dump page tables even outside
>> VMAs (for debugging purposes?).
>>
>> I cannot convince myself that that's a good idea when only holding the
>> mmap lock in read mode, because we can just see page tables getting
>> freed concurrently e.g., during concurrent munmap() ... while holding
>> the mmap lock in read we may only walk inside VMA boundaries.
>>
>> That then raises the questions if we're only calling this on special MMs
>> (e.g., init_mm) whereby we cannot really see concurrent munmap() and
>> where we shouldn't have hugetlb mappings or hugepd entries.
>>
> 
> This is going to require a little more thought.
> 
> Since Baolin's patch for stable releases is moving forward, I want to
> get the cleanup provided by this patch in ASAP.  So, I am going to rebase
> this patch on Baolin's with the other fixups.
> 
> Will come back to this cleanup later.

Sure, no need to do it all at once (I was just bringing it up while
thinking about it).
Christophe Leroy Sept. 3, 2022, 6:59 a.m. UTC | #18
Le 02/09/2022 à 20:52, David Hildenbrand a écrit :
>>>> Adding Christophe on Cc:
>>>>
>>>> Christophe do you know if is_hugepd is true for all hugetlb entries, not
>>>> just hugepd?

is_hugepd() is true if and only if the directory entry points to a huge 
page directory and not to the normal lower level directory.

As far as I understand if the directory entry is not pointing to any 
lower directory but is a huge page entry, pXd_leaf() is true.
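
So a generic walker that wants to handle both cases ends up with a check
pattern roughly like this at each level (just a sketch to illustrate the
distinction; the helpers called in the branches are placeholders):

	if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
		/* entry points to a huge page directory, not a page table */
		walk_hugepd_entries();
	} else if (pgd_leaf(*pgd)) {
		/* the entry itself maps a huge page, there is no lower table */
		note_huge_mapping();
	} else {
		/* normal case: entry points to the lower level directory */
		walk_next_level();
	}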


>>>>
>>>> On systems without hugepd entries, I guess ptdump skips all hugetlb entries.
>>>> Sigh!

As far as I can see, ptdump_pXd_entry() handles the pXd_leaf() case.

>>>
>>> IIUC, the idea of ptdump_walk_pgd() is to dump page tables even outside
>>> VMAs (for debugging purposes?).
>>>
>>> I cannot convince myself that that's a good idea when only holding the
>>> mmap lock in read mode, because we can just see page tables getting
>>> freed concurrently e.g., during concurrent munmap() ... while holding
>>> the mmap lock in read we may only walk inside VMA boundaries.
>>>
>>> That then raises the questions if we're only calling this on special MMs
>>> (e.g., init_mm) whereby we cannot really see concurrent munmap() and
>>> where we shouldn't have hugetlb mappings or hugepd entries.

At least on powerpc, PTDUMP handles only init_mm.

Huge pages are used at least on powerpc 8xx for linear memory mapping, see

commit 34536d780683 ("powerpc/8xx: Add a function to early map kernel 
via huge pages")
commit cf209951fa7f ("powerpc/8xx: Map linear memory with huge pages")

hugepds may also be used in the future to use huge pages for vmap and 
vmalloc, see commit a6a8f7c4aa7e ("powerpc/8xx: add support for huge 
pages on VMAP and VMALLOC")

As far as I know, ppc64 also uses huge pages for VMAP and VMALLOC, see

commit d909f9109c30 ("powerpc/64s/radix: Enable HAVE_ARCH_HUGE_VMAP")
commit 8abddd968a30 ("powerpc/64s/radix: Enable huge vmalloc mappings")

Christophe
Christophe Leroy Sept. 3, 2022, 7:07 a.m. UTC | #19
+Resending with valid powerpc list address

Le 02/09/2022 à 20:52, David Hildenbrand a écrit :
>>>> Adding Christophe on Cc:
>>>>
>>>> Christophe do you know if is_hugepd is true for all hugetlb entries, not
>>>> just hugepd?

is_hugepd() is true if and only if the directory entry points to a huge 
page directory and not to the normal lower level directory.

As far as I understand if the directory entry is not pointing to any 
lower directory but is a huge page entry, pXd_leaf() is true.


>>>>
>>>> On systems without hugepd entries, I guess ptdump skips all hugetlb entries.
>>>> Sigh!

As far as I can see, ptdump_pXd_entry() handles the pXd_leaf() case.

>>>
>>> IIUC, the idea of ptdump_walk_pgd() is to dump page tables even outside
>>> VMAs (for debugging purposes?).
>>>
>>> I cannot convince myself that that's a good idea when only holding the
>>> mmap lock in read mode, because we can just see page tables getting
>>> freed concurrently e.g., during concurrent munmap() ... while holding
>>> the mmap lock in read we may only walk inside VMA boundaries.
>>>
>>> That then raises the questions if we're only calling this on special MMs
>>> (e.g., init_mm) whereby we cannot really see concurrent munmap() and
>>> where we shouldn't have hugetlb mappings or hugepd entries.

At least on powerpc, PTDUMP handles only init_mm.

Huge pages are used at least on powerpc 8xx for linear memory mapping, see

commit 34536d780683 ("powerpc/8xx: Add a function to early map kernel 
via huge pages")
commit cf209951fa7f ("powerpc/8xx: Map linear memory with huge pages")

hugepds may also be used in the future to use huge pages for vmap and 
vmalloc, see commit a6a8f7c4aa7e ("powerpc/8xx: add support for huge 
pages on VMAP and VMALLOC")

As far as I know, ppc64 also uses huge pages for VMAP and VMALLOC, see

commit d909f9109c30 ("powerpc/64s/radix: Enable HAVE_ARCH_HUGE_VMAP")
commit 8abddd968a30 ("powerpc/64s/radix: Enable huge vmalloc mappings")

Christophe
Michael Ellerman Sept. 4, 2022, 11:49 a.m. UTC | #20
Christophe Leroy <christophe.leroy@csgroup.eu> writes:
> +Resending with valid powerpc list address
>
> Le 02/09/2022 à 20:52, David Hildenbrand a écrit :
>>>>> Adding Christophe on Cc:
>>>>>
>>>>> Christophe do you know if is_hugepd is true for all hugetlb entries, not
>>>>> just hugepd?
>
> is_hugepd() is true if and only if the directory entry points to a huge 
> page directory and not to the normal lower level directory.
>
> As far as I understand if the directory entry is not pointing to any 
> lower directory but is a huge page entry, pXd_leaf() is true.

Yes.

Though historically it's pXd_huge() which is used to test that, which is
gated by CONFIG_HUGETLB_PAGE.

The leaf versions are newer and test whether the entry is a PTE
regardless of whether CONFIG_HUGETLB_PAGE is enabled. Which is needed
for PTDUMP if the kernel mapping uses huge pages independently of
CONFIG_HUGETLB_PAGE, which is true on at least powerpc.
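
Roughly (an illustrative sketch of that distinction, not any
architecture's actual definitions):

	#ifdef CONFIG_HUGETLB_PAGE
	int pmd_huge(pmd_t pmd);	/* hugetlb-only test for a huge entry */
	#else
	#define pmd_huge(pmd)	0	/* compiled away without hugetlb */
	#endif

	/*
	 * Always available: true whenever the entry itself maps a page,
	 * e.g. the kernel linear mapping, independent of CONFIG_HUGETLB_PAGE.
	 */
	bool pmd_leaf(pmd_t pmd);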

>>>>>
>>>>> On systems without hugepd entries, I guess ptdump skips all hugetlb entries.
>>>>> Sigh!
>
> As far as I can see, ptdump_pXd_entry() handles the pXd_leaf() case.
>
>>>>
>>>> IIUC, the idea of ptdump_walk_pgd() is to dump page tables even outside
>>>> VMAs (for debugging purposes?).
>>>>
>>>> I cannot convince myself that that's a good idea when only holding the
>>>> mmap lock in read mode, because we can just see page tables getting
>>>> freed concurrently e.g., during concurrent munmap() ... while holding
>>>> the mmap lock in read we may only walk inside VMA boundaries.
>>>>
>>>> That then raises the questions if we're only calling this on special MMs
>>>> (e.g., init_mm) whereby we cannot really see concurrent munmap() and
>>>> where we shouldn't have hugetlb mappings or hugepd entries.
>
> At least on powerpc, PTDUMP handles only init_mm.
>
> Hugepage are used at least on powerpc 8xx for linear memory mapping, see
>
> commit 34536d780683 ("powerpc/8xx: Add a function to early map kernel 
> via huge pages")
> commit cf209951fa7f ("powerpc/8xx: Map linear memory with huge pages")
>
> hugepds may also be used in the future to use huge pages for vmap and 
> vmalloc, see commit a6a8f7c4aa7e ("powerpc/8xx: add support for huge 
> pages on VMAP and VMALLOC")
>
> As far as I know, ppc64 also use huge pages for VMAP and VMALLOC, see
>
> commit d909f9109c30 ("powerpc/64s/radix: Enable HAVE_ARCH_HUGE_VMAP")
> commit 8abddd968a30 ("powerpc/64s/radix: Enable huge vmalloc mappings")

64-bit also uses huge pages for the kernel linear mapping (aka. direct
mapping), and on newer systems (>= Power9) those also appear in the
kernel page tables.

cheers
David Hildenbrand Sept. 5, 2022, 8:37 a.m. UTC | #21
On 03.09.22 09:07, Christophe Leroy wrote:
> +Resending with valid powerpc list address
> 
> Le 02/09/2022 à 20:52, David Hildenbrand a écrit :
>>>>> Adding Christophe on Cc:
>>>>>
>>>>> Christophe do you know if is_hugepd is true for all hugetlb entries, not
>>>>> just hugepd?
> 
> is_hugepd() is true if and only if the directory entry points to a huge
> page directory and not to the normal lower level directory.
> 
> As far as I understand if the directory entry is not pointing to any
> lower directory but is a huge page entry, pXd_leaf() is true.
> 
> 
>>>>>
>>>>> On systems without hugepd entries, I guess ptdump skips all hugetlb entries.
>>>>> Sigh!
> 
> As far as I can see, ptdump_pXd_entry() handles the pXd_leaf() case.
> 
>>>>
>>>> IIUC, the idea of ptdump_walk_pgd() is to dump page tables even outside
>>>> VMAs (for debugging purposes?).
>>>>
>>>> I cannot convince myself that that's a good idea when only holding the
>>>> mmap lock in read mode, because we can just see page tables getting
>>>> freed concurrently e.g., during concurrent munmap() ... while holding
>>>> the mmap lock in read we may only walk inside VMA boundaries.
>>>>
>>>> That then raises the questions if we're only calling this on special MMs
>>>> (e.g., init_mm) whereby we cannot really see concurrent munmap() and
>>>> where we shouldn't have hugetlb mappings or hugepd entries.
> 
> At least on powerpc, PTDUMP handles only init_mm.
> 
> Hugepage are used at least on powerpc 8xx for linear memory mapping, see
> 
> commit 34536d780683 ("powerpc/8xx: Add a function to early map kernel
> via huge pages")
> commit cf209951fa7f ("powerpc/8xx: Map linear memory with huge pages")
> 
> hugepds may also be used in the future to use huge pages for vmap and
> vmalloc, see commit a6a8f7c4aa7e ("powerpc/8xx: add support for huge
> pages on VMAP and VMALLOC")
> 
> As far as I know, ppc64 also use huge pages for VMAP and VMALLOC, see
> 
> commit d909f9109c30 ("powerpc/64s/radix: Enable HAVE_ARCH_HUGE_VMAP")
> commit 8abddd968a30 ("powerpc/64s/radix: Enable huge vmalloc mappings")

There is a difference between an ordinary huge mapping (e.g., as used 
for THP) and a hugetlb mapping.

Our current understanding is that hugepd only applies to hugetlb. 
Wouldn't vmap/vmalloc use ordinary huge pmd entries instead of hugepd?
Christophe Leroy Sept. 5, 2022, 9:33 a.m. UTC | #22
Le 05/09/2022 à 10:37, David Hildenbrand a écrit :
> On 03.09.22 09:07, Christophe Leroy wrote:
>> +Resending with valid powerpc list address
>>
>> Le 02/09/2022 à 20:52, David Hildenbrand a écrit :
>>>>>> Adding Christophe on Cc:
>>>>>>
>>>>>> Christophe do you know if is_hugepd is true for all hugetlb 
>>>>>> entries, not
>>>>>> just hugepd?
>>
>> is_hugepd() is true if and only if the directory entry points to a huge
>> page directory and not to the normal lower level directory.
>>
>> As far as I understand if the directory entry is not pointing to any
>> lower directory but is a huge page entry, pXd_leaf() is true.
>>
>>
>>>>>>
>>>>>> On systems without hugepd entries, I guess ptdump skips all 
>>>>>> hugetlb entries.
>>>>>> Sigh!
>>
>> As far as I can see, ptdump_pXd_entry() handles the pXd_leaf() case.
>>
>>>>>
>>>>> IIUC, the idea of ptdump_walk_pgd() is to dump page tables even 
>>>>> outside
>>>>> VMAs (for debugging purposes?).
>>>>>
>>>>> I cannot convince myself that that's a good idea when only holding the
>>>>> mmap lock in read mode, because we can just see page tables getting
>>>>> freed concurrently e.g., during concurrent munmap() ... while holding
>>>>> the mmap lock in read we may only walk inside VMA boundaries.
>>>>>
>>>>> That then raises the questions if we're only calling this on 
>>>>> special MMs
>>>>> (e.g., init_mm) whereby we cannot really see concurrent munmap() and
>>>>> where we shouldn't have hugetlb mappings or hugepd entries.
>>
>> At least on powerpc, PTDUMP handles only init_mm.
>>
>> Hugepage are used at least on powerpc 8xx for linear memory mapping, see
>>
>> commit 34536d780683 ("powerpc/8xx: Add a function to early map kernel
>> via huge pages")
>> commit cf209951fa7f ("powerpc/8xx: Map linear memory with huge pages")
>>
>> hugepds may also be used in the future to use huge pages for vmap and
>> vmalloc, see commit a6a8f7c4aa7e ("powerpc/8xx: add support for huge
>> pages on VMAP and VMALLOC")
>>
>> As far as I know, ppc64 also use huge pages for VMAP and VMALLOC, see
>>
>> commit d909f9109c30 ("powerpc/64s/radix: Enable HAVE_ARCH_HUGE_VMAP")
>> commit 8abddd968a30 ("powerpc/64s/radix: Enable huge vmalloc mappings")
> 
> There is a difference between an ordinary huge mapping (e.g., as used 
> for THP) and a a hugetlb mapping.
> 
> Our current understanding is that hugepd only applies to hugetlb. 
> Wouldn't vmap/vmalloc user ordinary huge pmd entries instead of hugepd?
> 

'hugepd' stands for huge page directory. It is independent of whether a
huge page is used for hugetlb or for anything else; it represents the
way pages are described in the page tables.

I don't know what you mean by _ordinary_ huge pmd entry.

Let's take the example of powerpc 8xx, which is the one I know best. This
is a powerpc32, so it has two levels: PGD and PTE. PGD has 1024 entries
and each entry covers a 4Mbytes area. A normal PTE has 1024 entries and
each entry is a 4k page. When you use 8Mbytes pages, you don't use PTEs
as it would be a waste of memory. You use a huge page directory that has
a single entry, and you have two PGD entries pointing to the huge page
directory.
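
A sketch of the layout just described (illustrative only):

	/*
	 * powerpc 8xx with 8M pages (sketch of the description above):
	 *
	 *   pgd[n]   ---+
	 *               +--> hugepd { single entry } --> 8M huge page
	 *   pgd[n+1] ---+
	 *
	 * 1024 PGD entries x 4M each = 4G of address space, so one 8M page
	 * spans two consecutive PGD entries, both pointing at the hugepd.
	 */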

Some time ago, hugepd was also used for 512kbytes pages and 16kbytes
pages:
- there were huge page directories with 8x 512kbytes pages,
- there were huge page directories with 256x 16kbytes pages,

And the PGD/PMD entry points to a huge page directory (HUGEPD) instead 
of pointing to a page table directory (PTE).

Since commit b250c8c08c79 ("powerpc/8xx: Manage 512k huge pages as
standard pages."), the 8xx no longer uses hugepd for 512k huge
pages, but other platforms like powerpc book3e extensively use huge page
directories.

I hope this clarifies the subject, otherwise I'm happy to provide 
further details.

Christophe
David Hildenbrand Sept. 5, 2022, 9:46 a.m. UTC | #23
On 05.09.22 11:33, Christophe Leroy wrote:
> 
> 
> Le 05/09/2022 à 10:37, David Hildenbrand a écrit :
>> On 03.09.22 09:07, Christophe Leroy wrote:
>>> +Resending with valid powerpc list address
>>>
>>> Le 02/09/2022 à 20:52, David Hildenbrand a écrit :
>>>>>>> Adding Christophe on Cc:
>>>>>>>
>>>>>>> Christophe do you know if is_hugepd is true for all hugetlb
>>>>>>> entries, not
>>>>>>> just hugepd?
>>>
>>> is_hugepd() is true if and only if the directory entry points to a huge
>>> page directory and not to the normal lower level directory.
>>>
>>> As far as I understand if the directory entry is not pointing to any
>>> lower directory but is a huge page entry, pXd_leaf() is true.
>>>
>>>
>>>>>>>
>>>>>>> On systems without hugepd entries, I guess ptdump skips all
>>>>>>> hugetlb entries.
>>>>>>> Sigh!
>>>
>>> As far as I can see, ptdump_pXd_entry() handles the pXd_leaf() case.
>>>
>>>>>>
>>>>>> IIUC, the idea of ptdump_walk_pgd() is to dump page tables even
>>>>>> outside
>>>>>> VMAs (for debugging purposes?).
>>>>>>
>>>>>> I cannot convince myself that that's a good idea when only holding the
>>>>>> mmap lock in read mode, because we can just see page tables getting
>>>>>> freed concurrently e.g., during concurrent munmap() ... while holding
>>>>>> the mmap lock in read we may only walk inside VMA boundaries.
>>>>>>
>>>>>> That then raises the questions if we're only calling this on
>>>>>> special MMs
>>>>>> (e.g., init_mm) whereby we cannot really see concurrent munmap() and
>>>>>> where we shouldn't have hugetlb mappings or hugepd entries.
>>>
>>> At least on powerpc, PTDUMP handles only init_mm.
>>>
>>> Hugepage are used at least on powerpc 8xx for linear memory mapping, see
>>>
>>> commit 34536d780683 ("powerpc/8xx: Add a function to early map kernel
>>> via huge pages")
>>> commit cf209951fa7f ("powerpc/8xx: Map linear memory with huge pages")
>>>
>>> hugepds may also be used in the future to use huge pages for vmap and
>>> vmalloc, see commit a6a8f7c4aa7e ("powerpc/8xx: add support for huge
>>> pages on VMAP and VMALLOC")
>>>
>>> As far as I know, ppc64 also use huge pages for VMAP and VMALLOC, see
>>>
>>> commit d909f9109c30 ("powerpc/64s/radix: Enable HAVE_ARCH_HUGE_VMAP")
>>> commit 8abddd968a30 ("powerpc/64s/radix: Enable huge vmalloc mappings")
>>
>> There is a difference between an ordinary huge mapping (e.g., as used
>> for THP) and a a hugetlb mapping.
>>
>> Our current understanding is that hugepd only applies to hugetlb.
>> Wouldn't vmap/vmalloc user ordinary huge pmd entries instead of hugepd?
>>
> 
> 'hugepd' stands for huge page directory. It is independant of whether a
> huge page is used for hugetlb or for anything else, it represents the
> way pages are described in the page tables.

This patch here makes the assumption that hugepd only applies to 
hugetlb, because it removes any such handling from the !hugetlb path in 
GUP. Is that incorrect or are there valid cases where that could happen? 
(init_mm is special in that regard, I don't think it interacts with GUP
at all).

> 
> I don't know what you mean by _ordinary_ huge pmd entry.
> 

Essentially, what we use for THP. Let me try to understand how hugepd 
interacts with the rest of the system.

Do systems that support hugepd currently implement THP? Reading about the
32bit systems below, I assume not?

> Let's take the exemple of powerpc 8xx which is the one I know best. This
> is a powerpc32, so it has two levels : PGD and PTE. PGD has 1024 entries
> and each entry covers a 4Mbytes area. Normal PTE has 1024 entries and
> each entry is a 4k page. When you use 8Mbytes pages, you don't use PTEs
> as it would be a waste of memory. You use a huge page directory that has
> a single entry, and you have two PGD entries pointing to the huge page
> directory.

Thanks, I assume there are no 8MB THP, correct?

The 8MB example with 4MB PGD entries makes it sound a bit like the 
cont-PTE/cont-PMD handling on aarch64: they don't use a hugepd but 
would simply let two consecutive PGD entries point at the relevant
(sub) parts of the hugetlb page. No hugepd involved.
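
Roughly, with cont-style entries there is no extra directory level at all
(a sketch, not aarch64's actual layout code):

	/*
	 *   pXd[n]   --> huge page + 0
	 *   pXd[n+1] --> huge page + (1 << shift)
	 *
	 * Consecutive entries point directly at consecutive parts of the
	 * huge page; no huge page directory in between.
	 */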

> 
> Some time ago, hupgepd was also used for 512kbytes pages and 16kbytes
> pages:
> - there was huge page directories with 8x 512kbytes pages,
> - there was huge page directories with 256x 16kbytes pages,
> 
> And the PGD/PMD entry points to a huge page directory (HUGEPD) instead
> of pointing to a page table directory (PTE).

Thanks for the example.

> 
> Since commit b250c8c08c79 ("powerpc/8xx: Manage 512k huge pages as
> standard pages."), the 8xx doesn't use anymore hugepd for 512k huge
> page, but other platforms like powerpc book3e extensively use huge page
> directories.
> 
> I hope this clarifies the subject, otherwise I'm happy to provide
> further details.

Thanks, it would be valuable to know if the assumption in this patch is 
correct: hugepd will only be found in hugetlb areas in ordinary MMs (not 
init_mm).
Christophe Leroy Sept. 5, 2022, 4:05 p.m. UTC | #24
Le 05/09/2022 à 11:46, David Hildenbrand a écrit :
> On 05.09.22 11:33, Christophe Leroy wrote:
>>
>>
>> Le 05/09/2022 à 10:37, David Hildenbrand a écrit :
>>> On 03.09.22 09:07, Christophe Leroy wrote:
>>>> +Resending with valid powerpc list address
>>>>
>>>> Le 02/09/2022 à 20:52, David Hildenbrand a écrit :
>>>>>>>> Adding Christophe on Cc:
>>>>>>>>
>>>>>>>> Christophe do you know if is_hugepd is true for all hugetlb
>>>>>>>> entries, not
>>>>>>>> just hugepd?
>>>>
>>>> is_hugepd() is true if and only if the directory entry points to a huge
>>>> page directory and not to the normal lower level directory.
>>>>
>>>> As far as I understand if the directory entry is not pointing to any
>>>> lower directory but is a huge page entry, pXd_leaf() is true.
>>>>
>>>>
>>>>>>>>
>>>>>>>> On systems without hugepd entries, I guess ptdump skips all
>>>>>>>> hugetlb entries.
>>>>>>>> Sigh!
>>>>
>>>> As far as I can see, ptdump_pXd_entry() handles the pXd_leaf() case.
>>>>
>>>>>>>
>>>>>>> IIUC, the idea of ptdump_walk_pgd() is to dump page tables even
>>>>>>> outside
>>>>>>> VMAs (for debugging purposes?).
>>>>>>>
>>>>>>> I cannot convince myself that that's a good idea when only 
>>>>>>> holding the
>>>>>>> mmap lock in read mode, because we can just see page tables getting
>>>>>>> freed concurrently e.g., during concurrent munmap() ... while 
>>>>>>> holding
>>>>>>> the mmap lock in read we may only walk inside VMA boundaries.
>>>>>>>
>>>>>>> That then raises the questions if we're only calling this on
>>>>>>> special MMs
>>>>>>> (e.g., init_mm) whereby we cannot really see concurrent munmap() and
>>>>>>> where we shouldn't have hugetlb mappings or hugepd entries.
>>>>
>>>> At least on powerpc, PTDUMP handles only init_mm.
>>>>
>>>> Hugepage are used at least on powerpc 8xx for linear memory mapping, 
>>>> see
>>>>
>>>> commit 34536d780683 ("powerpc/8xx: Add a function to early map kernel
>>>> via huge pages")
>>>> commit cf209951fa7f ("powerpc/8xx: Map linear memory with huge pages")
>>>>
>>>> hugepds may also be used in the future to use huge pages for vmap and
>>>> vmalloc, see commit a6a8f7c4aa7e ("powerpc/8xx: add support for huge
>>>> pages on VMAP and VMALLOC")
>>>>
>>>> As far as I know, ppc64 also use huge pages for VMAP and VMALLOC, see
>>>>
>>>> commit d909f9109c30 ("powerpc/64s/radix: Enable HAVE_ARCH_HUGE_VMAP")
>>>> commit 8abddd968a30 ("powerpc/64s/radix: Enable huge vmalloc mappings")
>>>
>>> There is a difference between an ordinary huge mapping (e.g., as used
>>> for THP) and a a hugetlb mapping.
>>>
>>> Our current understanding is that hugepd only applies to hugetlb.
>>> Wouldn't vmap/vmalloc user ordinary huge pmd entries instead of hugepd?
>>>
>>
>> 'hugepd' stands for huge page directory. It is independant of whether a
>> huge page is used for hugetlb or for anything else, it represents the
>> way pages are described in the page tables.
> 
> This patch here makes the assumption that hugepd only applies to 
> hugetlb, because it removes any such handling from the !hugetlb path in 
> GUP. Is that incorrect or are there valid cases where that could happen? 
> (init_mm is special in that regard, i don't think it interacts with GUP 
> at all).

You are correct, I think: for user pages, hugepd only applies to hugetlb.

> 
>>
>> I don't know what you mean by _ordinary_ huge pmd entry.
>>
> 
> Essentially, what we use for THP. Let me try to understand how hugepd 
> interact with the rest of the system.
> 
> Do systems that support hugepd currently implement THP? Reading above 
> 32bit systems below, I assume not?

Right, as far as I understand, only leaf huge pages are handled by THP.

> 
>> Let's take the exemple of powerpc 8xx which is the one I know best. This
>> is a powerpc32, so it has two levels : PGD and PTE. PGD has 1024 entries
>> and each entry covers a 4Mbytes area. Normal PTE has 1024 entries and
>> each entry is a 4k page. When you use 8Mbytes pages, you don't use PTEs
>> as it would be a waste of memory. You use a huge page directory that has
>> a single entry, and you have two PGD entries pointing to the huge page
>> directory.
> 
> Thanks, I assume there are no 8MB THP, correct?

Correct.

> 
> The 8MB example with 4MB PGD entries makes it sound a bit like the 
> cont-PTE/cont-PMD handling on aarch64: they don't use a hugepd but would 
> simply let two consecutive PGD entries point at the the relevant (sub) 
> parts of the hugetlb page. No hugepd involved.

Yes it is my feeling as well.

Although in the case of the powerpc 8xx we really need a PGD entry + a
page entry in order to use the hardware assisted page table walk and
also to populate L1 and L2 TLB entries without too much processing in the
TLB-miss interrupt handler.

> 
>>
>> Some time ago, hupgepd was also used for 512kbytes pages and 16kbytes
>> pages:
>> - there was huge page directories with 8x 512kbytes pages,
>> - there was huge page directories with 256x 16kbytes pages,
>>
>> And the PGD/PMD entry points to a huge page directory (HUGEPD) instead
>> of pointing to a page table directory (PTE).
> 
> Thanks for the example.
> 
>>
>> Since commit b250c8c08c79 ("powerpc/8xx: Manage 512k huge pages as
>> standard pages."), the 8xx doesn't use anymore hugepd for 512k huge
>> page, but other platforms like powerpc book3e extensively use huge page
>> directories.
>>
>> I hope this clarifies the subject, otherwise I'm happy to provide
>> further details.
> 
> Thanks, it would be valuable to know if the assumption in this patch is 
> correct: hugepd will only be found in hugetlb areas in ordinary MMs (not 
> init_mm).
> 

Yes, I think the assumption is correct for user pages, hence for GUP.

By the way, the discussion started with PTDUMP. For PTDUMP we need huge
page directories to be taken into account, as well as for anything that
involves kernel pages like VMAP or VMALLOC.

Christophe
David Hildenbrand Sept. 5, 2022, 4:09 p.m. UTC | #25
> Yes I think the assumption is correct for user pages hence for GUP.
> 
> By the way the discussion started with PTDUMP. For PTDUMP we need huge
> page directories to be taken into account. And for anything that
> involves kernel pages like VMAP or VMALLOC.

Yes, makes perfect sense to me now that you explained how/where hugepd 
is actually used -- thanks!
Patch

diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index f993cb36c062..380d2f3966c9 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -91,21 +91,6 @@  int prepare_hugepage_range(struct file *file,
 	return 0;
 }
 
-struct page *follow_huge_addr(struct mm_struct *mm, unsigned long addr, int write)
-{
-	struct page *page;
-	pte_t *ptep;
-
-	if (REGION_NUMBER(addr) != RGN_HPAGE)
-		return ERR_PTR(-EINVAL);
-
-	ptep = huge_pte_offset(mm, addr, HPAGE_SIZE);
-	if (!ptep || pte_none(*ptep))
-		return NULL;
-	page = pte_page(*ptep);
-	page += ((addr & ~HPAGE_MASK) >> PAGE_SHIFT);
-	return page;
-}
 int pmd_huge(pmd_t pmd)
 {
 	return 0;
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index bc84a594ca62..b0e037c75c12 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -506,43 +506,6 @@  void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 	} while (addr = next, addr != end);
 }
 
-struct page *follow_huge_pd(struct vm_area_struct *vma,
-			    unsigned long address, hugepd_t hpd,
-			    int flags, int pdshift)
-{
-	pte_t *ptep;
-	spinlock_t *ptl;
-	struct page *page = NULL;
-	unsigned long mask;
-	int shift = hugepd_shift(hpd);
-	struct mm_struct *mm = vma->vm_mm;
-
-retry:
-	/*
-	 * hugepage directory entries are protected by mm->page_table_lock
-	 * Use this instead of huge_pte_lockptr
-	 */
-	ptl = &mm->page_table_lock;
-	spin_lock(ptl);
-
-	ptep = hugepte_offset(hpd, address, pdshift);
-	if (pte_present(*ptep)) {
-		mask = (1UL << shift) - 1;
-		page = pte_page(*ptep);
-		page += ((address & mask) >> PAGE_SHIFT);
-		if (flags & FOLL_GET)
-			get_page(page);
-	} else {
-		if (is_hugetlb_entry_migration(*ptep)) {
-			spin_unlock(ptl);
-			__migration_entry_wait(mm, ptep, ptl);
-			goto retry;
-		}
-	}
-	spin_unlock(ptl);
-	return page;
-}
-
 bool __init arch_hugetlb_valid_size(unsigned long size)
 {
 	int shift = __ffs(size);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 852f911d676e..8ea3e5e726e4 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -142,6 +142,8 @@  int move_hugetlb_page_tables(struct vm_area_struct *vma,
 			     unsigned long len);
 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *,
 			    struct vm_area_struct *, struct vm_area_struct *);
+struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
+				unsigned long address, unsigned int flags);
 long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
 			 struct page **, struct vm_area_struct **,
 			 unsigned long *, unsigned long *, long, unsigned int,
@@ -202,17 +204,6 @@  int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 				unsigned long addr, pte_t *ptep);
 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
 				unsigned long *start, unsigned long *end);
-struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
-			      int write);
-struct page *follow_huge_pd(struct vm_area_struct *vma,
-			    unsigned long address, hugepd_t hpd,
-			    int flags, int pdshift);
-struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-				pmd_t *pmd, int flags);
-struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
-				pud_t *pud, int flags);
-struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address,
-			     pgd_t *pgd, int flags);
 
 void hugetlb_vma_lock_read(struct vm_area_struct *vma);
 void hugetlb_vma_unlock_read(struct vm_area_struct *vma);
@@ -264,6 +255,13 @@  static inline void adjust_range_if_pmd_sharing_possible(
 {
 }
 
+static struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
+				unsigned long address, unsigned int flags)
+{
+	/* should never happen, but do not want to BUG */
+	return ERR_PTR(-EINVAL);
+}
+
 static inline long follow_hugetlb_page(struct mm_struct *mm,
 			struct vm_area_struct *vma, struct page **pages,
 			struct vm_area_struct **vmas, unsigned long *position,
@@ -274,12 +272,6 @@  static inline long follow_hugetlb_page(struct mm_struct *mm,
 	return 0;
 }
 
-static inline struct page *follow_huge_addr(struct mm_struct *mm,
-					unsigned long address, int write)
-{
-	return ERR_PTR(-EINVAL);
-}
-
 static inline int copy_hugetlb_page_range(struct mm_struct *dst,
 					  struct mm_struct *src,
 					  struct vm_area_struct *dst_vma,
@@ -312,31 +304,6 @@  static inline void hugetlb_show_meminfo_node(int nid)
 {
 }
 
-static inline struct page *follow_huge_pd(struct vm_area_struct *vma,
-				unsigned long address, hugepd_t hpd, int flags,
-				int pdshift)
-{
-	return NULL;
-}
-
-static inline struct page *follow_huge_pmd(struct mm_struct *mm,
-				unsigned long address, pmd_t *pmd, int flags)
-{
-	return NULL;
-}
-
-static inline struct page *follow_huge_pud(struct mm_struct *mm,
-				unsigned long address, pud_t *pud, int flags)
-{
-	return NULL;
-}
-
-static inline struct page *follow_huge_pgd(struct mm_struct *mm,
-				unsigned long address, pgd_t *pgd, int flags)
-{
-	return NULL;
-}
-
 static inline int prepare_hugepage_range(struct file *file,
 				unsigned long addr, unsigned long len)
 {
diff --git a/mm/gup.c b/mm/gup.c
index 66d8619e02ad..80ce04a5bae5 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -661,20 +661,6 @@  static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 	pmdval = READ_ONCE(*pmd);
 	if (pmd_none(pmdval))
 		return no_page_table(vma, flags);
-	if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
-		page = follow_huge_pmd(mm, address, pmd, flags);
-		if (page)
-			return page;
-		return no_page_table(vma, flags);
-	}
-	if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
-		page = follow_huge_pd(vma, address,
-				      __hugepd(pmd_val(pmdval)), flags,
-				      PMD_SHIFT);
-		if (page)
-			return page;
-		return no_page_table(vma, flags);
-	}
 retry:
 	if (!pmd_present(pmdval)) {
 		/*
@@ -764,20 +750,6 @@  static struct page *follow_pud_mask(struct vm_area_struct *vma,
 	pud = pud_offset(p4dp, address);
 	if (pud_none(*pud))
 		return no_page_table(vma, flags);
-	if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
-		page = follow_huge_pud(mm, address, pud, flags);
-		if (page)
-			return page;
-		return no_page_table(vma, flags);
-	}
-	if (is_hugepd(__hugepd(pud_val(*pud)))) {
-		page = follow_huge_pd(vma, address,
-				      __hugepd(pud_val(*pud)), flags,
-				      PUD_SHIFT);
-		if (page)
-			return page;
-		return no_page_table(vma, flags);
-	}
 	if (pud_devmap(*pud)) {
 		ptl = pud_lock(mm, pud);
 		page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
@@ -797,7 +769,6 @@  static struct page *follow_p4d_mask(struct vm_area_struct *vma,
 				    struct follow_page_context *ctx)
 {
 	p4d_t *p4d;
-	struct page *page;
 
 	p4d = p4d_offset(pgdp, address);
 	if (p4d_none(*p4d))
@@ -806,14 +777,6 @@  static struct page *follow_p4d_mask(struct vm_area_struct *vma,
 	if (unlikely(p4d_bad(*p4d)))
 		return no_page_table(vma, flags);
 
-	if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
-		page = follow_huge_pd(vma, address,
-				      __hugepd(p4d_val(*p4d)), flags,
-				      P4D_SHIFT);
-		if (page)
-			return page;
-		return no_page_table(vma, flags);
-	}
 	return follow_pud_mask(vma, address, p4d, flags, ctx);
 }
 
@@ -851,10 +814,15 @@  static struct page *follow_page_mask(struct vm_area_struct *vma,
 
 	ctx->page_mask = 0;
 
-	/* make this handle hugepd */
-	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
-	if (!IS_ERR(page)) {
-		WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
+	/*
+	 * Call hugetlb_follow_page_mask for hugetlb vmas as it will use
+	 * special hugetlb page table walking code.  This eliminates the
+	 * need to check for hugetlb entries in the general walking code.
+	 */
+	if (is_vm_hugetlb_page(vma)) {
+		page = hugetlb_follow_page_mask(vma, address, flags);
+		if (!page)
+			page = no_page_table(vma, flags);
 		return page;
 	}
 
@@ -863,21 +831,6 @@  static struct page *follow_page_mask(struct vm_area_struct *vma,
 	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
 		return no_page_table(vma, flags);
 
-	if (pgd_huge(*pgd)) {
-		page = follow_huge_pgd(mm, address, pgd, flags);
-		if (page)
-			return page;
-		return no_page_table(vma, flags);
-	}
-	if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
-		page = follow_huge_pd(vma, address,
-				      __hugepd(pgd_val(*pgd)), flags,
-				      PGDIR_SHIFT);
-		if (page)
-			return page;
-		return no_page_table(vma, flags);
-	}
-
 	return follow_p4d_mask(vma, address, pgd, flags, ctx);
 }
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d0617d64d718..b3da421ba5be 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6190,6 +6190,62 @@  static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
 	return false;
 }
 
+struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
+				unsigned long address, unsigned int flags)
+{
+	struct hstate *h = hstate_vma(vma);
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long haddr = address & huge_page_mask(h);
+	struct page *page = NULL;
+	spinlock_t *ptl;
+	pte_t *pte, entry;
+
+	/*
+	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
+	 * follow_hugetlb_page().
+	 */
+	if (WARN_ON_ONCE(flags & FOLL_PIN))
+		return NULL;
+
+	pte = huge_pte_offset(mm, haddr, huge_page_size(h));
+	if (!pte)
+		return NULL;
+
+retry:
+	ptl = huge_pte_lock(h, mm, pte);
+	entry = huge_ptep_get(pte);
+	if (pte_present(entry)) {
+		page = pte_page(entry) +
+				((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
+		/*
+		 * Note that page may be a sub-page, and with vmemmap
+		 * optimizations the page struct may be read only.
+		 * try_grab_page() will increase the ref count on the
+		 * head page, so this will be OK.
+		 *
+		 * try_grab_page() should always succeed here, because we hold
+		 * the ptl lock and have verified pte_present().
+		 */
+		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
+			page = NULL;
+			goto out;
+		}
+	} else {
+		if (is_hugetlb_entry_migration(entry)) {
+			spin_unlock(ptl);
+			__migration_entry_wait_huge(pte, ptl);
+			goto retry;
+		}
+		/*
+		 * hwpoisoned entry is treated as no_page_table in
+		 * follow_page_mask().
+		 */
+	}
+out:
+	spin_unlock(ptl);
+	return page;
+}
+
 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			 struct page **pages, struct vm_area_struct **vmas,
 			 unsigned long *position, unsigned long *nr_pages,
@@ -7140,123 +7196,6 @@  __weak unsigned long hugetlb_mask_last_page(struct hstate *h)
  * These functions are overwritable if your architecture needs its own
  * behavior.
  */
-struct page * __weak
-follow_huge_addr(struct mm_struct *mm, unsigned long address,
-			      int write)
-{
-	return ERR_PTR(-EINVAL);
-}
-
-struct page * __weak
-follow_huge_pd(struct vm_area_struct *vma,
-	       unsigned long address, hugepd_t hpd, int flags, int pdshift)
-{
-	WARN(1, "hugepd follow called with no support for hugepage directory format\n");
-	return NULL;
-}
-
-struct page * __weak
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-		pmd_t *pmd, int flags)
-{
-	struct page *page = NULL;
-	spinlock_t *ptl;
-	pte_t pte;
-
-	/*
-	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
-	 * follow_hugetlb_page().
-	 */
-	if (WARN_ON_ONCE(flags & FOLL_PIN))
-		return NULL;
-
-retry:
-	ptl = pmd_lockptr(mm, pmd);
-	spin_lock(ptl);
-	/*
-	 * make sure that the address range covered by this pmd is not
-	 * unmapped from other threads.
-	 */
-	if (!pmd_huge(*pmd))
-		goto out;
-	pte = huge_ptep_get((pte_t *)pmd);
-	if (pte_present(pte)) {
-		page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
-		/*
-		 * try_grab_page() should always succeed here, because: a) we
-		 * hold the pmd (ptl) lock, and b) we've just checked that the
-		 * huge pmd (head) page is present in the page tables. The ptl
-		 * prevents the head page and tail pages from being rearranged
-		 * in any way. So this page must be available at this point,
-		 * unless the page refcount overflowed:
-		 */
-		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
-			page = NULL;
-			goto out;
-		}
-	} else {
-		if (is_hugetlb_entry_migration(pte)) {
-			spin_unlock(ptl);
-			__migration_entry_wait_huge((pte_t *)pmd, ptl);
-			goto retry;
-		}
-		/*
-		 * hwpoisoned entry is treated as no_page_table in
-		 * follow_page_mask().
-		 */
-	}
-out:
-	spin_unlock(ptl);
-	return page;
-}
-
-struct page * __weak
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
-		pud_t *pud, int flags)
-{
-	struct page *page = NULL;
-	spinlock_t *ptl;
-	pte_t pte;
-
-	if (WARN_ON_ONCE(flags & FOLL_PIN))
-		return NULL;
-
-retry:
-	ptl = huge_pte_lock(hstate_sizelog(PUD_SHIFT), mm, (pte_t *)pud);
-	if (!pud_huge(*pud))
-		goto out;
-	pte = huge_ptep_get((pte_t *)pud);
-	if (pte_present(pte)) {
-		page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
-		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
-			page = NULL;
-			goto out;
-		}
-	} else {
-		if (is_hugetlb_entry_migration(pte)) {
-			spin_unlock(ptl);
-			__migration_entry_wait(mm, (pte_t *)pud, ptl);
-			goto retry;
-		}
-		/*
-		 * hwpoisoned entry is treated as no_page_table in
-		 * follow_page_mask().
-		 */
-	}
-out:
-	spin_unlock(ptl);
-	return page;
-}
-
-struct page * __weak
-follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
-{
-	if (flags & (FOLL_GET | FOLL_PIN))
-		return NULL;
-
-	return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
-}
-
 int isolate_hugetlb(struct page *page, struct list_head *list)
 {
 	int ret = 0;