
[v2,1/5] mm/hugetlb: fix races when looking up a CONT-PTE size hugetlb page

Message ID 0e5d92da043d147a867f634b17acbcc97a7f0e64.1661240170.git.baolin.wang@linux.alibaba.com (mailing list archive)
State New
Series: Fix some issues when looking up hugetlb page

Commit Message

Baolin Wang Aug. 23, 2022, 7:50 a.m. UTC
Some architectures (like ARM64) support CONT-PTE/PMD size hugetlb, which
means they can support not only PMD/PUD size hugetlb pages (2M and 1G),
but also CONT-PTE/PMD sizes (64K and 32M) when a 4K base page size is
used.

So when looking up a CONT-PTE size hugetlb page via follow_page(),
follow_page_pte() uses pte_offset_map_lock() to take the pte entry lock.
However, that lock is the wrong one for a CONT-PTE size hugetlb: we
should use huge_pte_lock() to take the correct lock, which is
mm->page_table_lock.

That means the pte entry of the CONT-PTE size hugetlb is not stable
under the lock follow_page_pte() holds: the entry can still be migrated
or poisoned concurrently, which can cause potential races, and the
subsequent pte_xxx() checks in follow_page_pte() are likewise unreliable
even though they run under the 'pte lock'.

Moreover, we should use huge_ptep_get() to read the pte entry value of a
CONT-PTE size hugetlb, since it folds in the subpages' dirty and young
bits; otherwise we may miss the dirty or young state of the CONT-PTE
size hugetlb.

To fix the above issues, introduce a new helper follow_huge_pte() to
look up a CONT-PTE size hugetlb page. It uses huge_pte_lock() to take
the correct pte entry lock so that the pte entry stays stable, and it
also handles non-present pte entries.

Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
---
 include/linux/hugetlb.h |  8 ++++++++
 mm/gup.c                | 11 ++++++++++
 mm/hugetlb.c            | 53 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 72 insertions(+)
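
For context on the locking and huge_ptep_get() claims in the commit
message: which lock huge_pte_lock() ends up taking depends on the
hstate, and arm64's huge_ptep_get() folds the per-subpage dirty/young
bits of a contiguous range back into the value it returns. Roughly
(paraphrased from include/linux/hugetlb.h and arch/arm64/mm/hugetlbpage.c
of that era, not the exact hunks this patch is against):

static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
					   struct mm_struct *mm, pte_t *pte)
{
	if (huge_page_size(h) == PMD_SIZE)
		return pmd_lockptr(mm, (pmd_t *) pte);
	/* CONT-PTE (and other non-PMD) sizes use the mm-wide lock */
	return &mm->page_table_lock;
}

pte_t huge_ptep_get(pte_t *ptep)
{
	pte_t orig_pte = ptep_get(ptep);
	int ncontig, i;
	size_t pgsize;

	if (!pte_present(orig_pte) || !pte_cont(orig_pte))
		return orig_pte;

	/* OR together the dirty/young bits of all contiguous entries */
	ncontig = num_contig_ptes(page_size(pte_page(orig_pte)), &pgsize);
	for (i = 0; i < ncontig; i++, ptep++) {
		pte_t pte = ptep_get(ptep);

		if (pte_dirty(pte))
			orig_pte = pte_mkdirty(orig_pte);
		if (pte_young(pte))
			orig_pte = pte_mkyoung(orig_pte);
	}
	return orig_pte;
}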

Comments

David Hildenbrand Aug. 23, 2022, 8:29 a.m. UTC | #1
On 23.08.22 09:50, Baolin Wang wrote:
> On some architectures (like ARM64), it can support CONT-PTE/PMD size
> hugetlb, which means it can support not only PMD/PUD size hugetlb
> (2M and 1G), but also CONT-PTE/PMD size(64K and 32M) if a 4K page size
> specified.
> 
> So when looking up a CONT-PTE size hugetlb page by follow_page(), it
> will use pte_offset_map_lock() to get the pte entry lock for the CONT-PTE
> size hugetlb in follow_page_pte(). However this pte entry lock is incorrect
> for the CONT-PTE size hugetlb, since we should use huge_pte_lock() to
> get the correct lock, which is mm->page_table_lock.
> 
> That means the pte entry of the CONT-PTE size hugetlb under current
> pte lock is unstable in follow_page_pte(), we can continue to migrate
> or poison the pte entry of the CONT-PTE size hugetlb, which can cause
> some potential race issues, and following pte_xxx() validation is also
> unstable in follow_page_pte(), even though they are under the 'pte lock'.
> 
> Moreover we should use huge_ptep_get() to get the pte entry value of
> the CONT-PTE size hugetlb, which already takes into account the subpages'
> dirty or young bits in case we missed the dirty or young state of the
> CONT-PTE size hugetlb.
> 
> To fix above issues, introducing a new helper follow_huge_pte() to look
> up a CONT-PTE size hugetlb page, which uses huge_pte_lock() to get the
> correct pte entry lock to make the pte entry stable, as well as
> supporting non-present pte handling.
> 
> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> ---
>  include/linux/hugetlb.h |  8 ++++++++
>  mm/gup.c                | 11 ++++++++++
>  mm/hugetlb.c            | 53 +++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 72 insertions(+)
> 
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 3ec981a..d491138 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -207,6 +207,8 @@ struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
>  struct page *follow_huge_pd(struct vm_area_struct *vma,
>  			    unsigned long address, hugepd_t hpd,
>  			    int flags, int pdshift);
> +struct page *follow_huge_pte(struct vm_area_struct *vma, unsigned long address,
> +			     pmd_t *pmd, int flags);
>  struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
>  				pmd_t *pmd, int flags);
>  struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
> @@ -312,6 +314,12 @@ static inline struct page *follow_huge_pd(struct vm_area_struct *vma,
>  	return NULL;
>  }
>  
> +static inline struct page *follow_huge_pte(struct vm_area_struct *vma,
> +				unsigned long address, pmd_t *pmd, int flags)
> +{
> +	return NULL;
> +}
> +
>  static inline struct page *follow_huge_pmd(struct mm_struct *mm,
>  				unsigned long address, pmd_t *pmd, int flags)
>  {
> diff --git a/mm/gup.c b/mm/gup.c
> index 3b656b7..87a94f5 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -534,6 +534,17 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
>  	if (unlikely(pmd_bad(*pmd)))
>  		return no_page_table(vma, flags);
>  
> +	/*
> +	 * Considering PTE level hugetlb, like continuous-PTE hugetlb on
> +	 * ARM64 architecture.
> +	 */
> +	if (is_vm_hugetlb_page(vma)) {
> +		page = follow_huge_pte(vma, address, pmd, flags);
> +		if (page)
> +			return page;
> +		return no_page_table(vma, flags);
> +	}
> +
>  	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
>  	pte = *ptep;
>  	if (!pte_present(pte)) {
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 6c00ba1..cf742d1 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -6981,6 +6981,59 @@ struct page * __weak
>  	return NULL;
>  }
>  
> +/* Support looking up a CONT-PTE size hugetlb page. */
> +struct page * __weak
> +follow_huge_pte(struct vm_area_struct *vma, unsigned long address,
> +		pmd_t *pmd, int flags)
> +{
> +	struct mm_struct *mm = vma->vm_mm;
> +	struct hstate *hstate = hstate_vma(vma);
> +	unsigned long size = huge_page_size(hstate);
> +	struct page *page = NULL;
> +	spinlock_t *ptl;
> +	pte_t *ptep, pte;
> +
> +	/*
> +	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
> +	 * follow_hugetlb_page().
> +	 */
> +	if (WARN_ON_ONCE(flags & FOLL_PIN))
> +		return NULL;
> +
> +	ptep = huge_pte_offset(mm, address, size);
> +	if (!ptep)
> +		return NULL;
> +
> +retry:
> +	ptl = huge_pte_lock(hstate, mm, ptep);
> +	pte = huge_ptep_get(ptep);
> +	if (pte_present(pte)) {
> +		page = pte_page(pte);
> +		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
> +			page = NULL;
> +			goto out;
> +		}
> +	} else {
> +		if (!(flags & FOLL_MIGRATION)) {
> +			page = NULL;
> +			goto out;
> +		}
> +
> +		if (is_hugetlb_entry_migration(pte)) {
> +			spin_unlock(ptl);
> +			__migration_entry_wait_huge(ptep, ptl);
> +			goto retry;
> +		}
> +		/*
> +		 * hwpoisoned entry is treated as no_page_table in
> +		 * follow_page_mask().
> +		 */
> +	}
> +out:
> +	spin_unlock(ptl);
> +	return page;
> +}
> +
>  struct page * __weak
>  follow_huge_pmd(struct mm_struct *mm, unsigned long address,
>  		pmd_t *pmd, int flags)


Can someone explain why:
* follow_page() goes via follow_page_mask() for hugetlb
* __get_user_pages() goes via follow_hugetlb_page() and never via
  follow_page_mask() for hugetlb?

IOW, why can't we make follow_page_mask() just not handle hugetlb and
route everything via follow_hugetlb_page() -- we primarily only have to
teach it to not trigger faults.
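
The two lookup paths in question are roughly the following (a simplified
sketch of the call flow at the time, not verbatim kernel code):

  follow_page()
    -> follow_page_mask()
         -> follow_huge_addr() / follow_huge_pgd() / follow_huge_pd() /
            follow_huge_pud() / follow_huge_pmd()    (hugetlb entries)
         -> follow_page_pte()                        (regular ptes)

  __get_user_pages()
    -> if (is_vm_hugetlb_page(vma))
           follow_hugetlb_page()     (separate hugetlb-aware walker that
                                      also handles faulting)
       else
           follow_page_mask(), then faultin_page() on a miss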


What's the reason that this hugetlb code has to be overly complicated?
Baolin Wang Aug. 23, 2022, 10:02 a.m. UTC | #2
On 8/23/2022 4:29 PM, David Hildenbrand wrote:
> On 23.08.22 09:50, Baolin Wang wrote:
>> On some architectures (like ARM64), it can support CONT-PTE/PMD size
>> hugetlb, which means it can support not only PMD/PUD size hugetlb
>> (2M and 1G), but also CONT-PTE/PMD size(64K and 32M) if a 4K page size
>> specified.
>>
>> So when looking up a CONT-PTE size hugetlb page by follow_page(), it
>> will use pte_offset_map_lock() to get the pte entry lock for the CONT-PTE
>> size hugetlb in follow_page_pte(). However this pte entry lock is incorrect
>> for the CONT-PTE size hugetlb, since we should use huge_pte_lock() to
>> get the correct lock, which is mm->page_table_lock.
>>
>> That means the pte entry of the CONT-PTE size hugetlb under current
>> pte lock is unstable in follow_page_pte(), we can continue to migrate
>> or poison the pte entry of the CONT-PTE size hugetlb, which can cause
>> some potential race issues, and following pte_xxx() validation is also
>> unstable in follow_page_pte(), even though they are under the 'pte lock'.
>>
>> Moreover we should use huge_ptep_get() to get the pte entry value of
>> the CONT-PTE size hugetlb, which already takes into account the subpages'
>> dirty or young bits in case we missed the dirty or young state of the
>> CONT-PTE size hugetlb.
>>
>> To fix above issues, introducing a new helper follow_huge_pte() to look
>> up a CONT-PTE size hugetlb page, which uses huge_pte_lock() to get the
>> correct pte entry lock to make the pte entry stable, as well as
>> supporting non-present pte handling.
>>
>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>> ---
>>   include/linux/hugetlb.h |  8 ++++++++
>>   mm/gup.c                | 11 ++++++++++
>>   mm/hugetlb.c            | 53 +++++++++++++++++++++++++++++++++++++++++++++++++
>>   3 files changed, 72 insertions(+)
>>
>> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
>> index 3ec981a..d491138 100644
>> --- a/include/linux/hugetlb.h
>> +++ b/include/linux/hugetlb.h
>> @@ -207,6 +207,8 @@ struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
>>   struct page *follow_huge_pd(struct vm_area_struct *vma,
>>   			    unsigned long address, hugepd_t hpd,
>>   			    int flags, int pdshift);
>> +struct page *follow_huge_pte(struct vm_area_struct *vma, unsigned long address,
>> +			     pmd_t *pmd, int flags);
>>   struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
>>   				pmd_t *pmd, int flags);
>>   struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
>> @@ -312,6 +314,12 @@ static inline struct page *follow_huge_pd(struct vm_area_struct *vma,
>>   	return NULL;
>>   }
>>   
>> +static inline struct page *follow_huge_pte(struct vm_area_struct *vma,
>> +				unsigned long address, pmd_t *pmd, int flags)
>> +{
>> +	return NULL;
>> +}
>> +
>>   static inline struct page *follow_huge_pmd(struct mm_struct *mm,
>>   				unsigned long address, pmd_t *pmd, int flags)
>>   {
>> diff --git a/mm/gup.c b/mm/gup.c
>> index 3b656b7..87a94f5 100644
>> --- a/mm/gup.c
>> +++ b/mm/gup.c
>> @@ -534,6 +534,17 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
>>   	if (unlikely(pmd_bad(*pmd)))
>>   		return no_page_table(vma, flags);
>>   
>> +	/*
>> +	 * Considering PTE level hugetlb, like continuous-PTE hugetlb on
>> +	 * ARM64 architecture.
>> +	 */
>> +	if (is_vm_hugetlb_page(vma)) {
>> +		page = follow_huge_pte(vma, address, pmd, flags);
>> +		if (page)
>> +			return page;
>> +		return no_page_table(vma, flags);
>> +	}
>> +
>>   	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
>>   	pte = *ptep;
>>   	if (!pte_present(pte)) {
>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>> index 6c00ba1..cf742d1 100644
>> --- a/mm/hugetlb.c
>> +++ b/mm/hugetlb.c
>> @@ -6981,6 +6981,59 @@ struct page * __weak
>>   	return NULL;
>>   }
>>   
>> +/* Support looking up a CONT-PTE size hugetlb page. */
>> +struct page * __weak
>> +follow_huge_pte(struct vm_area_struct *vma, unsigned long address,
>> +		pmd_t *pmd, int flags)
>> +{
>> +	struct mm_struct *mm = vma->vm_mm;
>> +	struct hstate *hstate = hstate_vma(vma);
>> +	unsigned long size = huge_page_size(hstate);
>> +	struct page *page = NULL;
>> +	spinlock_t *ptl;
>> +	pte_t *ptep, pte;
>> +
>> +	/*
>> +	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
>> +	 * follow_hugetlb_page().
>> +	 */
>> +	if (WARN_ON_ONCE(flags & FOLL_PIN))
>> +		return NULL;
>> +
>> +	ptep = huge_pte_offset(mm, address, size);
>> +	if (!ptep)
>> +		return NULL;
>> +
>> +retry:
>> +	ptl = huge_pte_lock(hstate, mm, ptep);
>> +	pte = huge_ptep_get(ptep);
>> +	if (pte_present(pte)) {
>> +		page = pte_page(pte);
>> +		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
>> +			page = NULL;
>> +			goto out;
>> +		}
>> +	} else {
>> +		if (!(flags & FOLL_MIGRATION)) {
>> +			page = NULL;
>> +			goto out;
>> +		}
>> +
>> +		if (is_hugetlb_entry_migration(pte)) {
>> +			spin_unlock(ptl);
>> +			__migration_entry_wait_huge(ptep, ptl);
>> +			goto retry;
>> +		}
>> +		/*
>> +		 * hwpoisoned entry is treated as no_page_table in
>> +		 * follow_page_mask().
>> +		 */
>> +	}
>> +out:
>> +	spin_unlock(ptl);
>> +	return page;
>> +}
>> +
>>   struct page * __weak
>>   follow_huge_pmd(struct mm_struct *mm, unsigned long address,
>>   		pmd_t *pmd, int flags)
> 
> 
> Can someone explain why:
> * follow_page() goes via follow_page_mask() for hugetlb
> * __get_user_pages() goes via follow_hugetlb_page() and never via
>    follow_page_mask() for hugetlb?
> 
> IOW, why can't we make follow_page_mask() just not handle hugetlb and
> route everything via follow_hugetlb_page() -- we primarily only have to
> teach it to not trigger faults.

IMHO, these follow_huge_xxx() functions were arch-specific at first and 
were moved into the common hugetlb.c by commit 9e5fc74c3025 ("mm: 
hugetlb: Copy general hugetlb code from x86 to mm"), and there are 
still some arch-specific follow_huge_xxx() definitions, for example:
ia64: follow_huge_addr
powerpc: follow_huge_pd
s390: follow_huge_pud

What I mean is that follow_hugetlb_page() is a common, non-arch-specific 
function; is it suitable to change it into an arch-specific one?
And thinking more, can we rename follow_hugetlb_page() to 
hugetlb_page_faultin() and simplify it to only handle the page faults of 
hugetlb, like faultin_page() does for normal pages? That way we can make 
sure only follow_page_mask() handles hugetlb lookups.

Mike, Muchun, please correct me if I missed something. Thanks.

> What's the reason that this hugetlb code has to be overly complicated?
David Hildenbrand Aug. 23, 2022, 10:23 a.m. UTC | #3
On 23.08.22 12:02, Baolin Wang wrote:
> 
> 
> On 8/23/2022 4:29 PM, David Hildenbrand wrote:
>> On 23.08.22 09:50, Baolin Wang wrote:
>>> On some architectures (like ARM64), it can support CONT-PTE/PMD size
>>> hugetlb, which means it can support not only PMD/PUD size hugetlb
>>> (2M and 1G), but also CONT-PTE/PMD size(64K and 32M) if a 4K page size
>>> specified.
>>>
>>> So when looking up a CONT-PTE size hugetlb page by follow_page(), it
>>> will use pte_offset_map_lock() to get the pte entry lock for the CONT-PTE
>>> size hugetlb in follow_page_pte(). However this pte entry lock is incorrect
>>> for the CONT-PTE size hugetlb, since we should use huge_pte_lock() to
>>> get the correct lock, which is mm->page_table_lock.
>>>
>>> That means the pte entry of the CONT-PTE size hugetlb under current
>>> pte lock is unstable in follow_page_pte(), we can continue to migrate
>>> or poison the pte entry of the CONT-PTE size hugetlb, which can cause
>>> some potential race issues, and following pte_xxx() validation is also
>>> unstable in follow_page_pte(), even though they are under the 'pte lock'.
>>>
>>> Moreover we should use huge_ptep_get() to get the pte entry value of
>>> the CONT-PTE size hugetlb, which already takes into account the subpages'
>>> dirty or young bits in case we missed the dirty or young state of the
>>> CONT-PTE size hugetlb.
>>>
>>> To fix above issues, introducing a new helper follow_huge_pte() to look
>>> up a CONT-PTE size hugetlb page, which uses huge_pte_lock() to get the
>>> correct pte entry lock to make the pte entry stable, as well as
>>> supporting non-present pte handling.
>>>
>>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>>> ---
>>>   include/linux/hugetlb.h |  8 ++++++++
>>>   mm/gup.c                | 11 ++++++++++
>>>   mm/hugetlb.c            | 53 +++++++++++++++++++++++++++++++++++++++++++++++++
>>>   3 files changed, 72 insertions(+)
>>>
>>> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
>>> index 3ec981a..d491138 100644
>>> --- a/include/linux/hugetlb.h
>>> +++ b/include/linux/hugetlb.h
>>> @@ -207,6 +207,8 @@ struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
>>>   struct page *follow_huge_pd(struct vm_area_struct *vma,
>>>   			    unsigned long address, hugepd_t hpd,
>>>   			    int flags, int pdshift);
>>> +struct page *follow_huge_pte(struct vm_area_struct *vma, unsigned long address,
>>> +			     pmd_t *pmd, int flags);
>>>   struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
>>>   				pmd_t *pmd, int flags);
>>>   struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
>>> @@ -312,6 +314,12 @@ static inline struct page *follow_huge_pd(struct vm_area_struct *vma,
>>>   	return NULL;
>>>   }
>>>   
>>> +static inline struct page *follow_huge_pte(struct vm_area_struct *vma,
>>> +				unsigned long address, pmd_t *pmd, int flags)
>>> +{
>>> +	return NULL;
>>> +}
>>> +
>>>   static inline struct page *follow_huge_pmd(struct mm_struct *mm,
>>>   				unsigned long address, pmd_t *pmd, int flags)
>>>   {
>>> diff --git a/mm/gup.c b/mm/gup.c
>>> index 3b656b7..87a94f5 100644
>>> --- a/mm/gup.c
>>> +++ b/mm/gup.c
>>> @@ -534,6 +534,17 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
>>>   	if (unlikely(pmd_bad(*pmd)))
>>>   		return no_page_table(vma, flags);
>>>   
>>> +	/*
>>> +	 * Considering PTE level hugetlb, like continuous-PTE hugetlb on
>>> +	 * ARM64 architecture.
>>> +	 */
>>> +	if (is_vm_hugetlb_page(vma)) {
>>> +		page = follow_huge_pte(vma, address, pmd, flags);
>>> +		if (page)
>>> +			return page;
>>> +		return no_page_table(vma, flags);
>>> +	}
>>> +
>>>   	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
>>>   	pte = *ptep;
>>>   	if (!pte_present(pte)) {
>>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>>> index 6c00ba1..cf742d1 100644
>>> --- a/mm/hugetlb.c
>>> +++ b/mm/hugetlb.c
>>> @@ -6981,6 +6981,59 @@ struct page * __weak
>>>   	return NULL;
>>>   }
>>>   
>>> +/* Support looking up a CONT-PTE size hugetlb page. */
>>> +struct page * __weak
>>> +follow_huge_pte(struct vm_area_struct *vma, unsigned long address,
>>> +		pmd_t *pmd, int flags)
>>> +{
>>> +	struct mm_struct *mm = vma->vm_mm;
>>> +	struct hstate *hstate = hstate_vma(vma);
>>> +	unsigned long size = huge_page_size(hstate);
>>> +	struct page *page = NULL;
>>> +	spinlock_t *ptl;
>>> +	pte_t *ptep, pte;
>>> +
>>> +	/*
>>> +	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
>>> +	 * follow_hugetlb_page().
>>> +	 */
>>> +	if (WARN_ON_ONCE(flags & FOLL_PIN))
>>> +		return NULL;
>>> +
>>> +	ptep = huge_pte_offset(mm, address, size);
>>> +	if (!ptep)
>>> +		return NULL;
>>> +
>>> +retry:
>>> +	ptl = huge_pte_lock(hstate, mm, ptep);
>>> +	pte = huge_ptep_get(ptep);
>>> +	if (pte_present(pte)) {
>>> +		page = pte_page(pte);
>>> +		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
>>> +			page = NULL;
>>> +			goto out;
>>> +		}
>>> +	} else {
>>> +		if (!(flags & FOLL_MIGRATION)) {
>>> +			page = NULL;
>>> +			goto out;
>>> +		}
>>> +
>>> +		if (is_hugetlb_entry_migration(pte)) {
>>> +			spin_unlock(ptl);
>>> +			__migration_entry_wait_huge(ptep, ptl);
>>> +			goto retry;
>>> +		}
>>> +		/*
>>> +		 * hwpoisoned entry is treated as no_page_table in
>>> +		 * follow_page_mask().
>>> +		 */
>>> +	}
>>> +out:
>>> +	spin_unlock(ptl);
>>> +	return page;
>>> +}
>>> +
>>>   struct page * __weak
>>>   follow_huge_pmd(struct mm_struct *mm, unsigned long address,
>>>   		pmd_t *pmd, int flags)
>>
>>
>> Can someone explain why:
>> * follow_page() goes via follow_page_mask() for hugetlb
>> * __get_user_pages() goes via follow_hugetlb_page() and never via
>>    follow_page_mask() for hugetlb?
>>
>> IOW, why can't we make follow_page_mask() just not handle hugetlb and
>> route everything via follow_hugetlb_page() -- we primarily only have to
>> teach it to not trigger faults.
> 
> IMHO, these follow_huge_xxx() functions are arch-specified at first and 
> were moved into the common hugetlb.c by commit 9e5fc74c3025 ("mm: 
> hugetlb: Copy general hugetlb code from x86 to mm"), and now there are 
> still some arch-specified follow_huge_xxx() definition, for example:
> ia64: follow_huge_addr
> powerpc: follow_huge_pd
> s390: follow_huge_pud
> 
> What I mean is that follow_hugetlb_page() is a common and 
> not-arch-specified function, is it suitable to change it to be 
> arch-specified?
> And thinking more, can we rename follow_hugetlb_page() as 
> hugetlb_page_faultin() and simplify it to only handle the page faults of 
> hugetlb like the faultin_page() for normal page? That means we can make 
> sure only follow_page_mask() can handle hugetlb.
> 

If follow_hugetlb_page() can be arch-independent, why do we need the
other arch-dependent functions?

It all looks a bit weird to have two functions that walk page tables and
are hugetlb aware.

Either this screams for a cleanup or I am missing something fundamental.
Mike Kravetz Aug. 23, 2022, 11:55 p.m. UTC | #4
On 08/23/22 12:23, David Hildenbrand wrote:
> On 23.08.22 12:02, Baolin Wang wrote:
> > 
> > 
> > On 8/23/2022 4:29 PM, David Hildenbrand wrote:
> >> On 23.08.22 09:50, Baolin Wang wrote:
> >>> On some architectures (like ARM64), it can support CONT-PTE/PMD size
> >>> hugetlb, which means it can support not only PMD/PUD size hugetlb
> >>> (2M and 1G), but also CONT-PTE/PMD size(64K and 32M) if a 4K page size
> >>> specified.
> >>>
> >>> So when looking up a CONT-PTE size hugetlb page by follow_page(), it
> >>> will use pte_offset_map_lock() to get the pte entry lock for the CONT-PTE
> >>> size hugetlb in follow_page_pte(). However this pte entry lock is incorrect
> >>> for the CONT-PTE size hugetlb, since we should use huge_pte_lock() to
> >>> get the correct lock, which is mm->page_table_lock.
> >>>
> >>> That means the pte entry of the CONT-PTE size hugetlb under current
> >>> pte lock is unstable in follow_page_pte(), we can continue to migrate
> >>> or poison the pte entry of the CONT-PTE size hugetlb, which can cause
> >>> some potential race issues, and following pte_xxx() validation is also
> >>> unstable in follow_page_pte(), even though they are under the 'pte lock'.
> >>>
> >>> Moreover we should use huge_ptep_get() to get the pte entry value of
> >>> the CONT-PTE size hugetlb, which already takes into account the subpages'
> >>> dirty or young bits in case we missed the dirty or young state of the
> >>> CONT-PTE size hugetlb.
> >>>
> >>> To fix above issues, introducing a new helper follow_huge_pte() to look
> >>> up a CONT-PTE size hugetlb page, which uses huge_pte_lock() to get the
> >>> correct pte entry lock to make the pte entry stable, as well as
> >>> supporting non-present pte handling.
> >>>
> >>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> >>> ---
> >>>   include/linux/hugetlb.h |  8 ++++++++
> >>>   mm/gup.c                | 11 ++++++++++
> >>>   mm/hugetlb.c            | 53 +++++++++++++++++++++++++++++++++++++++++++++++++
> >>>   3 files changed, 72 insertions(+)
> >>>
> >>> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> >>> index 3ec981a..d491138 100644
> >>> --- a/include/linux/hugetlb.h
> >>> +++ b/include/linux/hugetlb.h
> >>> @@ -207,6 +207,8 @@ struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
> >>>   struct page *follow_huge_pd(struct vm_area_struct *vma,
> >>>   			    unsigned long address, hugepd_t hpd,
> >>>   			    int flags, int pdshift);
> >>> +struct page *follow_huge_pte(struct vm_area_struct *vma, unsigned long address,
> >>> +			     pmd_t *pmd, int flags);
> >>>   struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
> >>>   				pmd_t *pmd, int flags);
> >>>   struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
> >>> @@ -312,6 +314,12 @@ static inline struct page *follow_huge_pd(struct vm_area_struct *vma,
> >>>   	return NULL;
> >>>   }
> >>>   
> >>> +static inline struct page *follow_huge_pte(struct vm_area_struct *vma,
> >>> +				unsigned long address, pmd_t *pmd, int flags)
> >>> +{
> >>> +	return NULL;
> >>> +}
> >>> +
> >>>   static inline struct page *follow_huge_pmd(struct mm_struct *mm,
> >>>   				unsigned long address, pmd_t *pmd, int flags)
> >>>   {
> >>> diff --git a/mm/gup.c b/mm/gup.c
> >>> index 3b656b7..87a94f5 100644
> >>> --- a/mm/gup.c
> >>> +++ b/mm/gup.c
> >>> @@ -534,6 +534,17 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
> >>>   	if (unlikely(pmd_bad(*pmd)))
> >>>   		return no_page_table(vma, flags);
> >>>   
> >>> +	/*
> >>> +	 * Considering PTE level hugetlb, like continuous-PTE hugetlb on
> >>> +	 * ARM64 architecture.
> >>> +	 */
> >>> +	if (is_vm_hugetlb_page(vma)) {
> >>> +		page = follow_huge_pte(vma, address, pmd, flags);
> >>> +		if (page)
> >>> +			return page;
> >>> +		return no_page_table(vma, flags);
> >>> +	}
> >>> +
> >>>   	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
> >>>   	pte = *ptep;
> >>>   	if (!pte_present(pte)) {
> >>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> >>> index 6c00ba1..cf742d1 100644
> >>> --- a/mm/hugetlb.c
> >>> +++ b/mm/hugetlb.c
> >>> @@ -6981,6 +6981,59 @@ struct page * __weak
> >>>   	return NULL;
> >>>   }
> >>>   
> >>> +/* Support looking up a CONT-PTE size hugetlb page. */
> >>> +struct page * __weak
> >>> +follow_huge_pte(struct vm_area_struct *vma, unsigned long address,
> >>> +		pmd_t *pmd, int flags)
> >>> +{
> >>> +	struct mm_struct *mm = vma->vm_mm;
> >>> +	struct hstate *hstate = hstate_vma(vma);
> >>> +	unsigned long size = huge_page_size(hstate);
> >>> +	struct page *page = NULL;
> >>> +	spinlock_t *ptl;
> >>> +	pte_t *ptep, pte;
> >>> +
> >>> +	/*
> >>> +	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
> >>> +	 * follow_hugetlb_page().
> >>> +	 */
> >>> +	if (WARN_ON_ONCE(flags & FOLL_PIN))
> >>> +		return NULL;
> >>> +
> >>> +	ptep = huge_pte_offset(mm, address, size);
> >>> +	if (!ptep)
> >>> +		return NULL;
> >>> +
> >>> +retry:
> >>> +	ptl = huge_pte_lock(hstate, mm, ptep);
> >>> +	pte = huge_ptep_get(ptep);
> >>> +	if (pte_present(pte)) {
> >>> +		page = pte_page(pte);
> >>> +		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
> >>> +			page = NULL;
> >>> +			goto out;
> >>> +		}
> >>> +	} else {
> >>> +		if (!(flags & FOLL_MIGRATION)) {
> >>> +			page = NULL;
> >>> +			goto out;
> >>> +		}
> >>> +
> >>> +		if (is_hugetlb_entry_migration(pte)) {
> >>> +			spin_unlock(ptl);
> >>> +			__migration_entry_wait_huge(ptep, ptl);
> >>> +			goto retry;
> >>> +		}
> >>> +		/*
> >>> +		 * hwpoisoned entry is treated as no_page_table in
> >>> +		 * follow_page_mask().
> >>> +		 */
> >>> +	}
> >>> +out:
> >>> +	spin_unlock(ptl);
> >>> +	return page;
> >>> +}
> >>> +
> >>>   struct page * __weak
> >>>   follow_huge_pmd(struct mm_struct *mm, unsigned long address,
> >>>   		pmd_t *pmd, int flags)
> >>
> >>
> >> Can someone explain why:
> >> * follow_page() goes via follow_page_mask() for hugetlb
> >> * __get_user_pages() goes via follow_hugetlb_page() and never via
> >>    follow_page_mask() for hugetlb?
> >>
> >> IOW, why can't we make follow_page_mask() just not handle hugetlb and
> >> route everything via follow_hugetlb_page() -- we primarily only have to
> >> teach it to not trigger faults.

I have no idea how we got into this situation, and do agree that it
makes little sense for both follow_page_mask and follow_hugetlb_page to
do page table walking differently for hugetlb pages.

I think I have noted elsewhere that all those follow_huge_p*d routines
will look the same.  It seems they were just added as needed when the
follow_page_mask page table walking code was fleshed out.  This also
needs a cleanup.  If we eliminate hugetlb handling from follow_page_mask,
perhaps we can get rid of all these?

> > 
> > IMHO, these follow_huge_xxx() functions are arch-specified at first and 
> > were moved into the common hugetlb.c by commit 9e5fc74c3025 ("mm: 
> > hugetlb: Copy general hugetlb code from x86 to mm"), and now there are 
> > still some arch-specified follow_huge_xxx() definition, for example:
> > ia64: follow_huge_addr
> > powerpc: follow_huge_pd
> > s390: follow_huge_pud
> > 
> > What I mean is that follow_hugetlb_page() is a common and 
> > not-arch-specified function, is it suitable to change it to be 
> > arch-specified?
> > And thinking more, can we rename follow_hugetlb_page() as 
> > hugetlb_page_faultin() and simplify it to only handle the page faults of 
> > hugetlb like the faultin_page() for normal page? That means we can make 
> > sure only follow_page_mask() can handle hugetlb.
> > 

Something like that might work, but you still have two page table walkers
for hugetlb.  I like David's idea (if I understand it correctly) of
using follow_hugetlb_page for both cases.  As noted, it will need to be
taught how to not trigger faults in the follow_page_mask case.
Baolin Wang Aug. 24, 2022, 2:06 a.m. UTC | #5
On 8/24/2022 7:55 AM, Mike Kravetz wrote:
> On 08/23/22 12:23, David Hildenbrand wrote:
>> On 23.08.22 12:02, Baolin Wang wrote:
>>>
>>>
>>> On 8/23/2022 4:29 PM, David Hildenbrand wrote:
>>>> On 23.08.22 09:50, Baolin Wang wrote:
>>>>> On some architectures (like ARM64), it can support CONT-PTE/PMD size
>>>>> hugetlb, which means it can support not only PMD/PUD size hugetlb
>>>>> (2M and 1G), but also CONT-PTE/PMD size(64K and 32M) if a 4K page size
>>>>> specified.
>>>>>
>>>>> So when looking up a CONT-PTE size hugetlb page by follow_page(), it
>>>>> will use pte_offset_map_lock() to get the pte entry lock for the CONT-PTE
>>>>> size hugetlb in follow_page_pte(). However this pte entry lock is incorrect
>>>>> for the CONT-PTE size hugetlb, since we should use huge_pte_lock() to
>>>>> get the correct lock, which is mm->page_table_lock.
>>>>>
>>>>> That means the pte entry of the CONT-PTE size hugetlb under current
>>>>> pte lock is unstable in follow_page_pte(), we can continue to migrate
>>>>> or poison the pte entry of the CONT-PTE size hugetlb, which can cause
>>>>> some potential race issues, and following pte_xxx() validation is also
>>>>> unstable in follow_page_pte(), even though they are under the 'pte lock'.
>>>>>
>>>>> Moreover we should use huge_ptep_get() to get the pte entry value of
>>>>> the CONT-PTE size hugetlb, which already takes into account the subpages'
>>>>> dirty or young bits in case we missed the dirty or young state of the
>>>>> CONT-PTE size hugetlb.
>>>>>
>>>>> To fix above issues, introducing a new helper follow_huge_pte() to look
>>>>> up a CONT-PTE size hugetlb page, which uses huge_pte_lock() to get the
>>>>> correct pte entry lock to make the pte entry stable, as well as
>>>>> supporting non-present pte handling.
>>>>>
>>>>> Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
>>>>> ---
>>>>>    include/linux/hugetlb.h |  8 ++++++++
>>>>>    mm/gup.c                | 11 ++++++++++
>>>>>    mm/hugetlb.c            | 53 +++++++++++++++++++++++++++++++++++++++++++++++++
>>>>>    3 files changed, 72 insertions(+)
>>>>>
>>>>> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
>>>>> index 3ec981a..d491138 100644
>>>>> --- a/include/linux/hugetlb.h
>>>>> +++ b/include/linux/hugetlb.h
>>>>> @@ -207,6 +207,8 @@ struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
>>>>>    struct page *follow_huge_pd(struct vm_area_struct *vma,
>>>>>    			    unsigned long address, hugepd_t hpd,
>>>>>    			    int flags, int pdshift);
>>>>> +struct page *follow_huge_pte(struct vm_area_struct *vma, unsigned long address,
>>>>> +			     pmd_t *pmd, int flags);
>>>>>    struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
>>>>>    				pmd_t *pmd, int flags);
>>>>>    struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
>>>>> @@ -312,6 +314,12 @@ static inline struct page *follow_huge_pd(struct vm_area_struct *vma,
>>>>>    	return NULL;
>>>>>    }
>>>>>    
>>>>> +static inline struct page *follow_huge_pte(struct vm_area_struct *vma,
>>>>> +				unsigned long address, pmd_t *pmd, int flags)
>>>>> +{
>>>>> +	return NULL;
>>>>> +}
>>>>> +
>>>>>    static inline struct page *follow_huge_pmd(struct mm_struct *mm,
>>>>>    				unsigned long address, pmd_t *pmd, int flags)
>>>>>    {
>>>>> diff --git a/mm/gup.c b/mm/gup.c
>>>>> index 3b656b7..87a94f5 100644
>>>>> --- a/mm/gup.c
>>>>> +++ b/mm/gup.c
>>>>> @@ -534,6 +534,17 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
>>>>>    	if (unlikely(pmd_bad(*pmd)))
>>>>>    		return no_page_table(vma, flags);
>>>>>    
>>>>> +	/*
>>>>> +	 * Considering PTE level hugetlb, like continuous-PTE hugetlb on
>>>>> +	 * ARM64 architecture.
>>>>> +	 */
>>>>> +	if (is_vm_hugetlb_page(vma)) {
>>>>> +		page = follow_huge_pte(vma, address, pmd, flags);
>>>>> +		if (page)
>>>>> +			return page;
>>>>> +		return no_page_table(vma, flags);
>>>>> +	}
>>>>> +
>>>>>    	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
>>>>>    	pte = *ptep;
>>>>>    	if (!pte_present(pte)) {
>>>>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>>>>> index 6c00ba1..cf742d1 100644
>>>>> --- a/mm/hugetlb.c
>>>>> +++ b/mm/hugetlb.c
>>>>> @@ -6981,6 +6981,59 @@ struct page * __weak
>>>>>    	return NULL;
>>>>>    }
>>>>>    
>>>>> +/* Support looking up a CONT-PTE size hugetlb page. */
>>>>> +struct page * __weak
>>>>> +follow_huge_pte(struct vm_area_struct *vma, unsigned long address,
>>>>> +		pmd_t *pmd, int flags)
>>>>> +{
>>>>> +	struct mm_struct *mm = vma->vm_mm;
>>>>> +	struct hstate *hstate = hstate_vma(vma);
>>>>> +	unsigned long size = huge_page_size(hstate);
>>>>> +	struct page *page = NULL;
>>>>> +	spinlock_t *ptl;
>>>>> +	pte_t *ptep, pte;
>>>>> +
>>>>> +	/*
>>>>> +	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
>>>>> +	 * follow_hugetlb_page().
>>>>> +	 */
>>>>> +	if (WARN_ON_ONCE(flags & FOLL_PIN))
>>>>> +		return NULL;
>>>>> +
>>>>> +	ptep = huge_pte_offset(mm, address, size);
>>>>> +	if (!ptep)
>>>>> +		return NULL;
>>>>> +
>>>>> +retry:
>>>>> +	ptl = huge_pte_lock(hstate, mm, ptep);
>>>>> +	pte = huge_ptep_get(ptep);
>>>>> +	if (pte_present(pte)) {
>>>>> +		page = pte_page(pte);
>>>>> +		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
>>>>> +			page = NULL;
>>>>> +			goto out;
>>>>> +		}
>>>>> +	} else {
>>>>> +		if (!(flags & FOLL_MIGRATION)) {
>>>>> +			page = NULL;
>>>>> +			goto out;
>>>>> +		}
>>>>> +
>>>>> +		if (is_hugetlb_entry_migration(pte)) {
>>>>> +			spin_unlock(ptl);
>>>>> +			__migration_entry_wait_huge(ptep, ptl);
>>>>> +			goto retry;
>>>>> +		}
>>>>> +		/*
>>>>> +		 * hwpoisoned entry is treated as no_page_table in
>>>>> +		 * follow_page_mask().
>>>>> +		 */
>>>>> +	}
>>>>> +out:
>>>>> +	spin_unlock(ptl);
>>>>> +	return page;
>>>>> +}
>>>>> +
>>>>>    struct page * __weak
>>>>>    follow_huge_pmd(struct mm_struct *mm, unsigned long address,
>>>>>    		pmd_t *pmd, int flags)
>>>>
>>>>
>>>> Can someone explain why:
>>>> * follow_page() goes via follow_page_mask() for hugetlb
>>>> * __get_user_pages() goes via follow_hugetlb_page() and never via
>>>>     follow_page_mask() for hugetlb?
>>>>
>>>> IOW, why can't we make follow_page_mask() just not handle hugetlb and
>>>> route everything via follow_hugetlb_page() -- we primarily only have to
>>>> teach it to not trigger faults.
> 
> I have no idea how we got into this situation, and do agree that it
> makes little sense for both follow_page_mask and follow_hugetlb_page to
> do page table walking differently for hugetlb pages.
> 
> I think I have noted elsewhere that all those follow_huge_p*d rotines
> will look the same.  It seems they were just added as needed when the
> follow_page_mask page table walking code was fleshed out.  This also
> needs a cleanup.  If we eliminate hugetlb handling from follow_page_mask,
> perhaps we can get rid of all these?
> 
>>>
>>> IMHO, these follow_huge_xxx() functions are arch-specified at first and
>>> were moved into the common hugetlb.c by commit 9e5fc74c3025 ("mm:
>>> hugetlb: Copy general hugetlb code from x86 to mm"), and now there are
>>> still some arch-specified follow_huge_xxx() definition, for example:
>>> ia64: follow_huge_addr
>>> powerpc: follow_huge_pd
>>> s390: follow_huge_pud
>>>
>>> What I mean is that follow_hugetlb_page() is a common and
>>> not-arch-specified function, is it suitable to change it to be
>>> arch-specified?
>>> And thinking more, can we rename follow_hugetlb_page() as
>>> hugetlb_page_faultin() and simplify it to only handle the page faults of
>>> hugetlb like the faultin_page() for normal page? That means we can make
>>> sure only follow_page_mask() can handle hugetlb.
>>>
> 
> Something like that might work, but you still have two page table walkers
> for hugetlb.  I like David's idea (if I understand it correctly) of

What I mean is that we could change the hugetlb handling to work like 
normal pages, roughly as follows (sketched in pseudocode below):
1) use follow_page_mask() to look up the hugetlb page first.
2) if the hugetlb page cannot be found, try to fault it in via 
hugetlb_page_faultin().
3) if the page fault succeeds, retry the lookup via follow_page_mask().

Just a rough thought; both my idea and David's idea need more 
investigation.

> using follow_hugetlb_page for both cases.  As noted, it will need to be
> taught how to not trigger faults in the follow_page_mask case.

Anyway, I also agree we need some cleanup; as a first step I think we 
should clean up the arch-specific follow_huge_xxx() implementations on 
architectures where they are similar to the common ones. I will look 
into these.

However, since the cleanup needs more investigation and refactoring, for 
now I would prefer to get the bug-fix patches of this patchset into 
mainline first, since they are suitable for backporting to older kernel 
versions to fix potential race issues. Mike and David, what do you 
think? Could you help to review these patches? Thanks.
David Hildenbrand Aug. 24, 2022, 7:31 a.m. UTC | #6
>>>>
>>>> IMHO, these follow_huge_xxx() functions are arch-specified at first and
>>>> were moved into the common hugetlb.c by commit 9e5fc74c3025 ("mm:
>>>> hugetlb: Copy general hugetlb code from x86 to mm"), and now there are
>>>> still some arch-specified follow_huge_xxx() definition, for example:
>>>> ia64: follow_huge_addr
>>>> powerpc: follow_huge_pd
>>>> s390: follow_huge_pud
>>>>
>>>> What I mean is that follow_hugetlb_page() is a common and
>>>> not-arch-specified function, is it suitable to change it to be
>>>> arch-specified?
>>>> And thinking more, can we rename follow_hugetlb_page() as
>>>> hugetlb_page_faultin() and simplify it to only handle the page faults of
>>>> hugetlb like the faultin_page() for normal page? That means we can make
>>>> sure only follow_page_mask() can handle hugetlb.
>>>>
>>
>> Something like that might work, but you still have two page table walkers
>> for hugetlb.  I like David's idea (if I understand it correctly) of
> 
> What I mean is we may change the hugetlb handling like normal page:
> 1) use follow_page_mask() to look up a hugetlb firstly.
> 2) if can not get the hugetlb, then try to page fault by 
> hugetlb_page_faultin().
> 3) if page fault successed, then retry to find hugetlb by 
> follow_page_mask().

That implies putting more hugetlbfs special code into generic GUP,
making it even more complicated. But of course, it depends on what the
end result looks like. My gut feeling was that hugetlb is better handled
in follow_hugetlb_page() separately (just like we do with a lot of other
page table walkers).

> 
> Just a rough thought, and I need more investigation for my idea and 
> David's idea.
> 
>> using follow_hugetlb_page for both cases.  As noted, it will need to be
>> taught how to not trigger faults in the follow_page_mask case.
> 
> Anyway, I also agree we need some cleanup, and firstly I think we should 
> cleanup these arch-specified follow_huge_xxx() on some architectures 
> which are similar with the common ones. I will look into these.

There was a recent discussion on that, e.g.:

https://lkml.kernel.org/r/20220818135717.609eef8a@thinkpad

> 
> However, considering cleanup may need more investigation and 
> refactoring, now I prefer to make these bug-fix patches of this patchset 
> into mainline firstly, which are suitable to backport to old version to 
> fix potential race issues. Mike and David, how do you think? Could you 
> help to review these patches? Thanks.

Patch #1 certainly adds more special code just to handle another hugetlb
corner case (CONT pages), and maybe just making it all use
follow_hugetlb_page() would be even cleaner and less error prone.

I agree that locking is shaky, but I'm not sure if we really want to
backport this to stable trees:

https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html

"It must fix a real bug that bothers people (not a, “This could be a
problem...” type thing)."


Do we actually have any instance of this being a real (and not a
theoretical) problem? If not, I'd rather clean it all up right away.
Baolin Wang Aug. 24, 2022, 9:41 a.m. UTC | #7
On 8/24/2022 3:31 PM, David Hildenbrand wrote:
>>>>>
>>>>> IMHO, these follow_huge_xxx() functions are arch-specified at first and
>>>>> were moved into the common hugetlb.c by commit 9e5fc74c3025 ("mm:
>>>>> hugetlb: Copy general hugetlb code from x86 to mm"), and now there are
>>>>> still some arch-specified follow_huge_xxx() definition, for example:
>>>>> ia64: follow_huge_addr
>>>>> powerpc: follow_huge_pd
>>>>> s390: follow_huge_pud
>>>>>
>>>>> What I mean is that follow_hugetlb_page() is a common and
>>>>> not-arch-specified function, is it suitable to change it to be
>>>>> arch-specified?
>>>>> And thinking more, can we rename follow_hugetlb_page() as
>>>>> hugetlb_page_faultin() and simplify it to only handle the page faults of
>>>>> hugetlb like the faultin_page() for normal page? That means we can make
>>>>> sure only follow_page_mask() can handle hugetlb.
>>>>>
>>>
>>> Something like that might work, but you still have two page table walkers
>>> for hugetlb.  I like David's idea (if I understand it correctly) of
>>
>> What I mean is we may change the hugetlb handling like normal page:
>> 1) use follow_page_mask() to look up a hugetlb firstly.
>> 2) if can not get the hugetlb, then try to page fault by
>> hugetlb_page_faultin().
>> 3) if page fault successed, then retry to find hugetlb by
>> follow_page_mask().
> 
> That implies putting more hugetlbfs special code into generic GUP,
> turning it even more complicated. But of course, it depends on how the
> end result looks like. My gut feeling was that hugetlb is better handled
> in follow_hugetlb_page() separately (just like we do with a lot of other
> page table walkers).

OK, fair enough.

>>
>> Just a rough thought, and I need more investigation for my idea and
>> David's idea.
>>
>>> using follow_hugetlb_page for both cases.  As noted, it will need to be
>>> taught how to not trigger faults in the follow_page_mask case.
>>
>> Anyway, I also agree we need some cleanup, and firstly I think we should
>> cleanup these arch-specified follow_huge_xxx() on some architectures
>> which are similar with the common ones. I will look into these.
> 
> There was a recent discussion on that, e.g.:
> 
> https://lkml.kernel.org/r/20220818135717.609eef8a@thinkpad

Thanks.

> 
>>
>> However, considering cleanup may need more investigation and
>> refactoring, now I prefer to make these bug-fix patches of this patchset
>> into mainline firstly, which are suitable to backport to old version to
>> fix potential race issues. Mike and David, how do you think? Could you
>> help to review these patches? Thanks.
> 
> Patch #1 certainly add more special code just to handle another hugetlb
> corner case (CONT pages), and maybe just making it all use
> follow_hugetlb_page() would be even cleaner and less error prone.
> 
> I agree that locking is shaky, but I'm not sure if we really want to
> backport this to stable trees:
> 
> https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html
> 
> "It must fix a real bug that bothers people (not a, “This could be a
> problem...” type thing)."
> 
> 
> Do we actually have any instance of this being a real (and not a
> theoretical) problem? If not, I'd rather clean it all up right away.

I think this is a real problem (not theoretical), and it is easy to 
write some code to show the issue. For example, suppose thread A is 
trying to look up a CONT-PTE size hugetlb page under the lock while 
another thread B migrates that hugetlb page at the same time; thread A 
can then end up with an incorrect page, and if thread A tries to do 
something with that incorrect page, an error occurs.

Actually, we also want to backport these fixes to distros running old 
kernel versions to make hugetlb more stable. Otherwise we will hit 
these issues sooner or later once customers use CONT-PTE/PMD hugetlb.

Anyway, if you and Mike still think these issues are not important 
enough to be fixed in the old versions, I can do the cleanup first.
David Hildenbrand Aug. 24, 2022, 11:55 a.m. UTC | #8
On 24.08.22 11:41, Baolin Wang wrote:
> 
> 
> On 8/24/2022 3:31 PM, David Hildenbrand wrote:
>>>>>>
>>>>>> IMHO, these follow_huge_xxx() functions are arch-specified at first and
>>>>>> were moved into the common hugetlb.c by commit 9e5fc74c3025 ("mm:
>>>>>> hugetlb: Copy general hugetlb code from x86 to mm"), and now there are
>>>>>> still some arch-specified follow_huge_xxx() definition, for example:
>>>>>> ia64: follow_huge_addr
>>>>>> powerpc: follow_huge_pd
>>>>>> s390: follow_huge_pud
>>>>>>
>>>>>> What I mean is that follow_hugetlb_page() is a common and
>>>>>> not-arch-specified function, is it suitable to change it to be
>>>>>> arch-specified?
>>>>>> And thinking more, can we rename follow_hugetlb_page() as
>>>>>> hugetlb_page_faultin() and simplify it to only handle the page faults of
>>>>>> hugetlb like the faultin_page() for normal page? That means we can make
>>>>>> sure only follow_page_mask() can handle hugetlb.
>>>>>>
>>>>
>>>> Something like that might work, but you still have two page table walkers
>>>> for hugetlb.  I like David's idea (if I understand it correctly) of
>>>
>>> What I mean is we may change the hugetlb handling like normal page:
>>> 1) use follow_page_mask() to look up a hugetlb firstly.
>>> 2) if can not get the hugetlb, then try to page fault by
>>> hugetlb_page_faultin().
>>> 3) if page fault successed, then retry to find hugetlb by
>>> follow_page_mask().
>>
>> That implies putting more hugetlbfs special code into generic GUP,
>> turning it even more complicated. But of course, it depends on how the
>> end result looks like. My gut feeling was that hugetlb is better handled
>> in follow_hugetlb_page() separately (just like we do with a lot of other
>> page table walkers).
> 
> OK, fair enough.
> 
>>>
>>> Just a rough thought, and I need more investigation for my idea and
>>> David's idea.
>>>
>>>> using follow_hugetlb_page for both cases.  As noted, it will need to be
>>>> taught how to not trigger faults in the follow_page_mask case.
>>>
>>> Anyway, I also agree we need some cleanup, and firstly I think we should
>>> cleanup these arch-specified follow_huge_xxx() on some architectures
>>> which are similar with the common ones. I will look into these.
>>
>> There was a recent discussion on that, e.g.:
>>
>> https://lkml.kernel.org/r/20220818135717.609eef8a@thinkpad
> 
> Thanks.
> 
>>
>>>
>>> However, considering cleanup may need more investigation and
>>> refactoring, now I prefer to make these bug-fix patches of this patchset
>>> into mainline firstly, which are suitable to backport to old version to
>>> fix potential race issues. Mike and David, how do you think? Could you
>>> help to review these patches? Thanks.
>>
>> Patch #1 certainly add more special code just to handle another hugetlb
>> corner case (CONT pages), and maybe just making it all use
>> follow_hugetlb_page() would be even cleaner and less error prone.
>>
>> I agree that locking is shaky, but I'm not sure if we really want to
>> backport this to stable trees:
>>
>> https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html
>>
>> "It must fix a real bug that bothers people (not a, “This could be a
>> problem...” type thing)."
>>
>>
>> Do we actually have any instance of this being a real (and not a
>> theoretical) problem? If not, I'd rather clean it all up right away.
> 
> I think this is a real problem (not theoretical), and easy to write some 
> code to show the issue. For example, suppose thread A is trying to look 
> up a CONT-PTE size hugetlb page under the lock, however antoher thread B 
> can migrate the CONT-PTE hugetlb page at the same time, which will cause 
> thread A to get an incorrect page, if thread A want to do something for 
> this incorrect page, error occurs.
> 
> Actually we also want to backport these fixes to the distro with old 
> kernel versions to make the hugetlb more stable. Otherwise we must hit 
> these issues sooner or later if the customers use CONT-PTE/PMD hugetlb.
> 
> Anyway, if you and Mike still think these issues are not important 
> enough to be fixed in the old versions, I can do the cleanup firstly.
> 

[asking myself which follow_page() users actually care about hugetlb,
and why we need this handling in follow_page at all]

Which follow_page() user do we care about here? Primarily mm/migrate.c
only I assume?
Baolin Wang Aug. 24, 2022, 2:30 p.m. UTC | #9
On 8/24/2022 7:55 PM, David Hildenbrand wrote:
> On 24.08.22 11:41, Baolin Wang wrote:
>>
>>
>> On 8/24/2022 3:31 PM, David Hildenbrand wrote:
>>>>>>>
>>>>>>> IMHO, these follow_huge_xxx() functions are arch-specified at first and
>>>>>>> were moved into the common hugetlb.c by commit 9e5fc74c3025 ("mm:
>>>>>>> hugetlb: Copy general hugetlb code from x86 to mm"), and now there are
>>>>>>> still some arch-specified follow_huge_xxx() definition, for example:
>>>>>>> ia64: follow_huge_addr
>>>>>>> powerpc: follow_huge_pd
>>>>>>> s390: follow_huge_pud
>>>>>>>
>>>>>>> What I mean is that follow_hugetlb_page() is a common and
>>>>>>> not-arch-specified function, is it suitable to change it to be
>>>>>>> arch-specified?
>>>>>>> And thinking more, can we rename follow_hugetlb_page() as
>>>>>>> hugetlb_page_faultin() and simplify it to only handle the page faults of
>>>>>>> hugetlb like the faultin_page() for normal page? That means we can make
>>>>>>> sure only follow_page_mask() can handle hugetlb.
>>>>>>>
>>>>>
>>>>> Something like that might work, but you still have two page table walkers
>>>>> for hugetlb.  I like David's idea (if I understand it correctly) of
>>>>
>>>> What I mean is we may change the hugetlb handling like normal page:
>>>> 1) use follow_page_mask() to look up a hugetlb firstly.
>>>> 2) if can not get the hugetlb, then try to page fault by
>>>> hugetlb_page_faultin().
>>>> 3) if page fault successed, then retry to find hugetlb by
>>>> follow_page_mask().
>>>
>>> That implies putting more hugetlbfs special code into generic GUP,
>>> turning it even more complicated. But of course, it depends on how the
>>> end result looks like. My gut feeling was that hugetlb is better handled
>>> in follow_hugetlb_page() separately (just like we do with a lot of other
>>> page table walkers).
>>
>> OK, fair enough.
>>
>>>>
>>>> Just a rough thought, and I need more investigation for my idea and
>>>> David's idea.
>>>>
>>>>> using follow_hugetlb_page for both cases.  As noted, it will need to be
>>>>> taught how to not trigger faults in the follow_page_mask case.
>>>>
>>>> Anyway, I also agree we need some cleanup, and firstly I think we should
>>>> cleanup these arch-specified follow_huge_xxx() on some architectures
>>>> which are similar with the common ones. I will look into these.
>>>
>>> There was a recent discussion on that, e.g.:
>>>
>>> https://lkml.kernel.org/r/20220818135717.609eef8a@thinkpad
>>
>> Thanks.
>>
>>>
>>>>
>>>> However, considering cleanup may need more investigation and
>>>> refactoring, now I prefer to make these bug-fix patches of this patchset
>>>> into mainline firstly, which are suitable to backport to old version to
>>>> fix potential race issues. Mike and David, how do you think? Could you
>>>> help to review these patches? Thanks.
>>>
>>> Patch #1 certainly add more special code just to handle another hugetlb
>>> corner case (CONT pages), and maybe just making it all use
>>> follow_hugetlb_page() would be even cleaner and less error prone.
>>>
>>> I agree that locking is shaky, but I'm not sure if we really want to
>>> backport this to stable trees:
>>>
>>> https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html
>>>
>>> "It must fix a real bug that bothers people (not a, “This could be a
>>> problem...” type thing)."
>>>
>>>
>>> Do we actually have any instance of this being a real (and not a
>>> theoretical) problem? If not, I'd rather clean it all up right away.
>>
>> I think this is a real problem (not theoretical), and easy to write some
>> code to show the issue. For example, suppose thread A is trying to look
>> up a CONT-PTE size hugetlb page under the lock, however antoher thread B
>> can migrate the CONT-PTE hugetlb page at the same time, which will cause
>> thread A to get an incorrect page, if thread A want to do something for
>> this incorrect page, error occurs.
>>
>> Actually we also want to backport these fixes to the distro with old
>> kernel versions to make the hugetlb more stable. Otherwise we must hit
>> these issues sooner or later if the customers use CONT-PTE/PMD hugetlb.
>>
>> Anyway, if you and Mike still think these issues are not important
>> enough to be fixed in the old versions, I can do the cleanup firstly.
>>
> 
> [asking myself which follow_page() users actually care about hugetlb,
> and why we need this handling in follow_page at all]
> 
> Which follow_page() user do we care about here? Primarily mm/migrate.c
> only I assume?

Right, I think it mainly affects the move_pages() syscall. I cannot 
know all of the users of the move_pages() syscall now or in the future 
in our data center, but as I said, the move_pages() syscall + hugetlb 
can be a real potential stability issue.
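
To make that more concrete, a minimal userspace sketch of the direction
described above could look like the following. This is a hypothetical
reproducer (not something posted in this thread); it assumes an arm64
kernel with 4K base pages, a populated 64K hugetlb pool
(hugepages-64kB/nr_hugepages > 0) and at least two NUMA nodes, and it
links with -lnuma:

#define _GNU_SOURCE
#include <numaif.h>
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MAP_HUGE_SHIFT
#define MAP_HUGE_SHIFT	26
#endif
#define MAP_HUGE_64KB	(16 << MAP_HUGE_SHIFT)	/* log2(64K) == 16 */

static void *hugepage;

static void *mover(void *arg)
{
	(void)arg;

	for (int i = 0; i < 1000000; i++) {
		void *pages[1] = { hugepage };
		int nodes[1] = { i & 1 };	/* bounce between node 0 and 1 */
		int status[1];

		/* move_pages() ends up in follow_page() on this mapping */
		move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE);
	}
	return NULL;
}

int main(void)
{
	size_t len = 64 * 1024;
	pthread_t t;

	hugepage = mmap(NULL, len, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_HUGE_64KB,
			-1, 0);
	if (hugepage == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB | MAP_HUGE_64KB)");
		return 1;
	}
	memset(hugepage, 0, len);	/* populate the CONT-PTE hugetlb page */

	/* two threads racing migration/lookup of the same hugetlb page */
	pthread_create(&t, NULL, mover, NULL);
	mover(NULL);
	pthread_join(t, NULL);
	return 0;
}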
David Hildenbrand Aug. 24, 2022, 2:33 p.m. UTC | #10
On 24.08.22 16:30, Baolin Wang wrote:
> 
> 
> On 8/24/2022 7:55 PM, David Hildenbrand wrote:
>> On 24.08.22 11:41, Baolin Wang wrote:
>>>
>>>
>>> On 8/24/2022 3:31 PM, David Hildenbrand wrote:
>>>>>>>>
>>>>>>>> IMHO, these follow_huge_xxx() functions are arch-specified at first and
>>>>>>>> were moved into the common hugetlb.c by commit 9e5fc74c3025 ("mm:
>>>>>>>> hugetlb: Copy general hugetlb code from x86 to mm"), and now there are
>>>>>>>> still some arch-specified follow_huge_xxx() definition, for example:
>>>>>>>> ia64: follow_huge_addr
>>>>>>>> powerpc: follow_huge_pd
>>>>>>>> s390: follow_huge_pud
>>>>>>>>
>>>>>>>> What I mean is that follow_hugetlb_page() is a common and
>>>>>>>> not-arch-specified function, is it suitable to change it to be
>>>>>>>> arch-specified?
>>>>>>>> And thinking more, can we rename follow_hugetlb_page() as
>>>>>>>> hugetlb_page_faultin() and simplify it to only handle the page faults of
>>>>>>>> hugetlb like the faultin_page() for normal page? That means we can make
>>>>>>>> sure only follow_page_mask() can handle hugetlb.
>>>>>>>>
>>>>>>
>>>>>> Something like that might work, but you still have two page table walkers
>>>>>> for hugetlb.  I like David's idea (if I understand it correctly) of
>>>>>
>>>>> What I mean is we may change the hugetlb handling like normal page:
>>>>> 1) use follow_page_mask() to look up a hugetlb firstly.
>>>>> 2) if can not get the hugetlb, then try to page fault by
>>>>> hugetlb_page_faultin().
>>>>> 3) if page fault successed, then retry to find hugetlb by
>>>>> follow_page_mask().
>>>>
>>>> That implies putting more hugetlbfs special code into generic GUP,
>>>> turning it even more complicated. But of course, it depends on how the
>>>> end result looks like. My gut feeling was that hugetlb is better handled
>>>> in follow_hugetlb_page() separately (just like we do with a lot of other
>>>> page table walkers).
>>>
>>> OK, fair enough.
>>>
>>>>>
>>>>> Just a rough thought, and I need more investigation for my idea and
>>>>> David's idea.
>>>>>
>>>>>> using follow_hugetlb_page for both cases.  As noted, it will need to be
>>>>>> taught how to not trigger faults in the follow_page_mask case.
>>>>>
>>>>> Anyway, I also agree we need some cleanup, and firstly I think we should
>>>>> cleanup these arch-specified follow_huge_xxx() on some architectures
>>>>> which are similar with the common ones. I will look into these.
>>>>
>>>> There was a recent discussion on that, e.g.:
>>>>
>>>> https://lkml.kernel.org/r/20220818135717.609eef8a@thinkpad
>>>
>>> Thanks.
>>>
>>>>
>>>>>
>>>>> However, considering cleanup may need more investigation and
>>>>> refactoring, now I prefer to make these bug-fix patches of this patchset
>>>>> into mainline firstly, which are suitable to backport to old version to
>>>>> fix potential race issues. Mike and David, how do you think? Could you
>>>>> help to review these patches? Thanks.
>>>>
>>>> Patch #1 certainly add more special code just to handle another hugetlb
>>>> corner case (CONT pages), and maybe just making it all use
>>>> follow_hugetlb_page() would be even cleaner and less error prone.
>>>>
>>>> I agree that locking is shaky, but I'm not sure if we really want to
>>>> backport this to stable trees:
>>>>
>>>> https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html
>>>>
>>>> "It must fix a real bug that bothers people (not a, “This could be a
>>>> problem...” type thing)."
>>>>
>>>>
>>>> Do we actually have any instance of this being a real (and not a
>>>> theoretical) problem? If not, I'd rather clean it all up right away.
>>>
>>> I think this is a real problem (not theoretical), and easy to write some
>>> code to show the issue. For example, suppose thread A is trying to look
>>> up a CONT-PTE size hugetlb page under the lock, however antoher thread B
>>> can migrate the CONT-PTE hugetlb page at the same time, which will cause
>>> thread A to get an incorrect page, if thread A want to do something for
>>> this incorrect page, error occurs.
>>>
>>> Actually we also want to backport these fixes to the distro with old
>>> kernel versions to make the hugetlb more stable. Otherwise we must hit
>>> these issues sooner or later if the customers use CONT-PTE/PMD hugetlb.
>>>
>>> Anyway, if you and Mike still think these issues are not important
>>> enough to be fixed in the old versions, I can do the cleanup firstly.
>>>
>>
>> [asking myself which follow_page() users actually care about hugetlb,
>> and why we need this handling in follow_page at all]
>>
>> Which follow_page() user do we care about here? Primarily mm/migrate.c
>> only I assume?
> 
> Right, mainly affects the move_pages() syscall I think. Yes, I can not 
> know all of the users of the move_pages() syscall now or in the future 
> in our data center, but like I said the move_pages() syscall + hugetlb 
> can be a real potential stability issue.
> 

I wonder if we can get rid of follow_page() completely; there are not
too many users. Or alternatively, simply make it use the general GUP
infrastructure more clearly. We'd need something like FOLL_NOFAULT that
also covers "absolutely no faults".
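
Just to make that concrete, a minimal sketch of the direction I mean
(purely illustrative, not a patch: it assumes FOLL_NOFAULT could be given
the stronger "absolutely no faults" semantics described above, and that
__get_user_pages() remains reachable from follow_page() in mm/gup.c):

/* Sketch only: follow_page() as a thin wrapper around __get_user_pages(). */
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
			 unsigned int foll_flags)
{
	struct page *page;
	long ret;

	if (vma_is_secretmem(vma))
		return NULL;

	/* Never trigger a fault; only report what is currently mapped. */
	foll_flags |= FOLL_NOFAULT;

	ret = __get_user_pages(vma->vm_mm, address, 1, foll_flags,
			       &page, NULL, NULL);
	return ret == 1 ? page : NULL;
}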
Baolin Wang Aug. 24, 2022, 3:06 p.m. UTC | #11
On 8/24/2022 10:33 PM, David Hildenbrand wrote:
> On 24.08.22 16:30, Baolin Wang wrote:
>>
>>
>> On 8/24/2022 7:55 PM, David Hildenbrand wrote:
>>> On 24.08.22 11:41, Baolin Wang wrote:
>>>>
>>>>
>>>> On 8/24/2022 3:31 PM, David Hildenbrand wrote:
>>>>>>>>>
>>>>>>>>> IMHO, these follow_huge_xxx() functions are arch-specified at first and
>>>>>>>>> were moved into the common hugetlb.c by commit 9e5fc74c3025 ("mm:
>>>>>>>>> hugetlb: Copy general hugetlb code from x86 to mm"), and now there are
>>>>>>>>> still some arch-specified follow_huge_xxx() definition, for example:
>>>>>>>>> ia64: follow_huge_addr
>>>>>>>>> powerpc: follow_huge_pd
>>>>>>>>> s390: follow_huge_pud
>>>>>>>>>
>>>>>>>>> What I mean is that follow_hugetlb_page() is a common and
>>>>>>>>> not-arch-specified function, is it suitable to change it to be
>>>>>>>>> arch-specified?
>>>>>>>>> And thinking more, can we rename follow_hugetlb_page() as
>>>>>>>>> hugetlb_page_faultin() and simplify it to only handle the page faults of
>>>>>>>>> hugetlb like the faultin_page() for normal page? That means we can make
>>>>>>>>> sure only follow_page_mask() can handle hugetlb.
>>>>>>>>>
>>>>>>>
>>>>>>> Something like that might work, but you still have two page table walkers
>>>>>>> for hugetlb.  I like David's idea (if I understand it correctly) of
>>>>>>
>>>>>> What I mean is we may change the hugetlb handling like normal page:
>>>>>> 1) use follow_page_mask() to look up a hugetlb firstly.
>>>>>> 2) if can not get the hugetlb, then try to page fault by
>>>>>> hugetlb_page_faultin().
>>>>>> 3) if page fault successed, then retry to find hugetlb by
>>>>>> follow_page_mask().
>>>>>
>>>>> That implies putting more hugetlbfs special code into generic GUP,
>>>>> turning it even more complicated. But of course, it depends on how the
>>>>> end result looks like. My gut feeling was that hugetlb is better handled
>>>>> in follow_hugetlb_page() separately (just like we do with a lot of other
>>>>> page table walkers).
>>>>
>>>> OK, fair enough.
>>>>
>>>>>>
>>>>>> Just a rough thought, and I need more investigation for my idea and
>>>>>> David's idea.
>>>>>>
>>>>>>> using follow_hugetlb_page for both cases.  As noted, it will need to be
>>>>>>> taught how to not trigger faults in the follow_page_mask case.
>>>>>>
>>>>>> Anyway, I also agree we need some cleanup, and firstly I think we should
>>>>>> cleanup these arch-specified follow_huge_xxx() on some architectures
>>>>>> which are similar with the common ones. I will look into these.
>>>>>
>>>>> There was a recent discussion on that, e.g.:
>>>>>
>>>>> https://lkml.kernel.org/r/20220818135717.609eef8a@thinkpad
>>>>
>>>> Thanks.
>>>>
>>>>>
>>>>>>
>>>>>> However, considering cleanup may need more investigation and
>>>>>> refactoring, now I prefer to make these bug-fix patches of this patchset
>>>>>> into mainline firstly, which are suitable to backport to old version to
>>>>>> fix potential race issues. Mike and David, how do you think? Could you
>>>>>> help to review these patches? Thanks.
>>>>>
>>>>> Patch #1 certainly add more special code just to handle another hugetlb
>>>>> corner case (CONT pages), and maybe just making it all use
>>>>> follow_hugetlb_page() would be even cleaner and less error prone.
>>>>>
>>>>> I agree that locking is shaky, but I'm not sure if we really want to
>>>>> backport this to stable trees:
>>>>>
>>>>> https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html
>>>>>
>>>>> "It must fix a real bug that bothers people (not a, “This could be a
>>>>> problem...” type thing)."
>>>>>
>>>>>
>>>>> Do we actually have any instance of this being a real (and not a
>>>>> theoretical) problem? If not, I'd rather clean it all up right away.
>>>>
>>>> I think this is a real problem (not theoretical), and easy to write some
>>>> code to show the issue. For example, suppose thread A is trying to look
>>>> up a CONT-PTE size hugetlb page under the lock, however antoher thread B
>>>> can migrate the CONT-PTE hugetlb page at the same time, which will cause
>>>> thread A to get an incorrect page, if thread A want to do something for
>>>> this incorrect page, error occurs.
>>>>
>>>> Actually we also want to backport these fixes to the distro with old
>>>> kernel versions to make the hugetlb more stable. Otherwise we must hit
>>>> these issues sooner or later if the customers use CONT-PTE/PMD hugetlb.
>>>>
>>>> Anyway, if you and Mike still think these issues are not important
>>>> enough to be fixed in the old versions, I can do the cleanup firstly.
>>>>
>>>
>>> [asking myself which follow_page() users actually care about hugetlb,
>>> and why we need this handling in follow_page at all]
>>>
>>> Which follow_page() user do we care about here? Primarily mm/migrate.c
>>> only I assume?
>>
>> Right, mainly affects the move_pages() syscall I think. Yes, I can not
>> know all of the users of the move_pages() syscall now or in the future
>> in our data center, but like I said the move_pages() syscall + hugetlb
>> can be a real potential stability issue.
>>
> 
> I wonder if we can get rid of follow_page() completely, there are not
> too many users. Or alternatively simply make it use general GUP
> infrastructure more clearly. We'd need something like FOLL_NOFAULT that
> also covers "absolutely no faults".

I am not sure I get your point. Do you mean changing to use 
__get_user_pages() (or similar wrappers) to look up a normal page or a 
hugetlb page instead of follow_page(), and adding a new FOLL_NOFAULT 
flag to __get_user_pages()?

If I understand correctly, we still need more work to move those 
arch-specific follow_huge_xxx() into follow_hugetlb_page() first, like 
we discussed before, which does not seem backportable either.

I am not against your idea, and I also agree that we should do some 
cleanup. But the point is whether we need to backport patches to fix 
this issue, which affects the move_pages() syscall; if the answer is 
yes, I think my current fixes are suitable to backport.
David Hildenbrand Aug. 24, 2022, 3:13 p.m. UTC | #12
On 24.08.22 17:06, Baolin Wang wrote:
> 
> 
> On 8/24/2022 10:33 PM, David Hildenbrand wrote:
>> On 24.08.22 16:30, Baolin Wang wrote:
>>>
>>>
>>> On 8/24/2022 7:55 PM, David Hildenbrand wrote:
>>>> On 24.08.22 11:41, Baolin Wang wrote:
>>>>>
>>>>>
>>>>> On 8/24/2022 3:31 PM, David Hildenbrand wrote:
>>>>>>>>>>
>>>>>>>>>> IMHO, these follow_huge_xxx() functions are arch-specified at first and
>>>>>>>>>> were moved into the common hugetlb.c by commit 9e5fc74c3025 ("mm:
>>>>>>>>>> hugetlb: Copy general hugetlb code from x86 to mm"), and now there are
>>>>>>>>>> still some arch-specified follow_huge_xxx() definition, for example:
>>>>>>>>>> ia64: follow_huge_addr
>>>>>>>>>> powerpc: follow_huge_pd
>>>>>>>>>> s390: follow_huge_pud
>>>>>>>>>>
>>>>>>>>>> What I mean is that follow_hugetlb_page() is a common and
>>>>>>>>>> not-arch-specified function, is it suitable to change it to be
>>>>>>>>>> arch-specified?
>>>>>>>>>> And thinking more, can we rename follow_hugetlb_page() as
>>>>>>>>>> hugetlb_page_faultin() and simplify it to only handle the page faults of
>>>>>>>>>> hugetlb like the faultin_page() for normal page? That means we can make
>>>>>>>>>> sure only follow_page_mask() can handle hugetlb.
>>>>>>>>>>
>>>>>>>>
>>>>>>>> Something like that might work, but you still have two page table walkers
>>>>>>>> for hugetlb.  I like David's idea (if I understand it correctly) of
>>>>>>>
>>>>>>> What I mean is we may change the hugetlb handling like normal page:
>>>>>>> 1) use follow_page_mask() to look up a hugetlb firstly.
>>>>>>> 2) if can not get the hugetlb, then try to page fault by
>>>>>>> hugetlb_page_faultin().
>>>>>>> 3) if page fault successed, then retry to find hugetlb by
>>>>>>> follow_page_mask().
>>>>>>
>>>>>> That implies putting more hugetlbfs special code into generic GUP,
>>>>>> turning it even more complicated. But of course, it depends on how the
>>>>>> end result looks like. My gut feeling was that hugetlb is better handled
>>>>>> in follow_hugetlb_page() separately (just like we do with a lot of other
>>>>>> page table walkers).
>>>>>
>>>>> OK, fair enough.
>>>>>
>>>>>>>
>>>>>>> Just a rough thought, and I need more investigation for my idea and
>>>>>>> David's idea.
>>>>>>>
>>>>>>>> using follow_hugetlb_page for both cases.  As noted, it will need to be
>>>>>>>> taught how to not trigger faults in the follow_page_mask case.
>>>>>>>
>>>>>>> Anyway, I also agree we need some cleanup, and firstly I think we should
>>>>>>> cleanup these arch-specified follow_huge_xxx() on some architectures
>>>>>>> which are similar with the common ones. I will look into these.
>>>>>>
>>>>>> There was a recent discussion on that, e.g.:
>>>>>>
>>>>>> https://lkml.kernel.org/r/20220818135717.609eef8a@thinkpad
>>>>>
>>>>> Thanks.
>>>>>
>>>>>>
>>>>>>>
>>>>>>> However, considering cleanup may need more investigation and
>>>>>>> refactoring, now I prefer to make these bug-fix patches of this patchset
>>>>>>> into mainline firstly, which are suitable to backport to old version to
>>>>>>> fix potential race issues. Mike and David, how do you think? Could you
>>>>>>> help to review these patches? Thanks.
>>>>>>
>>>>>> Patch #1 certainly add more special code just to handle another hugetlb
>>>>>> corner case (CONT pages), and maybe just making it all use
>>>>>> follow_hugetlb_page() would be even cleaner and less error prone.
>>>>>>
>>>>>> I agree that locking is shaky, but I'm not sure if we really want to
>>>>>> backport this to stable trees:
>>>>>>
>>>>>> https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html
>>>>>>
>>>>>> "It must fix a real bug that bothers people (not a, “This could be a
>>>>>> problem...” type thing)."
>>>>>>
>>>>>>
>>>>>> Do we actually have any instance of this being a real (and not a
>>>>>> theoretical) problem? If not, I'd rather clean it all up right away.
>>>>>
>>>>> I think this is a real problem (not theoretical), and easy to write some
>>>>> code to show the issue. For example, suppose thread A is trying to look
>>>>> up a CONT-PTE size hugetlb page under the lock, however antoher thread B
>>>>> can migrate the CONT-PTE hugetlb page at the same time, which will cause
>>>>> thread A to get an incorrect page, if thread A want to do something for
>>>>> this incorrect page, error occurs.
>>>>>
>>>>> Actually we also want to backport these fixes to the distro with old
>>>>> kernel versions to make the hugetlb more stable. Otherwise we must hit
>>>>> these issues sooner or later if the customers use CONT-PTE/PMD hugetlb.
>>>>>
>>>>> Anyway, if you and Mike still think these issues are not important
>>>>> enough to be fixed in the old versions, I can do the cleanup firstly.
>>>>>
>>>>
>>>> [asking myself which follow_page() users actually care about hugetlb,
>>>> and why we need this handling in follow_page at all]
>>>>
>>>> Which follow_page() user do we care about here? Primarily mm/migrate.c
>>>> only I assume?
>>>
>>> Right, mainly affects the move_pages() syscall I think. Yes, I can not
>>> know all of the users of the move_pages() syscall now or in the future
>>> in our data center, but like I said the move_pages() syscall + hugetlb
>>> can be a real potential stability issue.
>>>
>>
>> I wonder if we can get rid of follow_page() completely, there are not
>> too many users. Or alternatively simply make it use general GUP
>> infrastructure more clearly. We'd need something like FOLL_NOFAULT that
>> also covers "absolutely no faults".
> 
> I am not sure I get your point. So you want change to use 
> __get_user_pages() (or silimar wrappers) to look up a normal page or 
> hugetlb instead of follow_page()? and adding a new FOLL_NOFAULT flag to 
> __get_user_pages().

Essentially just getting rid of follow_page() completely or making it a
wrapper of __get_user_pages().

> 
> If I understand correctly, we still need more work to move those 
> arch-specified follow_huge_xxx() into follow_hugetlb_page() firstly like 
> we disscussed before? Which seems not backportable too.

I'm not sure we need all that magic in these arch specific helpers after
all. I haven't looked into the details, but I really wonder why they
handle something that follow_hugetlb_page() cannot easily handle. It all
smells like legacy cruft.

> 
> I am not againt your idea, and I also agree that we should do some 
> cleanup. But the point is if we need backport patches to fix this issue, 
> which affects move_pages() syscall, if the answer is yes, I think my 
> current fixes are suitable to backport.

I really don't like adding more make-legacy-cruft-happy code unless
there is *real* need for it. (you could always just fix old kernels you
care about with your patches here -- do they have to be in mainline?
don't think so)

But of course, it's up to Mike to decide, just my 2 cents :)
Baolin Wang Aug. 24, 2022, 3:23 p.m. UTC | #13
On 8/24/2022 11:13 PM, David Hildenbrand wrote:
> On 24.08.22 17:06, Baolin Wang wrote:
>>
>>
>> On 8/24/2022 10:33 PM, David Hildenbrand wrote:
>>> On 24.08.22 16:30, Baolin Wang wrote:
>>>>
>>>>
>>>> On 8/24/2022 7:55 PM, David Hildenbrand wrote:
>>>>> On 24.08.22 11:41, Baolin Wang wrote:
>>>>>>
>>>>>>
>>>>>> On 8/24/2022 3:31 PM, David Hildenbrand wrote:
>>>>>>>>>>>
>>>>>>>>>>> IMHO, these follow_huge_xxx() functions are arch-specified at first and
>>>>>>>>>>> were moved into the common hugetlb.c by commit 9e5fc74c3025 ("mm:
>>>>>>>>>>> hugetlb: Copy general hugetlb code from x86 to mm"), and now there are
>>>>>>>>>>> still some arch-specified follow_huge_xxx() definition, for example:
>>>>>>>>>>> ia64: follow_huge_addr
>>>>>>>>>>> powerpc: follow_huge_pd
>>>>>>>>>>> s390: follow_huge_pud
>>>>>>>>>>>
>>>>>>>>>>> What I mean is that follow_hugetlb_page() is a common and
>>>>>>>>>>> not-arch-specified function, is it suitable to change it to be
>>>>>>>>>>> arch-specified?
>>>>>>>>>>> And thinking more, can we rename follow_hugetlb_page() as
>>>>>>>>>>> hugetlb_page_faultin() and simplify it to only handle the page faults of
>>>>>>>>>>> hugetlb like the faultin_page() for normal page? That means we can make
>>>>>>>>>>> sure only follow_page_mask() can handle hugetlb.
>>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Something like that might work, but you still have two page table walkers
>>>>>>>>> for hugetlb.  I like David's idea (if I understand it correctly) of
>>>>>>>>
>>>>>>>> What I mean is we may change the hugetlb handling like normal page:
>>>>>>>> 1) use follow_page_mask() to look up a hugetlb firstly.
>>>>>>>> 2) if can not get the hugetlb, then try to page fault by
>>>>>>>> hugetlb_page_faultin().
>>>>>>>> 3) if page fault successed, then retry to find hugetlb by
>>>>>>>> follow_page_mask().
>>>>>>>
>>>>>>> That implies putting more hugetlbfs special code into generic GUP,
>>>>>>> turning it even more complicated. But of course, it depends on how the
>>>>>>> end result looks like. My gut feeling was that hugetlb is better handled
>>>>>>> in follow_hugetlb_page() separately (just like we do with a lot of other
>>>>>>> page table walkers).
>>>>>>
>>>>>> OK, fair enough.
>>>>>>
>>>>>>>>
>>>>>>>> Just a rough thought, and I need more investigation for my idea and
>>>>>>>> David's idea.
>>>>>>>>
>>>>>>>>> using follow_hugetlb_page for both cases.  As noted, it will need to be
>>>>>>>>> taught how to not trigger faults in the follow_page_mask case.
>>>>>>>>
>>>>>>>> Anyway, I also agree we need some cleanup, and firstly I think we should
>>>>>>>> cleanup these arch-specified follow_huge_xxx() on some architectures
>>>>>>>> which are similar with the common ones. I will look into these.
>>>>>>>
>>>>>>> There was a recent discussion on that, e.g.:
>>>>>>>
>>>>>>> https://lkml.kernel.org/r/20220818135717.609eef8a@thinkpad
>>>>>>
>>>>>> Thanks.
>>>>>>
>>>>>>>
>>>>>>>>
>>>>>>>> However, considering cleanup may need more investigation and
>>>>>>>> refactoring, now I prefer to make these bug-fix patches of this patchset
>>>>>>>> into mainline firstly, which are suitable to backport to old version to
>>>>>>>> fix potential race issues. Mike and David, how do you think? Could you
>>>>>>>> help to review these patches? Thanks.
>>>>>>>
>>>>>>> Patch #1 certainly add more special code just to handle another hugetlb
>>>>>>> corner case (CONT pages), and maybe just making it all use
>>>>>>> follow_hugetlb_page() would be even cleaner and less error prone.
>>>>>>>
>>>>>>> I agree that locking is shaky, but I'm not sure if we really want to
>>>>>>> backport this to stable trees:
>>>>>>>
>>>>>>> https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html
>>>>>>>
>>>>>>> "It must fix a real bug that bothers people (not a, “This could be a
>>>>>>> problem...” type thing)."
>>>>>>>
>>>>>>>
>>>>>>> Do we actually have any instance of this being a real (and not a
>>>>>>> theoretical) problem? If not, I'd rather clean it all up right away.
>>>>>>
>>>>>> I think this is a real problem (not theoretical), and easy to write some
>>>>>> code to show the issue. For example, suppose thread A is trying to look
>>>>>> up a CONT-PTE size hugetlb page under the lock, however antoher thread B
>>>>>> can migrate the CONT-PTE hugetlb page at the same time, which will cause
>>>>>> thread A to get an incorrect page, if thread A want to do something for
>>>>>> this incorrect page, error occurs.
>>>>>>
>>>>>> Actually we also want to backport these fixes to the distro with old
>>>>>> kernel versions to make the hugetlb more stable. Otherwise we must hit
>>>>>> these issues sooner or later if the customers use CONT-PTE/PMD hugetlb.
>>>>>>
>>>>>> Anyway, if you and Mike still think these issues are not important
>>>>>> enough to be fixed in the old versions, I can do the cleanup firstly.
>>>>>>
>>>>>
>>>>> [asking myself which follow_page() users actually care about hugetlb,
>>>>> and why we need this handling in follow_page at all]
>>>>>
>>>>> Which follow_page() user do we care about here? Primarily mm/migrate.c
>>>>> only I assume?
>>>>
>>>> Right, mainly affects the move_pages() syscall I think. Yes, I can not
>>>> know all of the users of the move_pages() syscall now or in the future
>>>> in our data center, but like I said the move_pages() syscall + hugetlb
>>>> can be a real potential stability issue.
>>>>
>>>
>>> I wonder if we can get rid of follow_page() completely, there are not
>>> too many users. Or alternatively simply make it use general GUP
>>> infrastructure more clearly. We'd need something like FOLL_NOFAULT that
>>> also covers "absolutely no faults".
>>
>> I am not sure I get your point. So you want change to use
>> __get_user_pages() (or silimar wrappers) to look up a normal page or
>> hugetlb instead of follow_page()? and adding a new FOLL_NOFAULT flag to
>> __get_user_pages().
> 
> Essentially just getting rid of follow_page() completely or making it a
> wrapper of __get_user_pages().

OK.

> 
>>
>> If I understand correctly, we still need more work to move those
>> arch-specified follow_huge_xxx() into follow_hugetlb_page() firstly like
>> we disscussed before? Which seems not backportable too.
> 
> I'm not sure we need all that magic in these arch specific helpers after
> all. I haven't looked into the details, but I really wonder why they
> handle something that follow_hugetlb_page() cannot easily handle. It all
> smells like legacy cruft.

Agreed. I am not sure whether there is some historical legacy issue 
here; it needs more investigation.

> 
>>
>> I am not againt your idea, and I also agree that we should do some
>> cleanup. But the point is if we need backport patches to fix this issue,
>> which affects move_pages() syscall, if the answer is yes, I think my
>> current fixes are suitable to backport.
> 
> I really don't like adding more make-legacy-cruft-happy code unless
> there is *real* need for it. (you could always just fix old kernels you
> care about with your patches here -- do they have to be in mainline?

They do not have to be, but it would be better...

> don't think so)
> 
> But of course, it's up to Mike to decide, just my 2 cents :)

Thanks for your useful comments, and let's see what Mike's view is :)
Mike Kravetz Aug. 24, 2022, 11:34 p.m. UTC | #14
On 08/24/22 17:41, Baolin Wang wrote:
> 
> 
> On 8/24/2022 3:31 PM, David Hildenbrand wrote:
> > > > > > 
> > > > > > IMHO, these follow_huge_xxx() functions are arch-specified at first and
> > > > > > were moved into the common hugetlb.c by commit 9e5fc74c3025 ("mm:
> > > > > > hugetlb: Copy general hugetlb code from x86 to mm"), and now there are
> > > > > > still some arch-specified follow_huge_xxx() definition, for example:
> > > > > > ia64: follow_huge_addr
> > > > > > powerpc: follow_huge_pd
> > > > > > s390: follow_huge_pud
> > > > > > 
> > > > > > What I mean is that follow_hugetlb_page() is a common and
> > > > > > not-arch-specified function, is it suitable to change it to be
> > > > > > arch-specified?
> > > > > > And thinking more, can we rename follow_hugetlb_page() as
> > > > > > hugetlb_page_faultin() and simplify it to only handle the page faults of
> > > > > > hugetlb like the faultin_page() for normal page? That means we can make
> > > > > > sure only follow_page_mask() can handle hugetlb.
> > > > > > 
> > > > 
> > > > Something like that might work, but you still have two page table walkers
> > > > for hugetlb.  I like David's idea (if I understand it correctly) of
> > > 
> > > What I mean is we may change the hugetlb handling like normal page:
> > > 1) use follow_page_mask() to look up a hugetlb firstly.
> > > 2) if can not get the hugetlb, then try to page fault by
> > > hugetlb_page_faultin().
> > > 3) if page fault successed, then retry to find hugetlb by
> > > follow_page_mask().
> > 
> > That implies putting more hugetlbfs special code into generic GUP,
> > turning it even more complicated. But of course, it depends on how the
> > end result looks like. My gut feeling was that hugetlb is better handled
> > in follow_hugetlb_page() separately (just like we do with a lot of other
> > page table walkers).
> 
> OK, fair enough.
> 
> > > 
> > > Just a rough thought, and I need more investigation for my idea and
> > > David's idea.
> > > 
> > > > using follow_hugetlb_page for both cases.  As noted, it will need to be
> > > > taught how to not trigger faults in the follow_page_mask case.
> > > 
> > > Anyway, I also agree we need some cleanup, and firstly I think we should
> > > cleanup these arch-specified follow_huge_xxx() on some architectures
> > > which are similar with the common ones. I will look into these.
> > 
> > There was a recent discussion on that, e.g.:
> > 
> > https://lkml.kernel.org/r/20220818135717.609eef8a@thinkpad
> 
> Thanks.
> 
> > 
> > > 
> > > However, considering cleanup may need more investigation and
> > > refactoring, now I prefer to make these bug-fix patches of this patchset
> > > into mainline firstly, which are suitable to backport to old version to
> > > fix potential race issues. Mike and David, how do you think? Could you
> > > help to review these patches? Thanks.
> > 
> > Patch #1 certainly add more special code just to handle another hugetlb
> > corner case (CONT pages), and maybe just making it all use
> > follow_hugetlb_page() would be even cleaner and less error prone.
> > 
> > I agree that locking is shaky, but I'm not sure if we really want to
> > backport this to stable trees:
> > 
> > https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html
> > 
> > "It must fix a real bug that bothers people (not a, “This could be a
> > problem...” type thing)."
> > 
> > 
> > Do we actually have any instance of this being a real (and not a
> > theoretical) problem? If not, I'd rather clean it all up right away.
> 
> I think this is a real problem (not theoretical), and easy to write some
> code to show the issue. For example, suppose thread A is trying to look up a
> CONT-PTE size hugetlb page under the lock, however antoher thread B can
> migrate the CONT-PTE hugetlb page at the same time, which will cause thread
> A to get an incorrect page, if thread A want to do something for this
> incorrect page, error occurs.

Is the primary concern the locking?  If so, I am not sure we have an issue.
As mentioned in your commit message, current code will use
pte_offset_map_lock().  pte_offset_map_lock uses pte_lockptr, and pte_lockptr
will either be the mm wide lock or pmd_page lock.  To me, it seems that
either would provide correct synchronization for CONT-PTE entries.  Am I
missing something or misreading the code?

I started looking at code cleanup suggested by David.  Here is a quick
patch (not tested and likely containing errors) to see if this is a step
in the right direction.

I like it because we get rid of/combine all those follow_huge_p*d
routines.

From 35d117a707c1567ddf350554298697d40eace0d7 Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Wed, 24 Aug 2022 15:59:15 -0700
Subject: [PATCH] hugetlb: call hugetlb_follow_page_mask for hugetlb pages in
 follow_page_mask

At the beginning of follow_page_mask, there currently is a call to
follow_huge_addr which 'may' handle hugetlb pages.  ia64 is the only
architecture which (incorrectly) provides a follow_huge_addr routine
that does not return error.  Instead, at each level of the page table a
check is made for a hugetlb entry.  If a hugetlb entry is found, a call
to a routine associated with that page table level such as
follow_huge_pmd is made.

All the follow_huge_p*d routines are basically the same.  In addition
huge page size can be derived from the vma, so we know where in the page
table a huge page would reside.  So, replace follow_huge_addr with a
new architecture independent routine which will provide the same
functionality as the follow_huge_p*d routines.  We can then eliminate
the p*d_huge checks in follow_page_mask page table walking as well as
the follow_huge_p*d routines themselves.

follow_page_mask still has is_hugepd hugetlb checks during page table
walking.  This is due to these checks and follow_huge_pd being
architecture specific.  These can be eliminated if
hugetlb_follow_page_mask can be overwritten by architectures (powerpc)
that need to do follow_huge_pd processing.

Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
---
 arch/ia64/mm/hugetlbpage.c |  15 ----
 arch/s390/mm/hugetlbpage.c |  10 ---
 include/linux/hugetlb.h    |  41 +++-------
 mm/gup.c                   |  27 +------
 mm/hugetlb.c               | 159 ++++++++++++-------------------------
 5 files changed, 62 insertions(+), 190 deletions(-)

diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index f993cb36c062..380d2f3966c9 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -91,21 +91,6 @@ int prepare_hugepage_range(struct file *file,
 	return 0;
 }
 
-struct page *follow_huge_addr(struct mm_struct *mm, unsigned long addr, int write)
-{
-	struct page *page;
-	pte_t *ptep;
-
-	if (REGION_NUMBER(addr) != RGN_HPAGE)
-		return ERR_PTR(-EINVAL);
-
-	ptep = huge_pte_offset(mm, addr, HPAGE_SIZE);
-	if (!ptep || pte_none(*ptep))
-		return NULL;
-	page = pte_page(*ptep);
-	page += ((addr & ~HPAGE_MASK) >> PAGE_SHIFT);
-	return page;
-}
 int pmd_huge(pmd_t pmd)
 {
 	return 0;
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index 10e51ef9c79a..c299a18273ff 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -237,16 +237,6 @@ int pud_huge(pud_t pud)
 	return pud_large(pud);
 }
 
-struct page *
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
-		pud_t *pud, int flags)
-{
-	if (flags & FOLL_GET)
-		return NULL;
-
-	return pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
-}
-
 bool __init arch_hugetlb_valid_size(unsigned long size)
 {
 	if (MACHINE_HAS_EDAT1 && size == PMD_SIZE)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 3ec981a0d8b3..0c19d200c851 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -142,6 +142,8 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
 			     unsigned long len);
 int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *,
 			    struct vm_area_struct *, struct vm_area_struct *);
+struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
+                              unsigned long address, unsigned int flags);
 long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
 			 struct page **, struct vm_area_struct **,
 			 unsigned long *, unsigned long *, long, unsigned int,
@@ -202,17 +204,9 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 				unsigned long addr, pte_t *ptep);
 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
 				unsigned long *start, unsigned long *end);
-struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
-			      int write);
 struct page *follow_huge_pd(struct vm_area_struct *vma,
 			    unsigned long address, hugepd_t hpd,
 			    int flags, int pdshift);
-struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-				pmd_t *pmd, int flags);
-struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
-				pud_t *pud, int flags);
-struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address,
-			     pgd_t *pgd, int flags);
 
 int pmd_huge(pmd_t pmd);
 int pud_huge(pud_t pud);
@@ -257,6 +251,13 @@ static inline void adjust_range_if_pmd_sharing_possible(
 {
 }
 
+static inline struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
+                              unsigned long address, unsigned int flags)
+{
+	/* should never happen, but do not want to BUG */
+	return ERR_PTR(-EINVAL);
+}
+
 static inline long follow_hugetlb_page(struct mm_struct *mm,
 			struct vm_area_struct *vma, struct page **pages,
 			struct vm_area_struct **vmas, unsigned long *position,
@@ -267,12 +268,6 @@ static inline long follow_hugetlb_page(struct mm_struct *mm,
 	return 0;
 }
 
-static inline struct page *follow_huge_addr(struct mm_struct *mm,
-					unsigned long address, int write)
-{
-	return ERR_PTR(-EINVAL);
-}
-
 static inline int copy_hugetlb_page_range(struct mm_struct *dst,
 					  struct mm_struct *src,
 					  struct vm_area_struct *dst_vma,
@@ -312,24 +307,6 @@ static inline struct page *follow_huge_pd(struct vm_area_struct *vma,
 	return NULL;
 }
 
-static inline struct page *follow_huge_pmd(struct mm_struct *mm,
-				unsigned long address, pmd_t *pmd, int flags)
-{
-	return NULL;
-}
-
-static inline struct page *follow_huge_pud(struct mm_struct *mm,
-				unsigned long address, pud_t *pud, int flags)
-{
-	return NULL;
-}
-
-static inline struct page *follow_huge_pgd(struct mm_struct *mm,
-				unsigned long address, pgd_t *pgd, int flags)
-{
-	return NULL;
-}
-
 static inline int prepare_hugepage_range(struct file *file,
 				unsigned long addr, unsigned long len)
 {
diff --git a/mm/gup.c b/mm/gup.c
index 3b656b7e8a3c..a93c04437faa 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -661,12 +661,6 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 	pmdval = READ_ONCE(*pmd);
 	if (pmd_none(pmdval))
 		return no_page_table(vma, flags);
-	if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
-		page = follow_huge_pmd(mm, address, pmd, flags);
-		if (page)
-			return page;
-		return no_page_table(vma, flags);
-	}
 	if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
 		page = follow_huge_pd(vma, address,
 				      __hugepd(pmd_val(pmdval)), flags,
@@ -764,12 +758,6 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
 	pud = pud_offset(p4dp, address);
 	if (pud_none(*pud))
 		return no_page_table(vma, flags);
-	if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
-		page = follow_huge_pud(mm, address, pud, flags);
-		if (page)
-			return page;
-		return no_page_table(vma, flags);
-	}
 	if (is_hugepd(__hugepd(pud_val(*pud)))) {
 		page = follow_huge_pd(vma, address,
 				      __hugepd(pud_val(*pud)), flags,
@@ -851,24 +839,15 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
 
 	ctx->page_mask = 0;
 
-	/* make this handle hugepd */
-	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
-	if (!IS_ERR(page)) {
-		WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
-		return page;
-	}
+	/* hugetlb is special */
+	if (is_vm_hugetlb_page(vma))
+		return hugetlb_follow_page_mask(vma, address, flags);
 
 	pgd = pgd_offset(mm, address);
 
 	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
 		return no_page_table(vma, flags);
 
-	if (pgd_huge(*pgd)) {
-		page = follow_huge_pgd(mm, address, pgd, flags);
-		if (page)
-			return page;
-		return no_page_table(vma, flags);
-	}
 	if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
 		page = follow_huge_pd(vma, address,
 				      __hugepd(pgd_val(*pgd)), flags,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6c00ba1dde32..947401df8190 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6168,6 +6168,56 @@ static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
 	return false;
 }
 
+struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
+				unsigned long address, unsigned int flags)
+{
+	struct hstate *h = hstate_vma(vma);
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long haddr = address & huge_page_mask(h);
+	struct page *page = NULL;
+	spinlock_t *ptl;
+	pte_t *pte, entry;
+
+	/*
+	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
+	 * follow_hugetlb_page().
+	 */
+	if (WARN_ON_ONCE(flags & FOLL_PIN))
+		return NULL;
+
+	pte = huge_pte_offset(mm, haddr, huge_page_size(h));
+	if (!pte)
+		return NULL;
+
+retry:
+	ptl = huge_pte_lock(h, mm, pte);
+	entry = huge_ptep_get(pte);
+	if (pte_present(entry)) {
+		page = pte_page(entry);
+		/*
+		 * try_grab_page() should always succeed here, because we hold
+		 * the ptl lock and have verified pte_present().
+		 */
+		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
+			page = NULL;
+			goto out;
+		}
+	} else {
+		if (is_hugetlb_entry_migration(entry)) {
+			spin_unlock(ptl);
+			__migration_entry_wait_huge(pte, ptl);
+			goto retry;
+		}
+		/*
+		 * hwpoisoned entry is treated as no_page_table in
+		 * follow_page_mask().
+		 */
+	}
+out:
+	spin_unlock(ptl);
+	return page;
+}
+
 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			 struct page **pages, struct vm_area_struct **vmas,
 			 unsigned long *position, unsigned long *nr_pages,
@@ -6966,13 +7016,6 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h)
  * These functions are overwritable if your architecture needs its own
  * behavior.
  */
-struct page * __weak
-follow_huge_addr(struct mm_struct *mm, unsigned long address,
-			      int write)
-{
-	return ERR_PTR(-EINVAL);
-}
-
 struct page * __weak
 follow_huge_pd(struct vm_area_struct *vma,
 	       unsigned long address, hugepd_t hpd, int flags, int pdshift)
@@ -6981,108 +7024,6 @@ follow_huge_pd(struct vm_area_struct *vma,
 	return NULL;
 }
 
-struct page * __weak
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
-		pmd_t *pmd, int flags)
-{
-	struct page *page = NULL;
-	spinlock_t *ptl;
-	pte_t pte;
-
-	/*
-	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
-	 * follow_hugetlb_page().
-	 */
-	if (WARN_ON_ONCE(flags & FOLL_PIN))
-		return NULL;
-
-retry:
-	ptl = pmd_lockptr(mm, pmd);
-	spin_lock(ptl);
-	/*
-	 * make sure that the address range covered by this pmd is not
-	 * unmapped from other threads.
-	 */
-	if (!pmd_huge(*pmd))
-		goto out;
-	pte = huge_ptep_get((pte_t *)pmd);
-	if (pte_present(pte)) {
-		page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
-		/*
-		 * try_grab_page() should always succeed here, because: a) we
-		 * hold the pmd (ptl) lock, and b) we've just checked that the
-		 * huge pmd (head) page is present in the page tables. The ptl
-		 * prevents the head page and tail pages from being rearranged
-		 * in any way. So this page must be available at this point,
-		 * unless the page refcount overflowed:
-		 */
-		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
-			page = NULL;
-			goto out;
-		}
-	} else {
-		if (is_hugetlb_entry_migration(pte)) {
-			spin_unlock(ptl);
-			__migration_entry_wait_huge((pte_t *)pmd, ptl);
-			goto retry;
-		}
-		/*
-		 * hwpoisoned entry is treated as no_page_table in
-		 * follow_page_mask().
-		 */
-	}
-out:
-	spin_unlock(ptl);
-	return page;
-}
-
-struct page * __weak
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
-		pud_t *pud, int flags)
-{
-	struct page *page = NULL;
-	spinlock_t *ptl;
-	pte_t pte;
-
-	if (WARN_ON_ONCE(flags & FOLL_PIN))
-		return NULL;
-
-retry:
-	ptl = huge_pte_lock(hstate_sizelog(PUD_SHIFT), mm, (pte_t *)pud);
-	if (!pud_huge(*pud))
-		goto out;
-	pte = huge_ptep_get((pte_t *)pud);
-	if (pte_present(pte)) {
-		page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
-		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
-			page = NULL;
-			goto out;
-		}
-	} else {
-		if (is_hugetlb_entry_migration(pte)) {
-			spin_unlock(ptl);
-			__migration_entry_wait(mm, (pte_t *)pud, ptl);
-			goto retry;
-		}
-		/*
-		 * hwpoisoned entry is treated as no_page_table in
-		 * follow_page_mask().
-		 */
-	}
-out:
-	spin_unlock(ptl);
-	return page;
-}
-
-struct page * __weak
-follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
-{
-	if (flags & (FOLL_GET | FOLL_PIN))
-		return NULL;
-
-	return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
-}
-
 int isolate_hugetlb(struct page *page, struct list_head *list)
 {
 	int ret = 0;
Baolin Wang Aug. 25, 2022, 1:43 a.m. UTC | #15
On 8/25/2022 7:34 AM, Mike Kravetz wrote:
> On 08/24/22 17:41, Baolin Wang wrote:
>>
>>
>> On 8/24/2022 3:31 PM, David Hildenbrand wrote:
>>>>>>>
>>>>>>> IMHO, these follow_huge_xxx() functions are arch-specified at first and
>>>>>>> were moved into the common hugetlb.c by commit 9e5fc74c3025 ("mm:
>>>>>>> hugetlb: Copy general hugetlb code from x86 to mm"), and now there are
>>>>>>> still some arch-specified follow_huge_xxx() definition, for example:
>>>>>>> ia64: follow_huge_addr
>>>>>>> powerpc: follow_huge_pd
>>>>>>> s390: follow_huge_pud
>>>>>>>
>>>>>>> What I mean is that follow_hugetlb_page() is a common and
>>>>>>> not-arch-specified function, is it suitable to change it to be
>>>>>>> arch-specified?
>>>>>>> And thinking more, can we rename follow_hugetlb_page() as
>>>>>>> hugetlb_page_faultin() and simplify it to only handle the page faults of
>>>>>>> hugetlb like the faultin_page() for normal page? That means we can make
>>>>>>> sure only follow_page_mask() can handle hugetlb.
>>>>>>>
>>>>>
>>>>> Something like that might work, but you still have two page table walkers
>>>>> for hugetlb.  I like David's idea (if I understand it correctly) of
>>>>
>>>> What I mean is we may change the hugetlb handling like normal page:
>>>> 1) use follow_page_mask() to look up a hugetlb firstly.
>>>> 2) if can not get the hugetlb, then try to page fault by
>>>> hugetlb_page_faultin().
>>>> 3) if page fault successed, then retry to find hugetlb by
>>>> follow_page_mask().
>>>
>>> That implies putting more hugetlbfs special code into generic GUP,
>>> turning it even more complicated. But of course, it depends on how the
>>> end result looks like. My gut feeling was that hugetlb is better handled
>>> in follow_hugetlb_page() separately (just like we do with a lot of other
>>> page table walkers).
>>
>> OK, fair enough.
>>
>>>>
>>>> Just a rough thought, and I need more investigation for my idea and
>>>> David's idea.
>>>>
>>>>> using follow_hugetlb_page for both cases.  As noted, it will need to be
>>>>> taught how to not trigger faults in the follow_page_mask case.
>>>>
>>>> Anyway, I also agree we need some cleanup, and firstly I think we should
>>>> cleanup these arch-specified follow_huge_xxx() on some architectures
>>>> which are similar with the common ones. I will look into these.
>>>
>>> There was a recent discussion on that, e.g.:
>>>
>>> https://lkml.kernel.org/r/20220818135717.609eef8a@thinkpad
>>
>> Thanks.
>>
>>>
>>>>
>>>> However, considering cleanup may need more investigation and
>>>> refactoring, now I prefer to make these bug-fix patches of this patchset
>>>> into mainline firstly, which are suitable to backport to old version to
>>>> fix potential race issues. Mike and David, how do you think? Could you
>>>> help to review these patches? Thanks.
>>>
>>> Patch #1 certainly add more special code just to handle another hugetlb
>>> corner case (CONT pages), and maybe just making it all use
>>> follow_hugetlb_page() would be even cleaner and less error prone.
>>>
>>> I agree that locking is shaky, but I'm not sure if we really want to
>>> backport this to stable trees:
>>>
>>> https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html
>>>
>>> "It must fix a real bug that bothers people (not a, “This could be a
>>> problem...” type thing)."
>>>
>>>
>>> Do we actually have any instance of this being a real (and not a
>>> theoretical) problem? If not, I'd rather clean it all up right away.
>>
>> I think this is a real problem (not theoretical), and easy to write some
>> code to show the issue. For example, suppose thread A is trying to look up a
>> CONT-PTE size hugetlb page under the lock, however antoher thread B can
>> migrate the CONT-PTE hugetlb page at the same time, which will cause thread
>> A to get an incorrect page, if thread A want to do something for this
>> incorrect page, error occurs.
> 
> Is the primary concern the locking?  If so, I am not sure we have an issue.

Yes.

> As mentioned in your commit message, current code will use
> pte_offset_map_lock().  pte_offset_map_lock uses pte_lockptr, and pte_lockptr
> will either be the mm wide lock or pmd_page lock.  To me, it seems that

ALLOC_SPLIT_PTLOCKS is always true on my machine, which means 
pte_lockptr() will always use the per-PTE-page split lock, whereas 
huge_pte_lock() will use the mm-wide lock (mm->page_table_lock) for the 
CONT-PTE size hstate.
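
For reference, here is a condensed sketch of the two lock-selection
helpers as I read them (simplified from include/linux/mm.h and
include/linux/hugetlb.h, details trimmed):

/* Split PTE locks enabled: the lock lives in the PTE page. */
static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
	return ptlock_ptr(pmd_page(*pmd));
}

static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
					   struct mm_struct *mm, pte_t *pte)
{
	if (huge_page_size(h) == PMD_SIZE)
		return pmd_lockptr(mm, (pmd_t *)pte);

	VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
	return &mm->page_table_lock;	/* CONT-PTE sizes end up here */
}

So for a CONT-PTE hugetlb entry, pte_offset_map_lock() in
follow_page_pte() and huge_pte_lock() can take two different locks.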

> either would provide correct synchronization for CONT-PTE entries.  Am I
> missing something or misreading the code?
> 
> I started looking at code cleanup suggested by David.  Here is a quick
> patch (not tested and likely containing errors) to see if this is a step
> in the right direction.
> 
> I like it because we get rid of/combine all those follow_huge_p*d
> routines.

Great, this looks straightforward to me (some nits below).
David, what do you think?

> 
>  From 35d117a707c1567ddf350554298697d40eace0d7 Mon Sep 17 00:00:00 2001
> From: Mike Kravetz <mike.kravetz@oracle.com>
> Date: Wed, 24 Aug 2022 15:59:15 -0700
> Subject: [PATCH] hugetlb: call hugetlb_follow_page_mask for hugetlb pages in
>   follow_page_mask
> 
> At the beginning of follow_page_mask, there currently is a call to
> follow_huge_addr which 'may' handle hugetlb pages.  ia64 is the only
> architecture which (incorrectly) provides a follow_huge_addr routine
> that does not return error.  Instead, at each level of the page table a
> check is made for a hugetlb entry.  If a hugetlb entry is found, a call
> to a routine associated with that page table level such as
> follow_huge_pmd is made.
> 
> All the follow_huge_p*d routines are basically the same.  In addition
> huge page size can be derived from the vma, so we know where in the page
> table a huge page would reside.  So, replace follow_huge_addr with a
> new architecture independent routine which will provide the same
> functionality as the follow_huge_p*d routines.  We can then eliminate
> the p*d_huge checks in follow_page_mask page table walking as well as
> the follow_huge_p*d routines themselves.
> 
> follow_page_mask still has is_hugepd hugetlb checks during page table
> walking.  This is due to these checks and follow_huge_pd being
> architecture specific.  These can be eliminated if
> hugetlb_follow_page_mask can be overwritten by architectures (powerpc)
> that need to do follow_huge_pd processing.
> 
> Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
> ---
>   arch/ia64/mm/hugetlbpage.c |  15 ----
>   arch/s390/mm/hugetlbpage.c |  10 ---
>   include/linux/hugetlb.h    |  41 +++-------
>   mm/gup.c                   |  27 +------
>   mm/hugetlb.c               | 159 ++++++++++++-------------------------
>   5 files changed, 62 insertions(+), 190 deletions(-)
> 
> diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
> index f993cb36c062..380d2f3966c9 100644
> --- a/arch/ia64/mm/hugetlbpage.c
> +++ b/arch/ia64/mm/hugetlbpage.c
> @@ -91,21 +91,6 @@ int prepare_hugepage_range(struct file *file,
>   	return 0;
>   }
>   
> -struct page *follow_huge_addr(struct mm_struct *mm, unsigned long addr, int write)
> -{
> -	struct page *page;
> -	pte_t *ptep;
> -
> -	if (REGION_NUMBER(addr) != RGN_HPAGE)
> -		return ERR_PTR(-EINVAL);
> -
> -	ptep = huge_pte_offset(mm, addr, HPAGE_SIZE);
> -	if (!ptep || pte_none(*ptep))
> -		return NULL;
> -	page = pte_page(*ptep);
> -	page += ((addr & ~HPAGE_MASK) >> PAGE_SHIFT);
> -	return page;
> -}
>   int pmd_huge(pmd_t pmd)
>   {
>   	return 0;
> diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
> index 10e51ef9c79a..c299a18273ff 100644
> --- a/arch/s390/mm/hugetlbpage.c
> +++ b/arch/s390/mm/hugetlbpage.c
> @@ -237,16 +237,6 @@ int pud_huge(pud_t pud)
>   	return pud_large(pud);
>   }
>   
> -struct page *
> -follow_huge_pud(struct mm_struct *mm, unsigned long address,
> -		pud_t *pud, int flags)
> -{
> -	if (flags & FOLL_GET)
> -		return NULL;
> -
> -	return pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
> -}
> -
>   bool __init arch_hugetlb_valid_size(unsigned long size)
>   {
>   	if (MACHINE_HAS_EDAT1 && size == PMD_SIZE)
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 3ec981a0d8b3..0c19d200c851 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -142,6 +142,8 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
>   			     unsigned long len);
>   int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *,
>   			    struct vm_area_struct *, struct vm_area_struct *);
> +struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
> +                              unsigned long address, unsigned int flags);
>   long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
>   			 struct page **, struct vm_area_struct **,
>   			 unsigned long *, unsigned long *, long, unsigned int,
> @@ -202,17 +204,9 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
>   				unsigned long addr, pte_t *ptep);
>   void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
>   				unsigned long *start, unsigned long *end);
> -struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
> -			      int write);
>   struct page *follow_huge_pd(struct vm_area_struct *vma,
>   			    unsigned long address, hugepd_t hpd,
>   			    int flags, int pdshift);
> -struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
> -				pmd_t *pmd, int flags);
> -struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
> -				pud_t *pud, int flags);
> -struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address,
> -			     pgd_t *pgd, int flags);
>   
>   int pmd_huge(pmd_t pmd);
>   int pud_huge(pud_t pud);
> @@ -257,6 +251,13 @@ static inline void adjust_range_if_pmd_sharing_possible(
>   {
>   }
>   
> +static inline struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
> +                              unsigned long address, unsigned int flags)
> +{
> +	/* should never happen, but do not want to BUG */
> +	return ERR_PTR(-EINVAL);
> +}
> +
>   static inline long follow_hugetlb_page(struct mm_struct *mm,
>   			struct vm_area_struct *vma, struct page **pages,
>   			struct vm_area_struct **vmas, unsigned long *position,
> @@ -267,12 +268,6 @@ static inline long follow_hugetlb_page(struct mm_struct *mm,
>   	return 0;
>   }
>   
> -static inline struct page *follow_huge_addr(struct mm_struct *mm,
> -					unsigned long address, int write)
> -{
> -	return ERR_PTR(-EINVAL);
> -}
> -
>   static inline int copy_hugetlb_page_range(struct mm_struct *dst,
>   					  struct mm_struct *src,
>   					  struct vm_area_struct *dst_vma,
> @@ -312,24 +307,6 @@ static inline struct page *follow_huge_pd(struct vm_area_struct *vma,
>   	return NULL;
>   }
>   
> -static inline struct page *follow_huge_pmd(struct mm_struct *mm,
> -				unsigned long address, pmd_t *pmd, int flags)
> -{
> -	return NULL;
> -}
> -
> -static inline struct page *follow_huge_pud(struct mm_struct *mm,
> -				unsigned long address, pud_t *pud, int flags)
> -{
> -	return NULL;
> -}
> -
> -static inline struct page *follow_huge_pgd(struct mm_struct *mm,
> -				unsigned long address, pgd_t *pgd, int flags)
> -{
> -	return NULL;
> -}
> -
>   static inline int prepare_hugepage_range(struct file *file,
>   				unsigned long addr, unsigned long len)
>   {
> diff --git a/mm/gup.c b/mm/gup.c
> index 3b656b7e8a3c..a93c04437faa 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -661,12 +661,6 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
>   	pmdval = READ_ONCE(*pmd);
>   	if (pmd_none(pmdval))
>   		return no_page_table(vma, flags);
> -	if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
> -		page = follow_huge_pmd(mm, address, pmd, flags);
> -		if (page)
> -			return page;
> -		return no_page_table(vma, flags);
> -	}
>   	if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
>   		page = follow_huge_pd(vma, address,
>   				      __hugepd(pmd_val(pmdval)), flags,
> @@ -764,12 +758,6 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
>   	pud = pud_offset(p4dp, address);
>   	if (pud_none(*pud))
>   		return no_page_table(vma, flags);
> -	if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
> -		page = follow_huge_pud(mm, address, pud, flags);
> -		if (page)
> -			return page;
> -		return no_page_table(vma, flags);
> -	}
>   	if (is_hugepd(__hugepd(pud_val(*pud)))) {
>   		page = follow_huge_pd(vma, address,
>   				      __hugepd(pud_val(*pud)), flags,
> @@ -851,24 +839,15 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
>   
>   	ctx->page_mask = 0;
>   
> -	/* make this handle hugepd */
> -	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
> -	if (!IS_ERR(page)) {
> -		WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
> -		return page;
> -	}
> +	/* hugetlb is special */
> +	if (is_vm_hugetlb_page(vma))
> +		return hugetlb_follow_page_mask(vma, address, flags);
>   
>   	pgd = pgd_offset(mm, address);
>   
>   	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
>   		return no_page_table(vma, flags);
>   
> -	if (pgd_huge(*pgd)) {
> -		page = follow_huge_pgd(mm, address, pgd, flags);
> -		if (page)
> -			return page;
> -		return no_page_table(vma, flags);
> -	}
>   	if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
>   		page = follow_huge_pd(vma, address,
>   				      __hugepd(pgd_val(*pgd)), flags,
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 6c00ba1dde32..947401df8190 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -6168,6 +6168,56 @@ static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
>   	return false;
>   }
>   
> +struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
> +				unsigned long address, unsigned int flags)
> +{
> +	struct hstate *h = hstate_vma(vma);
> +	struct mm_struct *mm = vma->vm_mm;
> +	unsigned long haddr = address & huge_page_mask(h);
> +	struct page *page = NULL;
> +	spinlock_t *ptl;
> +	pte_t *pte, entry;
> +
> +	/*
> +	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
> +	 * follow_hugetlb_page().
> +	 */
> +	if (WARN_ON_ONCE(flags & FOLL_PIN))
> +		return NULL;
> +
> +	pte = huge_pte_offset(mm, haddr, huge_page_size(h));
> +	if (!pte)
> +		return NULL;
> +
> +retry:
> +	ptl = huge_pte_lock(h, mm, pte);
> +	entry = huge_ptep_get(pte);
> +	if (pte_present(entry)) {
> +		page = pte_page(entry);

Shouldn't this follow the previous logic and add the offset within the 
huge page?
page = pte_page(entry) + ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);

> +		/*
> +		 * try_grab_page() should always succeed here, because we hold
> +		 * the ptl lock and have verified pte_present().
> +		 */
> +		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
> +			page = NULL;
> +			goto out;
> +		}
> +	} else {

Shouldn't we add a FOLL_MIGRATION check before waiting on the migration 
entry?
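
Something roughly like the below is what I have in mind (untested sketch,
modeled on the existing FOLL_MIGRATION check in follow_page_pte()):

	} else {
		if (is_hugetlb_entry_migration(entry)) {
			/* only wait for migration when the caller asked for it */
			if (!(flags & FOLL_MIGRATION))
				goto out;
			spin_unlock(ptl);
			__migration_entry_wait_huge(pte, ptl);
			goto retry;
		}
		/*
		 * hwpoisoned entry is treated as no_page_table in
		 * follow_page_mask().
		 */
	}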

> +		if (is_hugetlb_entry_migration(entry)) {
> +			spin_unlock(ptl);
> +			__migration_entry_wait_huge(pte, ptl);
> +			goto retry;
> +		}
> +		/*
> +		 * hwpoisoned entry is treated as no_page_table in
> +		 * follow_page_mask().
> +		 */
> +	}
> +out:
> +	spin_unlock(ptl);
> +	return page;
> +}
> +
>   long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
>   			 struct page **pages, struct vm_area_struct **vmas,
>   			 unsigned long *position, unsigned long *nr_pages,
> @@ -6966,13 +7016,6 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h)
>    * These functions are overwritable if your architecture needs its own
>    * behavior.
>    */
> -struct page * __weak
> -follow_huge_addr(struct mm_struct *mm, unsigned long address,
> -			      int write)
> -{
> -	return ERR_PTR(-EINVAL);
> -}
> -
>   struct page * __weak
>   follow_huge_pd(struct vm_area_struct *vma,
>   	       unsigned long address, hugepd_t hpd, int flags, int pdshift)
> @@ -6981,108 +7024,6 @@ follow_huge_pd(struct vm_area_struct *vma,
>   	return NULL;
>   }
>   
> -struct page * __weak
> -follow_huge_pmd(struct mm_struct *mm, unsigned long address,
> -		pmd_t *pmd, int flags)
> -{
> -	struct page *page = NULL;
> -	spinlock_t *ptl;
> -	pte_t pte;
> -
> -	/*
> -	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
> -	 * follow_hugetlb_page().
> -	 */
> -	if (WARN_ON_ONCE(flags & FOLL_PIN))
> -		return NULL;
> -
> -retry:
> -	ptl = pmd_lockptr(mm, pmd);
> -	spin_lock(ptl);
> -	/*
> -	 * make sure that the address range covered by this pmd is not
> -	 * unmapped from other threads.
> -	 */
> -	if (!pmd_huge(*pmd))
> -		goto out;
> -	pte = huge_ptep_get((pte_t *)pmd);
> -	if (pte_present(pte)) {
> -		page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
> -		/*
> -		 * try_grab_page() should always succeed here, because: a) we
> -		 * hold the pmd (ptl) lock, and b) we've just checked that the
> -		 * huge pmd (head) page is present in the page tables. The ptl
> -		 * prevents the head page and tail pages from being rearranged
> -		 * in any way. So this page must be available at this point,
> -		 * unless the page refcount overflowed:
> -		 */
> -		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
> -			page = NULL;
> -			goto out;
> -		}
> -	} else {
> -		if (is_hugetlb_entry_migration(pte)) {
> -			spin_unlock(ptl);
> -			__migration_entry_wait_huge((pte_t *)pmd, ptl);
> -			goto retry;
> -		}
> -		/*
> -		 * hwpoisoned entry is treated as no_page_table in
> -		 * follow_page_mask().
> -		 */
> -	}
> -out:
> -	spin_unlock(ptl);
> -	return page;
> -}
> -
> -struct page * __weak
> -follow_huge_pud(struct mm_struct *mm, unsigned long address,
> -		pud_t *pud, int flags)
> -{
> -	struct page *page = NULL;
> -	spinlock_t *ptl;
> -	pte_t pte;
> -
> -	if (WARN_ON_ONCE(flags & FOLL_PIN))
> -		return NULL;
> -
> -retry:
> -	ptl = huge_pte_lock(hstate_sizelog(PUD_SHIFT), mm, (pte_t *)pud);
> -	if (!pud_huge(*pud))
> -		goto out;
> -	pte = huge_ptep_get((pte_t *)pud);
> -	if (pte_present(pte)) {
> -		page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
> -		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
> -			page = NULL;
> -			goto out;
> -		}
> -	} else {
> -		if (is_hugetlb_entry_migration(pte)) {
> -			spin_unlock(ptl);
> -			__migration_entry_wait(mm, (pte_t *)pud, ptl);
> -			goto retry;
> -		}
> -		/*
> -		 * hwpoisoned entry is treated as no_page_table in
> -		 * follow_page_mask().
> -		 */
> -	}
> -out:
> -	spin_unlock(ptl);
> -	return page;
> -}
> -
> -struct page * __weak
> -follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
> -{
> -	if (flags & (FOLL_GET | FOLL_PIN))
> -		return NULL;
> -
> -	return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
> -}
> -
>   int isolate_hugetlb(struct page *page, struct list_head *list)
>   {
>   	int ret = 0;
David Hildenbrand Aug. 25, 2022, 7:10 a.m. UTC | #16
>> +		/*
>> +		 * try_grab_page() should always succeed here, because we hold
>> +		 * the ptl lock and have verified pte_present().
>> +		 */
>> +		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
>> +			page = NULL;
>> +			goto out;
>> +		}
>> +	} else {
> 
> Should add FOLL_MIGRATION validation before waiting a migration entry.

We really only need FOLL_MIGRATION for KSM. As hugetlb pages cannot be
KSM pages, we don't need this.

Actually, I do have patches in the works that rip out FOLL_MIGRATION
completely by adjusting KSM code.

So let's try to not add dead code (although it would make sense for
feature completeness as is -- but then, FOLL_MIGRATION really needs to go).
David Hildenbrand Aug. 25, 2022, 7:25 a.m. UTC | #17
> Is the primary concern the locking?  If so, I am not sure we have an issue.
> As mentioned in your commit message, current code will use
> pte_offset_map_lock().  pte_offset_map_lock uses pte_lockptr, and pte_lockptr
> will either be the mm wide lock or pmd_page lock.  To me, it seems that
> either would provide correct synchronization for CONT-PTE entries.  Am I
> missing something or misreading the code?
> 
> I started looking at code cleanup suggested by David.  Here is a quick
> patch (not tested and likely containing errors) to see if this is a step
> in the right direction.
> 
> I like it because we get rid of/combine all those follow_huge_p*d
> routines.
> 

Yes, see comments below.

> From 35d117a707c1567ddf350554298697d40eace0d7 Mon Sep 17 00:00:00 2001
> From: Mike Kravetz <mike.kravetz@oracle.com>
> Date: Wed, 24 Aug 2022 15:59:15 -0700
> Subject: [PATCH] hugetlb: call hugetlb_follow_page_mask for hugetlb pages in
>  follow_page_mask
> 
> At the beginning of follow_page_mask, there currently is a call to
> follow_huge_addr which 'may' handle hugetlb pages.  ia64 is the only
> architecture which (incorrectly) provides a follow_huge_addr routine
> that does not return error.  Instead, at each level of the page table a
> check is made for a hugetlb entry.  If a hugetlb entry is found, a call
> to a routine associated with that page table level such as
> follow_huge_pmd is made.
> 
> All the follow_huge_p*d routines are basically the same.  In addition
> huge page size can be derived from the vma, so we know where in the page
> table a huge page would reside.  So, replace follow_huge_addr with a
> new architecture independent routine which will provide the same
> functionality as the follow_huge_p*d routines.  We can then eliminate
> the p*d_huge checks in follow_page_mask page table walking as well as
> the follow_huge_p*d routines themselves.
>
> follow_page_mask still has is_hugepd hugetlb checks during page table
> walking.  This is due to these checks and follow_huge_pd being
> architecture specific.  These can be eliminated if
> hugetlb_follow_page_mask can be overwritten by architectures (powerpc)
> that need to do follow_huge_pd processing.

But won't the

> +	/* hugetlb is special */
> +	if (is_vm_hugetlb_page(vma))
> +		return hugetlb_follow_page_mask(vma, address, flags);

code route everything via hugetlb_follow_page_mask() and all these
(beloved) hugepd checks would essentially be unreachable?

At least my understanding is that hugepd only applies to hugetlb.

Can't we move the hugepd handling code into hugetlb_follow_page_mask()
as well?

I mean, doesn't follow_hugetlb_page() also have to handle that hugepd
stuff already ... ?

[...]

>  
> +struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
> +				unsigned long address, unsigned int flags)
> +{
> +	struct hstate *h = hstate_vma(vma);
> +	struct mm_struct *mm = vma->vm_mm;
> +	unsigned long haddr = address & huge_page_mask(h);
> +	struct page *page = NULL;
> +	spinlock_t *ptl;
> +	pte_t *pte, entry;
> +
> +	/*
> +	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
> +	 * follow_hugetlb_page().
> +	 */
> +	if (WARN_ON_ONCE(flags & FOLL_PIN))
> +		return NULL;
> +
> +	pte = huge_pte_offset(mm, haddr, huge_page_size(h));
> +	if (!pte)
> +		return NULL;
> +
> +retry:
> +	ptl = huge_pte_lock(h, mm, pte);
> +	entry = huge_ptep_get(pte);
> +	if (pte_present(entry)) {
> +		page = pte_page(entry);
> +		/*
> +		 * try_grab_page() should always succeed here, because we hold
> +		 * the ptl lock and have verified pte_present().
> +		 */
> +		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
> +			page = NULL;
> +			goto out;
> +		}
> +	} else {
> +		if (is_hugetlb_entry_migration(entry)) {
> +			spin_unlock(ptl);
> +			__migration_entry_wait_huge(pte, ptl);
> +			goto retry;
> +		}
> +		/*
> +		 * hwpoisoned entry is treated as no_page_table in
> +		 * follow_page_mask().
> +		 */
> +	}
> +out:
> +	spin_unlock(ptl);
> +	return page;


This is neat and clean enough to not reuse follow_hugetlb_page(). I
wonder if we want to add a comment to the function explaining how this differs
from follow_hugetlb_page().

... or do we maybe want to rename follow_hugetlb_page() to something like
__hugetlb_get_user_pages() to make it clearer in which context it will
get called?


I guess it might be feasible in the future to eliminate
follow_hugetlb_page() and centralize the faulting code. For now, this
certainly improves the situation.
Baolin Wang Aug. 25, 2022, 7:58 a.m. UTC | #18
On 8/25/2022 3:10 PM, David Hildenbrand wrote:
> 
>>> +		/*
>>> +		 * try_grab_page() should always succeed here, because we hold
>>> +		 * the ptl lock and have verified pte_present().
>>> +		 */
>>> +		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
>>> +			page = NULL;
>>> +			goto out;
>>> +		}
>>> +	} else {
>>
>> Should add FOLL_MIGRATION validation before waiting on a migration entry.
> 
> We really only need FOLL_MIGRATION for KSM. As hugetlb pages cannot be
> KSM pages, we don't need this.
> 
> Actually, I do have patches in the works that rip out FOLL_MIGRATION
> completely by adjusting KSM code.
> 
> So let's try to not add dead code (although it would make sense for
> feature completeness as is -- but then, FOLL_MIGRATION really needs to go).

Makes sense. Thanks for your explanation.
Baolin Wang Aug. 25, 2022, 10:54 a.m. UTC | #19
On 8/25/2022 3:25 PM, David Hildenbrand wrote:
>> Is the primary concern the locking?  If so, I am not sure we have an issue.
>> As mentioned in your commit message, current code will use
>> pte_offset_map_lock().  pte_offset_map_lock uses pte_lockptr, and pte_lockptr
>> will either be the mm wide lock or pmd_page lock.  To me, it seems that
>> either would provide correct synchronization for CONT-PTE entries.  Am I
>> missing something or misreading the code?
>>
>> I started looking at code cleanup suggested by David.  Here is a quick
>> patch (not tested and likely containing errors) to see if this is a step
>> in the right direction.
>>
>> I like it because we get rid of/combine all those follow_huge_p*d
>> routines.
>>
> 
> Yes, see comments below.
> 
>>  From 35d117a707c1567ddf350554298697d40eace0d7 Mon Sep 17 00:00:00 2001
>> From: Mike Kravetz <mike.kravetz@oracle.com>
>> Date: Wed, 24 Aug 2022 15:59:15 -0700
>> Subject: [PATCH] hugetlb: call hugetlb_follow_page_mask for hugetlb pages in
>>   follow_page_mask
>>
>> At the beginning of follow_page_mask, there currently is a call to
>> follow_huge_addr which 'may' handle hugetlb pages.  ia64 is the only
>> architecture which (incorrectly) provides a follow_huge_addr routine
>> that does not return error.  Instead, at each level of the page table a
>> check is made for a hugetlb entry.  If a hugetlb entry is found, a call
>> to a routine associated with that page table level such as
>> follow_huge_pmd is made.
>>
>> All the follow_huge_p*d routines are basically the same.  In addition
>> huge page size can be derived from the vma, so we know where in the page
>> table a huge page would reside.  So, replace follow_huge_addr with a
>> new architecture independent routine which will provide the same
>> functionality as the follow_huge_p*d routines.  We can then eliminate
>> the p*d_huge checks in follow_page_mask page table walking as well as
>> the follow_huge_p*d routines themselves.
>>
>> follow_page_mask still has is_hugepd hugetlb checks during page table
>> walking.  This is due to these checks and follow_huge_pd being
>> architecture specific.  These can be eliminated if
>> hugetlb_follow_page_mask can be overwritten by architectures (powerpc)
>> that need to do follow_huge_pd processing.
> 
> But won't the
> 
>> +	/* hugetlb is special */
>> +	if (is_vm_hugetlb_page(vma))
>> +		return hugetlb_follow_page_mask(vma, address, flags);
> 
> code route everything via hugetlb_follow_page_mask() and all these
> (beloved) hugepd checks would essentially be unreachable?
> 
> At least my understanding is that hugepd only applies to hugetlb.
> 
> Can't we move the hugepd handling code into hugetlb_follow_page_mask()
> as well?
> 
> I mean, doesn't follow_hugetlb_page() also have to handle that hugepd
> stuff already ... ?

Yes, I also thought about this, and I did a simple patch (untested)
based on Mike's patch to make it cleaner.

diff --git a/mm/gup.c b/mm/gup.c
index d3239ea63159..1003c03dcf78 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -626,14 +626,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
         pmdval = READ_ONCE(*pmd);
         if (pmd_none(pmdval))
                 return no_page_table(vma, flags);
-       if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
-               page = follow_huge_pd(vma, address,
-                                     __hugepd(pmd_val(pmdval)), flags,
-                                     PMD_SHIFT);
-               if (page)
-                       return page;
-               return no_page_table(vma, flags);
-       }
+
  retry:
         if (!pmd_present(pmdval)) {
                 /*
@@ -723,14 +716,6 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma,
         pud = pud_offset(p4dp, address);
         if (pud_none(*pud))
                 return no_page_table(vma, flags);
-       if (is_hugepd(__hugepd(pud_val(*pud)))) {
-               page = follow_huge_pd(vma, address,
-                                     __hugepd(pud_val(*pud)), flags,
-                                     PUD_SHIFT);
-               if (page)
-                       return page;
-               return no_page_table(vma, flags);
-       }
         if (pud_devmap(*pud)) {
                 ptl = pud_lock(mm, pud);
                page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
@@ -759,14 +744,6 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma,
         if (unlikely(p4d_bad(*p4d)))
                 return no_page_table(vma, flags);

-       if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
-               page = follow_huge_pd(vma, address,
-                                     __hugepd(p4d_val(*p4d)), flags,
-                                     P4D_SHIFT);
-               if (page)
-                       return page;
-               return no_page_table(vma, flags);
-       }
         return follow_pud_mask(vma, address, p4d, flags, ctx);
  }

@@ -813,15 +790,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
         if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
                 return no_page_table(vma, flags);

-       if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
-               page = follow_huge_pd(vma, address,
-                                     __hugepd(pgd_val(*pgd)), flags,
-                                     PGDIR_SHIFT);
-               if (page)
-                       return page;
-               return no_page_table(vma, flags);
-       }
-
         return follow_p4d_mask(vma, address, pgd, flags, ctx);
  }

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2c107e7ebd66..848b4fb7a05d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6118,6 +6118,81 @@ static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte,
         return false;
  }

+static struct page *hugetlb_follow_hugepd(struct vm_area_struct *vma,
+                                         unsigned long address,
+                                         unsigned int flags)
+{
+       struct mm_struct *mm = vma->vm_mm;
+       struct page *page;
+       pgd_t *pgd;
+       p4d_t *p4d;
+       pud_t *pud;
+       pmd_t *pmd;
+
+       pgd = pgd_offset(mm, address);
+       if (pgd_none(*pgd) || pgd_bad(*pgd))
+               return ERR_PTR(-EFAULT);
+
+       if (pgd_huge(*pgd))
+               return NULL;
+
+       if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
+               page = follow_huge_pd(vma, address,
+                                     __hugepd(pgd_val(*pgd)), flags,
+                                     PGDIR_SHIFT);
+               if (page)
+                       return page;
+               return ERR_PTR(-EFAULT);
+       }
+
+       p4d = p4d_offset(pgd, address);
+       if (p4d_none(*p4d) || p4d_bad(*p4d))
+               return ERR_PTR(-EFAULT);
+
+       if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
+               page = follow_huge_pd(vma, address,
+                                     __hugepd(p4d_val(*p4d)), flags,
+                                     P4D_SHIFT);
+               if (page)
+                       return page;
+               return ERR_PTR(-EFAULT);
+       }
+
+       pud = pud_offset(p4d, address);
+       if (pud_none(*pud) || pud_bad(*pud))
+               return ERR_PTR(-EFAULT);
+
+       if (pud_huge(*pud))
+               return NULL;
+
+       if (is_hugepd(__hugepd(pud_val(*pud)))) {
+               page = follow_huge_pd(vma, address,
+                                     __hugepd(pud_val(*pud)), flags,
+                                     PUD_SHIFT);
+               if (page)
+                       return page;
+               return ERR_PTR(-EFAULT);
+       }
+
+       pmd = pmd_offset(pud, address);
+       if (pmd_none(*pmd) || pmd_bad(*pmd))
+               return ERR_PTR(-EFAULT);
+
+       if (pmd_huge(*pmd))
+               return NULL;
+
+       if (is_hugepd(__hugepd(pmd_val(*pmd)))) {
+               page = follow_huge_pd(vma, address,
+                                     __hugepd(pmd_val(*pmd)), flags,
+                                     PMD_SHIFT);
+               if (page)
+                       return page;
+               return ERR_PTR(-EFAULT);
+       }
+
+       return NULL;
+}
+
  struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
                                 unsigned long address, unsigned int flags)
  {
@@ -6135,6 +6210,10 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
         if (WARN_ON_ONCE(flags & FOLL_PIN))
                 return NULL;

+       page = hugetlb_follow_hugepd(vma, address, flags);
+       if (page)
+               return page;
+
         pte = huge_pte_offset(mm, haddr, huge_page_size(h));
         if (!pte)
                 return NULL;


>>   
>> +struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
>> +				unsigned long address, unsigned int flags)
>> +{
>> +	struct hstate *h = hstate_vma(vma);
>> +	struct mm_struct *mm = vma->vm_mm;
>> +	unsigned long haddr = address & huge_page_mask(h);
>> +	struct page *page = NULL;
>> +	spinlock_t *ptl;
>> +	pte_t *pte, entry;
>> +
>> +	/*
>> +	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
>> +	 * follow_hugetlb_page().
>> +	 */
>> +	if (WARN_ON_ONCE(flags & FOLL_PIN))
>> +		return NULL;
>> +
>> +	pte = huge_pte_offset(mm, haddr, huge_page_size(h));
>> +	if (!pte)
>> +		return NULL;
>> +
>> +retry:
>> +	ptl = huge_pte_lock(h, mm, pte);
>> +	entry = huge_ptep_get(pte);
>> +	if (pte_present(entry)) {
>> +		page = pte_page(entry);
>> +		/*
>> +		 * try_grab_page() should always succeed here, because we hold
>> +		 * the ptl lock and have verified pte_present().
>> +		 */
>> +		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
>> +			page = NULL;
>> +			goto out;
>> +		}
>> +	} else {
>> +		if (is_hugetlb_entry_migration(entry)) {
>> +			spin_unlock(ptl);
>> +			__migration_entry_wait_huge(pte, ptl);
>> +			goto retry;
>> +		}
>> +		/*
>> +		 * hwpoisoned entry is treated as no_page_table in
>> +		 * follow_page_mask().
>> +		 */
>> +	}
>> +out:
>> +	spin_unlock(ptl);
>> +	return page;
> 
> 
> This is neat and clean enough to not reuse follow_hugetlb_page(). I
> wonder if we want to add a comment to the function explaining how this differs
> from follow_hugetlb_page().
> 
> ... or do we maybe want to rename follow_hugetlb_page() to something like
> __hugetlb_get_user_pages() to make it clearer in which context it will
> get called?

Sounds reasonable to me.

> I guess it might be feasible in the future to eliminate
> follow_hugetlb_page() and centralize the faulting code. For now, this
> certainly improves the situation.
>
Mike Kravetz Aug. 25, 2022, 6:30 p.m. UTC | #20
On 08/25/22 09:43, Baolin Wang wrote:
> 
> 
> On 8/25/2022 7:34 AM, Mike Kravetz wrote:
> > On 08/24/22 17:41, Baolin Wang wrote:
> > > 
> > > 
> > > On 8/24/2022 3:31 PM, David Hildenbrand wrote:
> > > > > > > > 
> > > > > > > > IMHO, these follow_huge_xxx() functions were arch-specific at first and
> > > > > > > > were moved into the common hugetlb.c by commit 9e5fc74c3025 ("mm:
> > > > > > > > hugetlb: Copy general hugetlb code from x86 to mm"), and now there are
> > > > > > > > still some arch-specific follow_huge_xxx() definitions, for example:
> > > > > > > > ia64: follow_huge_addr
> > > > > > > > powerpc: follow_huge_pd
> > > > > > > > s390: follow_huge_pud
> > > > > > > > 
> > > > > > > > What I mean is that follow_hugetlb_page() is a common and
> > > > > > > > not-arch-specified function, is it suitable to change it to be
> > > > > > > > arch-specified?
> > > > > > > > And thinking more, can we rename follow_hugetlb_page() as
> > > > > > > > hugetlb_page_faultin() and simplify it to only handle the page faults of
> > > > > > > > hugetlb like the faultin_page() for normal page? That means we can make
> > > > > > > > sure only follow_page_mask() can handle hugetlb.
> > > > > > > > 
> > > > > > 
> > > > > > Something like that might work, but you still have two page table walkers
> > > > > > for hugetlb.  I like David's idea (if I understand it correctly) of
> > > > > 
> > > > > What I mean is we may change the hugetlb handling like normal page:
> > > > > 1) use follow_page_mask() to look up a hugetlb page first.
> > > > > 2) if we cannot get the hugetlb page, then try to page fault via
> > > > > hugetlb_page_faultin().
> > > > > 3) if the page fault succeeded, then retry finding the hugetlb page via
> > > > > follow_page_mask().
> > > > 
> > > > That implies putting more hugetlbfs special code into generic GUP,
> > > > turning it even more complicated. But of course, it depends on how the
> > > > end result looks like. My gut feeling was that hugetlb is better handled
> > > > in follow_hugetlb_page() separately (just like we do with a lot of other
> > > > page table walkers).
> > > 
> > > OK, fair enough.
> > > 
> > > > > 
> > > > > Just a rough thought, and I need more investigation for my idea and
> > > > > David's idea.
> > > > > 
> > > > > > using follow_hugetlb_page for both cases.  As noted, it will need to be
> > > > > > taught how to not trigger faults in the follow_page_mask case.
> > > > > 
> > > > > Anyway, I also agree we need some cleanup, and first I think we should
> > > > > clean up the arch-specific follow_huge_xxx() implementations on those architectures
> > > > > which are similar to the common ones. I will look into these.
> > > > 
> > > > There was a recent discussion on that, e.g.:
> > > > 
> > > > https://lkml.kernel.org/r/20220818135717.609eef8a@thinkpad
> > > 
> > > Thanks.
> > > 
> > > > 
> > > > > 
> > > > > However, considering cleanup may need more investigation and
> > > > > refactoring, now I prefer to make these bug-fix patches of this patchset
> > > > > into mainline firstly, which are suitable to backport to old version to
> > > > > fix potential race issues. Mike and David, what do you think? Could you
> > > > > help to review these patches? Thanks.
> > > > 
> > > > Patch #1 certainly adds more special code just to handle another hugetlb
> > > > corner case (CONT pages), and maybe just making it all use
> > > > follow_hugetlb_page() would be even cleaner and less error prone.
> > > > 
> > > > I agree that locking is shaky, but I'm not sure if we really want to
> > > > backport this to stable trees:
> > > > 
> > > > https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html
> > > > 
> > > > "It must fix a real bug that bothers people (not a, “This could be a
> > > > problem...” type thing)."
> > > > 
> > > > 
> > > > Do we actually have any instance of this being a real (and not a
> > > > theoretical) problem? If not, I'd rather clean it all up right away.
> > > 
> > > I think this is a real problem (not theoretical), and it is easy to write some
> > > code to show the issue. For example, suppose thread A is trying to look up a
> > > CONT-PTE size hugetlb page under the lock, while another thread B can
> > > migrate the CONT-PTE hugetlb page at the same time, which will cause thread
> > > A to get an incorrect page; if thread A wants to do something with this
> > > incorrect page, an error occurs.
> > 
> > Is the primary concern the locking?  If so, I am not sure we have an issue.
> 
> Yes.
> 
> > As mentioned in your commit message, current code will use
> > pte_offset_map_lock().  pte_offset_map_lock uses pte_lockptr, and pte_lockptr
> > will either be the mm wide lock or pmd_page lock.  To me, it seems that
> 
> ALLOC_SPLIT_PTLOCKS is always true on my machine, which means
> pte_lockptr() will always use the PTE page lock, whereas huge_pte_lock()
> will use the mm-wide lock.

Yes, the different calling context/path into the locking code will cause a
different lock to be used.  I thought of this AFTER sending the above.
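
To see the consequence outside of kernel context, here is a rough standalone
userspace analogy (plain pthreads; every name in it is invented for the
illustration, none of this is kernel code): two walkers that take *different*
locks never exclude each other, which is exactly the CONT-PTE situation being
discussed.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

/* stand-ins for the two different locks the two walkers end up taking */
static pthread_mutex_t split_pte_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t page_table_lock = PTHREAD_MUTEX_INITIALIZER;

static atomic_int walkers_inside;

static void *walker(void *arg)
{
	pthread_mutex_t *lock = arg;

	pthread_mutex_lock(lock);
	atomic_fetch_add(&walkers_inside, 1);
	usleep(100 * 1000);	/* linger "under the lock" so the two overlap */
	printf("walkers inside the critical section: %d\n",
	       atomic_load(&walkers_inside));
	atomic_fetch_sub(&walkers_inside, 1);
	pthread_mutex_unlock(lock);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	/* thread A models the follow_page_pte() path, thread B the hugetlb path */
	pthread_create(&a, NULL, walker, &split_pte_lock);
	pthread_create(&b, NULL, walker, &page_table_lock);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;	/* both threads typically report "2": no mutual exclusion */
}

Point both threads at the same mutex and they report "1" again, which is what
taking the one correct lock buys us.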

> 
> > either would provide correct synchronization for CONT-PTE entries.  Am I
> > missing something or misreading the code?
> > 
> > I started looking at code cleanup suggested by David.  Here is a quick
> > patch (not tested and likely containing errors) to see if this is a step
> > in the right direction.
> > 
> > I like it because we get rid of/combine all those follow_huge_p*d
> > routines.
> 
> Great, this looks straightforward to me (some nits as below).
> David, what do you think?
> 

I will continue to refine this based on suggestions from you and David.

> > +struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
> > +				unsigned long address, unsigned int flags)
> > +{
> > +	struct hstate *h = hstate_vma(vma);
> > +	struct mm_struct *mm = vma->vm_mm;
> > +	unsigned long haddr = address & huge_page_mask(h);
> > +	struct page *page = NULL;
> > +	spinlock_t *ptl;
> > +	pte_t *pte, entry;
> > +
> > +	/*
> > +	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
> > +	 * follow_hugetlb_page().
> > +	 */
> > +	if (WARN_ON_ONCE(flags & FOLL_PIN))
> > +		return NULL;
> > +
> > +	pte = huge_pte_offset(mm, haddr, huge_page_size(h));
> > +	if (!pte)
> > +		return NULL;
> > +
> > +retry:
> > +	ptl = huge_pte_lock(h, mm, pte);
> > +	entry = huge_ptep_get(pte);
> > +	if (pte_present(entry)) {
> > +		page = pte_page(entry);
> 
> Should follow previous logic?
> page = pte_page(entry) + ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
> 

Yes, this needs to be PAGE aligned, not HUGETLB_PAGE aligned.
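
For anyone following along, the subpage arithmetic can be checked with a few
lines of standalone C (the constants below assume 4K base pages and a 64K
CONT-PTE hugetlb; they are illustrative only, not taken from the kernel
headers):

#include <stdio.h>

#define PAGE_SHIFT	12UL			/* 4K base pages */
#define CONT_PTE_SIZE	(64UL * 1024)		/* 64K CONT-PTE hugetlb */
#define CONT_PTE_MASK	(~(CONT_PTE_SIZE - 1))

int main(void)
{
	unsigned long address = 0x400009123UL;	/* lands 0x9123 into the hugetlb */

	/*
	 * Mirrors the previous logic:
	 *   pte_page(entry) + ((address & ~huge_page_mask(h)) >> PAGE_SHIFT)
	 * i.e. head page plus the index of the 4K subpage covering 'address'.
	 */
	unsigned long subpage = (address & ~CONT_PTE_MASK) >> PAGE_SHIFT;

	printf("subpage index within the 64K hugetlb: %lu\n", subpage);	/* 9 */
	return 0;
}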
Mike Kravetz Aug. 25, 2022, 9:13 p.m. UTC | #21
On 08/25/22 09:25, David Hildenbrand wrote:
> > Is the primary concern the locking?  If so, I am not sure we have an issue.
> > As mentioned in your commit message, current code will use
> > pte_offset_map_lock().  pte_offset_map_lock uses pte_lockptr, and pte_lockptr
> > will either be the mm wide lock or pmd_page lock.  To me, it seems that
> > either would provide correct synchronization for CONT-PTE entries.  Am I
> > missing something or misreading the code?
> > 
> > I started looking at code cleanup suggested by David.  Here is a quick
> > patch (not tested and likely containing errors) to see if this is a step
> > in the right direction.
> > 
> > I like it because we get rid of/combine all those follow_huge_p*d
> > routines.
> > 
> 
> Yes, see comments below.
> 
> > From 35d117a707c1567ddf350554298697d40eace0d7 Mon Sep 17 00:00:00 2001
> > From: Mike Kravetz <mike.kravetz@oracle.com>
> > Date: Wed, 24 Aug 2022 15:59:15 -0700
> > Subject: [PATCH] hugetlb: call hugetlb_follow_page_mask for hugetlb pages in
> >  follow_page_mask
> > 
> > At the beginning of follow_page_mask, there currently is a call to
> > follow_huge_addr which 'may' handle hugetlb pages.  ia64 is the only
> > architecture which (incorrectly) provides a follow_huge_addr routine
> > that does not return error.  Instead, at each level of the page table a
> > check is made for a hugetlb entry.  If a hugetlb entry is found, a call
> > to a routine associated with that page table level such as
> > follow_huge_pmd is made.
> > 
> > All the follow_huge_p*d routines are basically the same.  In addition
> > huge page size can be derived from the vma, so we know where in the page
> > table a huge page would reside.  So, replace follow_huge_addr with a
> > new architecture independent routine which will provide the same
> > functionality as the follow_huge_p*d routines.  We can then eliminate
> > the p*d_huge checks in follow_page_mask page table walking as well as
> > the follow_huge_p*d routines themselves.
> >
> > follow_page_mask still has is_hugepd hugetlb checks during page table
> > walking.  This is due to these checks and follow_huge_pd being
> > architecture specific.  These can be eliminated if
> > hugetlb_follow_page_mask can be overwritten by architectures (powerpc)
> > that need to do follow_huge_pd processing.
> 
> But won't the
> 
> > +	/* hugetlb is special */
> > +	if (is_vm_hugetlb_page(vma))
> > +		return hugetlb_follow_page_mask(vma, address, flags);
> 
> code route everything via hugetlb_follow_page_mask() and all these
> (beloved) hugepd checks would essentially be unreachable?
> 
> At least my understanding is that hugepd only applies to hugetlb.
> 
> Can't we move the hugepd handling code into hugetlb_follow_page_mask()
> as well?
> 
> I mean, doesn't follow_hugetlb_page() also have to handle that hugepd
> stuff already ... ?
> 
> [...]

I think so, but I got a little confused looking at the hugepd handling code.
Adding Aneesh who added support to follow_page_mask in the series at:
https://lore.kernel.org/linux-mm/1494926612-23928-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com/

I believe you are correct in that follow_hugetlb_page must handle as well.

One source of my confusion is the following in follow_huge_pd:

	/*
	 * hugepage directory entries are protected by mm->page_table_lock
	 * Use this instead of huge_pte_lockptr
	 */
	ptl = &mm->page_table_lock;
	spin_lock(ptl);

Yet, if follow_hugetlb_page handles hugepd then it is using huge_pte_lockptr
to get the lock pointer and is wrong?

Hoping Aneesh can help clear up the confusion.

BTW, I also noticed that the above series added the comment:
	/* make this handle hugepd */
above the call to follow_huge_addr() in follow_page_mask.  Perhaps there
was at one time a plan to have follow_huge_addr handle hugepd?  That
series removed powerpc specific follow_huge_addr routine.
Mike Kravetz Aug. 26, 2022, 10:40 p.m. UTC | #22
On 08/25/22 14:13, Mike Kravetz wrote:
> On 08/25/22 09:25, David Hildenbrand wrote:
> > > Is the primary concern the locking?  If so, I am not sure we have an issue.
> > > As mentioned in your commit message, current code will use
> > > pte_offset_map_lock().  pte_offset_map_lock uses pte_lockptr, and pte_lockptr
> > > will either be the mm wide lock or pmd_page lock.  To me, it seems that
> > > either would provide correct synchronization for CONT-PTE entries.  Am I
> > > missing something or misreading the code?
> > > 
> > > I started looking at code cleanup suggested by David.  Here is a quick
> > > patch (not tested and likely containing errors) to see if this is a step
> > > in the right direction.
> > > 
> > > I like it because we get rid of/combine all those follow_huge_p*d
> > > routines.
> > > 
> > 
> > Yes, see comments below.
> > 
> > > From 35d117a707c1567ddf350554298697d40eace0d7 Mon Sep 17 00:00:00 2001
> > > From: Mike Kravetz <mike.kravetz@oracle.com>
> > > Date: Wed, 24 Aug 2022 15:59:15 -0700
> > > Subject: [PATCH] hugetlb: call hugetlb_follow_page_mask for hugetlb pages in
> > >  follow_page_mask
> > > 
> > > At the beginning of follow_page_mask, there currently is a call to
> > > follow_huge_addr which 'may' handle hugetlb pages.  ia64 is the only
> > > architecture which (incorrectly) provides a follow_huge_addr routine
> > > that does not return error.  Instead, at each level of the page table a
> > > check is made for a hugetlb entry.  If a hugetlb entry is found, a call
> > > to a routine associated with that page table level such as
> > > follow_huge_pmd is made.
> > > 
> > > All the follow_huge_p*d routines are basically the same.  In addition
> > > huge page size can be derived from the vma, so we know where in the page
> > > table a huge page would reside.  So, replace follow_huge_addr with a
> > > new architecture independent routine which will provide the same
> > > functionality as the follow_huge_p*d routines.  We can then eliminate
> > > the p*d_huge checks in follow_page_mask page table walking as well as
> > > the follow_huge_p*d routines themselves.
> > >
> > > follow_page_mask still has is_hugepd hugetlb checks during page table
> > > walking.  This is due to these checks and follow_huge_pd being
> > > architecture specific.  These can be eliminated if
> > > hugetlb_follow_page_mask can be overwritten by architectures (powerpc)
> > > that need to do follow_huge_pd processing.
> > 
> > But won't the
> > 
> > > +	/* hugetlb is special */
> > > +	if (is_vm_hugetlb_page(vma))
> > > +		return hugetlb_follow_page_mask(vma, address, flags);
> > 
> > code route everything via hugetlb_follow_page_mask() and all these
> > (beloved) hugepd checks would essentially be unreachable?
> > 
> > At least my understanding is that hugepd only applies to hugetlb.
> > 
> > Can't we move the hugepd handling code into hugetlb_follow_page_mask()
> > as well?
> > 
> > I mean, doesn't follow_hugetlb_page() also have to handle that hugepd
> > stuff already ... ?
> > 
> > [...]
> 
> I think so, but I got a little confused looking at the hugepd handling code.
> Adding Aneesh who added support to follow_page_mask in the series at:
> https://lore.kernel.org/linux-mm/1494926612-23928-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com/
> 
> I believe you are correct in that follow_hugetlb_page must handle as well.
> 
> One source of my confusion is the following in follow_huge_pd:
> 
> 	/*
> 	 * hugepage directory entries are protected by mm->page_table_lock
> 	 * Use this instead of huge_pte_lockptr
> 	 */
> 	ptl = &mm->page_table_lock;
> 	spin_lock(ptl);
> 
> Yet, if follow_hugetlb_page handles hugepd then it is using huge_pte_lockptr
> to get the lock pointer and is wrong?
> 
> Hoping Aneesh can help clear up the confusion.
> 

Along those same lines, follow_huge_pd looks very much like the
existing follow_huge_p*d routines.  One difference is that follow_huge_pd
gets the huge page size from the page table entry (hugepd_shift).
However, we are called for a specific mapping (vma), and encoded within
the vma is the hugetlb page size.  Can this be different?  My first
thought is no, because the size encoded in the vma is used for page fault
handling.

This seems a bit confusing.  If huge page size for huge_pd entries is
indeed the same as that encoded in the vma, then something like the
proposed hugetlb_follow_page_mask should handle huge_pd entries as well.
Aneesh Kumar K.V Aug. 27, 2022, 1:59 p.m. UTC | #23
Mike Kravetz <mike.kravetz@oracle.com> writes:

>
> On 08/25/22 09:25, David Hildenbrand wrote:
>> > Is the primary concern the locking?  If so, I am not sure we have an issue.
>> > As mentioned in your commit message, current code will use
>> > pte_offset_map_lock().  pte_offset_map_lock uses pte_lockptr, and pte_lockptr
>> > will either be the mm wide lock or pmd_page lock.  To me, it seems that
>> > either would provide correct synchronization for CONT-PTE entries.  Am I
>> > missing something or misreading the code?
>> > 
>> > I started looking at code cleanup suggested by David.  Here is a quick
>> > patch (not tested and likely containing errors) to see if this is a step
>> > in the right direction.
>> > 
>> > I like it because we get rid of/combine all those follow_huge_p*d
>> > routines.
>> > 
>> 
>> Yes, see comments below.
>> 
>> > From 35d117a707c1567ddf350554298697d40eace0d7 Mon Sep 17 00:00:00 2001
>> > From: Mike Kravetz <mike.kravetz@oracle.com>
>> > Date: Wed, 24 Aug 2022 15:59:15 -0700
>> > Subject: [PATCH] hugetlb: call hugetlb_follow_page_mask for hugetlb pages in
>> >  follow_page_mask
>> > 
>> > At the beginning of follow_page_mask, there currently is a call to
>> > follow_huge_addr which 'may' handle hugetlb pages.  ia64 is the only
>> > architecture which (incorrectly) provides a follow_huge_addr routine
>> > that does not return error.  Instead, at each level of the page table a
>> > check is made for a hugetlb entry.  If a hugetlb entry is found, a call
>> > to a routine associated with that page table level such as
>> > follow_huge_pmd is made.
>> > 
>> > All the follow_huge_p*d routines are basically the same.  In addition
>> > huge page size can be derived from the vma, so we know where in the page
>> > table a huge page would reside.  So, replace follow_huge_addr with a
>> > new architecture independent routine which will provide the same
>> > functionality as the follow_huge_p*d routines.  We can then eliminate
>> > the p*d_huge checks in follow_page_mask page table walking as well as
>> > the follow_huge_p*d routines themselves.
>> >
>> > follow_page_mask still has is_hugepd hugetlb checks during page table
>> > walking.  This is due to these checks and follow_huge_pd being
>> > architecture specific.  These can be eliminated if
>> > hugetlb_follow_page_mask can be overwritten by architectures (powerpc)
>> > that need to do follow_huge_pd processing.
>> 
>> But won't the
>> 
>> > +	/* hugetlb is special */
>> > +	if (is_vm_hugetlb_page(vma))
>> > +		return hugetlb_follow_page_mask(vma, address, flags);
>> 
>> code route everything via hugetlb_follow_page_mask() and all these
>> (beloved) hugepd checks would essentially be unreachable?
>> 
>> At least my understanding is that hugepd only applies to hugetlb.
>> 
>> Can't we move the hugepd handling code into hugetlb_follow_page_mask()
>> as well?
>> 
>> I mean, doesn't follow_hugetlb_page() also have to handle that hugepd
>> stuff already ... ?
>> 
>> [...]
>
> I think so, but I got a little confused looking at the hugepd handling code.
> Adding Aneesh who added support to follow_page_mask in the series at:
> https://lore.kernel.org/linux-mm/1494926612-23928-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com/
>
> I believe you are correct in that follow_hugetlb_page must handle as well.
>
> One source of my confusion is the following in follow_huge_pd:
>
> 	/*
> 	 * hugepage directory entries are protected by mm->page_table_lock
> 	 * Use this instead of huge_pte_lockptr
> 	 */
> 	ptl = &mm->page_table_lock;
> 	spin_lock(ptl);
>
> Yet, if follow_hugetlb_page handles hugepd then it is using huge_pte_lockptr
> to get the lock pointer and is wrong?
>
> Hoping Aneesh can help clear up the confusion.


I agree it is all confusing. At some point, the goal was to teach
generic kernel page table walking code about hugepd entries. But looking
at this again and considering we only have hugepd entries for hugetlb,
maybe the effort is not worth the complexity it adds.

i.e., instead of teaching the generic page table walk about different hugetlb
page table layouts, we special-case using is_vm_hugetlb_page(vma)
wherever we can.

With respect to huge_pte_lockptr, it is tricky (hugepd entries are not
PMD_SIZE):

static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
					   struct mm_struct *mm, pte_t *pte)
{
	if (huge_page_size(h) == PMD_SIZE)
		return pmd_lockptr(mm, (pmd_t *) pte);
	VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
	return &mm->page_table_lock;
}
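
A quick way to see what that branch means for the sizes discussed in this
thread is a few lines of standalone C (PMD_SIZE is hard-coded to the common
2M value here, and the function name is invented for the illustration):

#include <stdio.h>

#define PMD_SIZE	(2UL * 1024 * 1024)	/* assuming 4K base pages */

/* toy model of the branch in huge_pte_lockptr() quoted above */
static const char *lock_for_size(unsigned long sz)
{
	if (sz == PMD_SIZE)
		return "pmd split lock";
	return "mm->page_table_lock";
}

int main(void)
{
	printf("64K  CONT-PTE: %s\n", lock_for_size(64UL * 1024));
	printf("2M   PMD     : %s\n", lock_for_size(PMD_SIZE));
	printf("32M  CONT-PMD: %s\n", lock_for_size(32UL * 1024 * 1024));
	printf("1G   PUD     : %s\n", lock_for_size(1024UL * 1024 * 1024));
	return 0;
}

So every size other than exactly PMD_SIZE falls back to mm->page_table_lock,
which matches the lock follow_huge_pd() takes explicitly today.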

>
> BTW, I also noticed that the above series added the comment:
> 	/* make this handle hugepd */
> above the call to follow_huge_addr() in follow_page_mask.  Perhaps there
> was at one time a plan to have follow_huge_addr handle hugepd?  That
> series removed powerpc specific follow_huge_addr routine.
>
> -- 
> Mike Kravetz
Mike Kravetz Aug. 29, 2022, 6:30 p.m. UTC | #24
On 08/27/22 19:29, Aneesh Kumar K.V wrote:
> Mike Kravetz <mike.kravetz@oracle.com> writes:
> 
> >
> > On 08/25/22 09:25, David Hildenbrand wrote:
> >> > Is the primary concern the locking?  If so, I am not sure we have an issue.
> >> > As mentioned in your commit message, current code will use
> >> > pte_offset_map_lock().  pte_offset_map_lock uses pte_lockptr, and pte_lockptr
> >> > will either be the mm wide lock or pmd_page lock.  To me, it seems that
> >> > either would provide correct synchronization for CONT-PTE entries.  Am I
> >> > missing something or misreading the code?
> >> > 
> >> > I started looking at code cleanup suggested by David.  Here is a quick
> >> > patch (not tested and likely containing errors) to see if this is a step
> >> > in the right direction.
> >> > 
> >> > I like it because we get rid of/combine all those follow_huge_p*d
> >> > routines.
> >> > 
> >> 
> >> Yes, see comments below.
> >> 
> >> > From 35d117a707c1567ddf350554298697d40eace0d7 Mon Sep 17 00:00:00 2001
> >> > From: Mike Kravetz <mike.kravetz@oracle.com>
> >> > Date: Wed, 24 Aug 2022 15:59:15 -0700
> >> > Subject: [PATCH] hugetlb: call hugetlb_follow_page_mask for hugetlb pages in
> >> >  follow_page_mask
> >> > 
> >> > At the beginning of follow_page_mask, there currently is a call to
> >> > follow_huge_addr which 'may' handle hugetlb pages.  ia64 is the only
> >> > architecture which (incorrectly) provides a follow_huge_addr routine
> >> > that does not return error.  Instead, at each level of the page table a
> >> > check is made for a hugetlb entry.  If a hugetlb entry is found, a call
> >> > to a routine associated with that page table level such as
> >> > follow_huge_pmd is made.
> >> > 
> >> > All the follow_huge_p*d routines are basically the same.  In addition
> >> > huge page size can be derived from the vma, so we know where in the page
> >> > table a huge page would reside.  So, replace follow_huge_addr with a
> >> > new architecture independent routine which will provide the same
> >> > functionality as the follow_huge_p*d routines.  We can then eliminate
> >> > the p*d_huge checks in follow_page_mask page table walking as well as
> >> > the follow_huge_p*d routines themselves.
> >> >
> >> > follow_page_mask still has is_hugepd hugetlb checks during page table
> >> > walking.  This is due to these checks and follow_huge_pd being
> >> > architecture specific.  These can be eliminated if
> >> > hugetlb_follow_page_mask can be overwritten by architectures (powerpc)
> >> > that need to do follow_huge_pd processing.
> >> 
> >> But won't the
> >> 
> >> > +	/* hugetlb is special */
> >> > +	if (is_vm_hugetlb_page(vma))
> >> > +		return hugetlb_follow_page_mask(vma, address, flags);
> >> 
> >> code route everything via hugetlb_follow_page_mask() and all these
> >> (beloved) hugepd checks would essentially be unreachable?
> >> 
> >> At least my understanding is that hugepd only applies to hugetlb.
> >> 
> >> Can't we move the hugepd handling code into hugetlb_follow_page_mask()
> >> as well?
> >> 
> >> I mean, doesn't follow_hugetlb_page() also have to handle that hugepd
> >> stuff already ... ?
> >> 
> >> [...]
> >
> > I think so, but I got a little confused looking at the hugepd handling code.
> > Adding Aneesh who added support to follow_page_mask in the series at:
> > https://lore.kernel.org/linux-mm/1494926612-23928-1-git-send-email-aneesh.kumar@linux.vnet.ibm.com/
> >
> > I believe you are correct in that follow_hugetlb_page must handle as well.
> >
> > One source of my confusion is the following in follow_huge_pd:
> >
> > 	/*
> > 	 * hugepage directory entries are protected by mm->page_table_lock
> > 	 * Use this instead of huge_pte_lockptr
> > 	 */
> > 	ptl = &mm->page_table_lock;
> > 	spin_lock(ptl);
> >
> > Yet, if follow_hugetlb_page handles hugepd then it is using huge_pte_lockptr
> > to get the lock pointer and is wrong?
> >
> > Hoping Aneesh can help clear up the confusion.
> 
> 
> I agree it is all confusing. At some point, the goal was to teach
> generic kernel page table walking code about hugepd entries. But looking
> at this again and considering we only have hugepd entries for hugetlb,
> maybe the effort is not worth the complexity it adds.
> 
> i.e., instead of teaching the generic page table walk about different hugetlb
> page table layouts, we special-case using is_vm_hugetlb_page(vma)
> wherever we can.

Thanks for your comments Aneesh.

I give David credit for suggesting that it would be simpler to just special-case
hugetlb mappings here.  Based on your comments, I believe an arch
independent 'hugetlb_follow_page_mask' routine would handle all cases and
we can remove the 'if (p*d_huge)' blocks and 'if (is_hugepd)' blocks of
code from follow_page_mask.

Such a routine would call huge_pte_offset() which can be/is overwritten by
arch specific code.  In fact, the powerpc version of this already handles
hugepd entries.
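
For reference, the arch-override mechanism used by the __weak follow_huge_*()
definitions can be seen in isolation with a tiny userspace example (the
function name below is invented for the demo; gcc/clang on ELF):

#include <stdio.h>

/* "generic" default, analogous to the __weak definitions in mm/hugetlb.c */
__attribute__((weak)) const char *which_walker(void)
{
	return "generic hugetlb walker";
}

/*
 * An arch object file could provide a strong definition instead, e.g.
 *
 *	const char *which_walker(void) { return "powerpc hugepd walker"; }
 *
 * and the linker would pick it over the weak default without any #ifdefs.
 */

int main(void)
{
	printf("%s\n", which_walker());
	return 0;
}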

> With respect to huge_pte_lockptr, it is tricky (hugepd entries are not
> PMD_SIZE) 
> 
> static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
> 					   struct mm_struct *mm, pte_t *pte)
> {
> 	if (huge_page_size(h) == PMD_SIZE)
> 		return pmd_lockptr(mm, (pmd_t *) pte);
> 	VM_BUG_ON(huge_page_size(h) == PAGE_SIZE);
> 	return &mm->page_table_lock;
> }

Ok, so I think you confirmed that huge_pte_lockptr would work for hugepd
entries as they are never PMD_SIZE.

I will be sure to cc you on the proposed changes.  Thanks,
diff mbox series

Patch

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 3ec981a..d491138 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -207,6 +207,8 @@  struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
 struct page *follow_huge_pd(struct vm_area_struct *vma,
 			    unsigned long address, hugepd_t hpd,
 			    int flags, int pdshift);
+struct page *follow_huge_pte(struct vm_area_struct *vma, unsigned long address,
+			     pmd_t *pmd, int flags);
 struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 				pmd_t *pmd, int flags);
 struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
@@ -312,6 +314,12 @@  static inline struct page *follow_huge_pd(struct vm_area_struct *vma,
 	return NULL;
 }
 
+static inline struct page *follow_huge_pte(struct vm_area_struct *vma,
+				unsigned long address, pmd_t *pmd, int flags)
+{
+	return NULL;
+}
+
 static inline struct page *follow_huge_pmd(struct mm_struct *mm,
 				unsigned long address, pmd_t *pmd, int flags)
 {
diff --git a/mm/gup.c b/mm/gup.c
index 3b656b7..87a94f5 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -534,6 +534,17 @@  static struct page *follow_page_pte(struct vm_area_struct *vma,
 	if (unlikely(pmd_bad(*pmd)))
 		return no_page_table(vma, flags);
 
+	/*
+	 * Considering PTE level hugetlb, like continuous-PTE hugetlb on
+	 * ARM64 architecture.
+	 */
+	if (is_vm_hugetlb_page(vma)) {
+		page = follow_huge_pte(vma, address, pmd, flags);
+		if (page)
+			return page;
+		return no_page_table(vma, flags);
+	}
+
 	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
 	pte = *ptep;
 	if (!pte_present(pte)) {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6c00ba1..cf742d1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6981,6 +6981,59 @@  struct page * __weak
 	return NULL;
 }
 
+/* Support looking up a CONT-PTE size hugetlb page. */
+struct page * __weak
+follow_huge_pte(struct vm_area_struct *vma, unsigned long address,
+		pmd_t *pmd, int flags)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct hstate *hstate = hstate_vma(vma);
+	unsigned long size = huge_page_size(hstate);
+	struct page *page = NULL;
+	spinlock_t *ptl;
+	pte_t *ptep, pte;
+
+	/*
+	 * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via
+	 * follow_hugetlb_page().
+	 */
+	if (WARN_ON_ONCE(flags & FOLL_PIN))
+		return NULL;
+
+	ptep = huge_pte_offset(mm, address, size);
+	if (!ptep)
+		return NULL;
+
+retry:
+	ptl = huge_pte_lock(hstate, mm, ptep);
+	pte = huge_ptep_get(ptep);
+	if (pte_present(pte)) {
+		page = pte_page(pte);
+		if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
+			page = NULL;
+			goto out;
+		}
+	} else {
+		if (!(flags & FOLL_MIGRATION)) {
+			page = NULL;
+			goto out;
+		}
+
+		if (is_hugetlb_entry_migration(pte)) {
+			spin_unlock(ptl);
+			__migration_entry_wait_huge(ptep, ptl);
+			goto retry;
+		}
+		/*
+		 * hwpoisoned entry is treated as no_page_table in
+		 * follow_page_mask().
+		 */
+	}
+out:
+	spin_unlock(ptl);
+	return page;
+}
+
 struct page * __weak
 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 		pmd_t *pmd, int flags)