
[v1,1/2] mm: let pte_lockptr() consume a pte_t pointer

Message ID 20240725183955.2268884-2-david@redhat.com (mailing list archive)
State New
Series mm/hugetlb: fix hugetlb vs. core-mm PT locking

Commit Message

David Hildenbrand July 25, 2024, 6:39 p.m. UTC
pte_lockptr() is the only *_lockptr() function that doesn't consume
what would be expected: it consumes a pmd_t pointer instead of a pte_t
pointer.

Let's change that. The two callers in pgtable-generic.c are easily
adjusted. Adjust khugepaged.c:retract_page_tables() to simply do a
pte_offset_map_nolock() to obtain the lock, even though we won't actually
be traversing the page table.

This makes the code more similar to the other variants and avoids other
hacks to make the new pte_lockptr() version happy. pte_lockptr() users
now reside only in pgtable-generic.c.

Maybe, using pte_offset_map_nolock() is the right thing to do because
the PTE table could have been removed in the meantime? At least it sounds
more future proof if we ever have other means of page table reclaim.

It's not quite clear if holding the PTE table lock is really required:
what if someone else obtains the lock just after we unlock it? But we'll
leave that as is for now, maybe there are good reasons.

This is a preparation for adapting hugetlb page table locking logic to
take the same locks as core-mm page table walkers would.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 include/linux/mm.h   |  7 ++++---
 mm/khugepaged.c      | 21 +++++++++++++++------
 mm/pgtable-generic.c |  4 ++--
 3 files changed, 21 insertions(+), 11 deletions(-)

Comments

Peter Xu July 26, 2024, 3:36 p.m. UTC | #1
On Thu, Jul 25, 2024 at 08:39:54PM +0200, David Hildenbrand wrote:
> pte_lockptr() is the only *_lockptr() function that doesn't consume
> what would be expected: it consumes a pmd_t pointer instead of a pte_t
> pointer.
> 
> Let's change that. The two callers in pgtable-generic.c are easily
> adjusted. Adjust khugepaged.c:retract_page_tables() to simply do a
> pte_offset_map_nolock() to obtain the lock, even though we won't actually
> be traversing the page table.
> 
> This makes the code more similar to the other variants and avoids other
> hacks to make the new pte_lockptr() version happy. pte_lockptr() users
> reside now only in  pgtable-generic.c.
> 
> Maybe, using pte_offset_map_nolock() is the right thing to do because
> the PTE table could have been removed in the meantime? At least it sounds
> more future proof if we ever have other means of page table reclaim.

I think it can't change, because anyone who wants to race against this
should try to take the pmd lock first (which was held already)?

I wonder whether an open-coded "ptlock_ptr(page_ptdesc(pmd_page(*pmd)))" would be
nicer here, but only if my understanding is correct.
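
For illustration, the open-coded alternative in retract_page_tables() would
look roughly like the sketch below (just showing the idea, not a tested change):

```
	pml = pmd_lock(mm, pmd);
	/* Sketch: derive the PTE table's lock directly from the pmd entry. */
	ptl = ptlock_ptr(page_ptdesc(pmd_page(*pmd)));
	if (ptl != pml)
		spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
```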

Thanks,

> 
> It's not quite clear if holding the PTE table lock is really required:
> what if someone else obtains the lock just after we unlock it? But we'll
> leave that as is for now, maybe there are good reasons.
> 
> This is a preparation for adapting hugetlb page table locking logic to
> take the same locks as core-mm page table walkers would.
> 
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>  include/linux/mm.h   |  7 ++++---
>  mm/khugepaged.c      | 21 +++++++++++++++------
>  mm/pgtable-generic.c |  4 ++--
>  3 files changed, 21 insertions(+), 11 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 2c6ccf088c7be..0472a5090b180 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2873,9 +2873,10 @@ static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
>  }
>  #endif /* ALLOC_SPLIT_PTLOCKS */
>  
> -static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
> +static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pte_t *pte)
>  {
> -	return ptlock_ptr(page_ptdesc(pmd_page(*pmd)));
> +	/* PTE page tables don't currently exceed a single page. */
> +	return ptlock_ptr(virt_to_ptdesc(pte));
>  }
>  
>  static inline bool ptlock_init(struct ptdesc *ptdesc)
> @@ -2898,7 +2899,7 @@ static inline bool ptlock_init(struct ptdesc *ptdesc)
>  /*
>   * We use mm->page_table_lock to guard all pagetable pages of the mm.
>   */
> -static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
> +static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pte_t *pte)
>  {
>  	return &mm->page_table_lock;
>  }
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index cdd1d8655a76b..f3b3db1046155 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1697,12 +1697,13 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>  	i_mmap_lock_read(mapping);
>  	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
>  		struct mmu_notifier_range range;
> +		bool retracted = false;
>  		struct mm_struct *mm;
>  		unsigned long addr;
>  		pmd_t *pmd, pgt_pmd;
>  		spinlock_t *pml;
>  		spinlock_t *ptl;
> -		bool skipped_uffd = false;
> +		pte_t *pte;
>  
>  		/*
>  		 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
> @@ -1739,9 +1740,17 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>  		mmu_notifier_invalidate_range_start(&range);
>  
>  		pml = pmd_lock(mm, pmd);
> -		ptl = pte_lockptr(mm, pmd);
> +
> +		/*
> +		 * No need to check the PTE table content, but we'll grab the
> +		 * PTE table lock while we zap it.
> +		 */
> +		pte = pte_offset_map_nolock(mm, pmd, addr, &ptl);
> +		if (!pte)
> +			goto unlock_pmd;
>  		if (ptl != pml)
>  			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
> +		pte_unmap(pte);
>  
>  		/*
>  		 * Huge page lock is still held, so normally the page table
> @@ -1752,20 +1761,20 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>  		 * repeating the anon_vma check protects from one category,
>  		 * and repeating the userfaultfd_wp() check from another.
>  		 */
> -		if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) {
> -			skipped_uffd = true;
> -		} else {
> +		if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) {
>  			pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
>  			pmdp_get_lockless_sync();
> +			retracted = true;
>  		}
>  
>  		if (ptl != pml)
>  			spin_unlock(ptl);
> +unlock_pmd:
>  		spin_unlock(pml);
>  
>  		mmu_notifier_invalidate_range_end(&range);
>  
> -		if (!skipped_uffd) {
> +		if (retracted) {
>  			mm_dec_nr_ptes(mm);
>  			page_table_check_pte_clear_range(mm, addr, pgt_pmd);
>  			pte_free_defer(mm, pmd_pgtable(pgt_pmd));
> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
> index a78a4adf711ac..13a7705df3f87 100644
> --- a/mm/pgtable-generic.c
> +++ b/mm/pgtable-generic.c
> @@ -313,7 +313,7 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
>  
>  	pte = __pte_offset_map(pmd, addr, &pmdval);
>  	if (likely(pte))
> -		*ptlp = pte_lockptr(mm, &pmdval);
> +		*ptlp = pte_lockptr(mm, pte);
>  	return pte;
>  }
>  
> @@ -371,7 +371,7 @@ pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
>  	pte = __pte_offset_map(pmd, addr, &pmdval);
>  	if (unlikely(!pte))
>  		return pte;
> -	ptl = pte_lockptr(mm, &pmdval);
> +	ptl = pte_lockptr(mm, pte);
>  	spin_lock(ptl);
>  	if (likely(pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
>  		*ptlp = ptl;
> -- 
> 2.45.2
>
David Hildenbrand July 26, 2024, 4:02 p.m. UTC | #2
On 26.07.24 17:36, Peter Xu wrote:
> On Thu, Jul 25, 2024 at 08:39:54PM +0200, David Hildenbrand wrote:
>> pte_lockptr() is the only *_lockptr() function that doesn't consume
>> what would be expected: it consumes a pmd_t pointer instead of a pte_t
>> pointer.
>>
>> Let's change that. The two callers in pgtable-generic.c are easily
>> adjusted. Adjust khugepaged.c:retract_page_tables() to simply do a
>> pte_offset_map_nolock() to obtain the lock, even though we won't actually
>> be traversing the page table.
>>
>> This makes the code more similar to the other variants and avoids other
>> hacks to make the new pte_lockptr() version happy. pte_lockptr() users
>> reside now only in  pgtable-generic.c.
>>
>> Maybe, using pte_offset_map_nolock() is the right thing to do because
>> the PTE table could have been removed in the meantime? At least it sounds
>> more future proof if we ever have other means of page table reclaim.
> 
> I think it can't change, because anyone who wants to race against this
> should try to take the pmd lock first (which was held already)?

That doesn't explain why it is safe for us to assume that, after we took
the PMD lock, the PMD actually still points at a completely empty
page table. Likely it currently works by accident, because we only have
a single such user that makes this assumption. It might certainly be a
different story once we asynchronously reclaim page tables.

But yes, the PMD cannot get modified while we hold the PMD lock,
otherwise we'd be in trouble.

> 
> I wonder an open coded "ptlock_ptr(page_ptdesc(pmd_page(*pmd)))" would be
> nicer here, but only if my understanding is correct.

I really don't like open-coding that. Fortunately we were able to limit 
the use of ptlock_ptr to a single user outside of arch/x86/xen/mmu_pv.c 
so far.
Peter Xu July 26, 2024, 9:28 p.m. UTC | #3
On Fri, Jul 26, 2024 at 06:02:17PM +0200, David Hildenbrand wrote:
> On 26.07.24 17:36, Peter Xu wrote:
> > On Thu, Jul 25, 2024 at 08:39:54PM +0200, David Hildenbrand wrote:
> > > pte_lockptr() is the only *_lockptr() function that doesn't consume
> > > what would be expected: it consumes a pmd_t pointer instead of a pte_t
> > > pointer.
> > > 
> > > Let's change that. The two callers in pgtable-generic.c are easily
> > > adjusted. Adjust khugepaged.c:retract_page_tables() to simply do a
> > > pte_offset_map_nolock() to obtain the lock, even though we won't actually
> > > be traversing the page table.
> > > 
> > > This makes the code more similar to the other variants and avoids other
> > > hacks to make the new pte_lockptr() version happy. pte_lockptr() users
> > > reside now only in  pgtable-generic.c.
> > > 
> > > Maybe, using pte_offset_map_nolock() is the right thing to do because
> > > the PTE table could have been removed in the meantime? At least it sounds
> > > more future proof if we ever have other means of page table reclaim.
> > 
> > I think it can't change, because anyone who wants to race against this
> > should try to take the pmd lock first (which was held already)?
> 
> That doesn't explain why it is safe for us to assume that after we took the
> PMD lock that the PMD actually still points at a completely empty page
> table. Likely it currently works by accident, because we only have a single
> such user that makes this assumption. It might certainly be a different once
> we asynchronously reclaim page tables.

I think it's safe because find_pmd_or_thp_or_none() returned SUCCEED, and
we're holding i_mmap lock for read.  I don't see any way that this pmd can
become a non-pgtable-page.

I meant, AFAIU tearing down pgtable in whatever sane way will need to at
least take both mmap write lock and i_mmap write lock (in this case, a file
mapping), no?

> 
> But yes, the PMD cannot get modified while we hold the PMD lock, otherwise
> we'd be in trouble
> 
> > 
> > I wonder an open coded "ptlock_ptr(page_ptdesc(pmd_page(*pmd)))" would be
> > nicer here, but only if my understanding is correct.
> 
> I really don't like open-coding that. Fortunately we were able to limit the
> use of ptlock_ptr to a single user outside of arch/x86/xen/mmu_pv.c so far.

I'm fine if you prefer it like that; I don't see it as a huge deal.

Thanks,
David Hildenbrand July 26, 2024, 9:48 p.m. UTC | #4
On 26.07.24 23:28, Peter Xu wrote:
> On Fri, Jul 26, 2024 at 06:02:17PM +0200, David Hildenbrand wrote:
>> On 26.07.24 17:36, Peter Xu wrote:
>>> On Thu, Jul 25, 2024 at 08:39:54PM +0200, David Hildenbrand wrote:
>>>> pte_lockptr() is the only *_lockptr() function that doesn't consume
>>>> what would be expected: it consumes a pmd_t pointer instead of a pte_t
>>>> pointer.
>>>>
>>>> Let's change that. The two callers in pgtable-generic.c are easily
>>>> adjusted. Adjust khugepaged.c:retract_page_tables() to simply do a
>>>> pte_offset_map_nolock() to obtain the lock, even though we won't actually
>>>> be traversing the page table.
>>>>
>>>> This makes the code more similar to the other variants and avoids other
>>>> hacks to make the new pte_lockptr() version happy. pte_lockptr() users
>>>> reside now only in  pgtable-generic.c.
>>>>
>>>> Maybe, using pte_offset_map_nolock() is the right thing to do because
>>>> the PTE table could have been removed in the meantime? At least it sounds
>>>> more future proof if we ever have other means of page table reclaim.
>>>
>>> I think it can't change, because anyone who wants to race against this
>>> should try to take the pmd lock first (which was held already)?
>>
>> That doesn't explain why it is safe for us to assume that after we took the
>> PMD lock that the PMD actually still points at a completely empty page
>> table. Likely it currently works by accident, because we only have a single
>> such user that makes this assumption. It might certainly be a different once
>> we asynchronously reclaim page tables.
> 
> I think it's safe because find_pmd_or_thp_or_none() returned SUCCEED, and
> we're holding i_mmap lock for read.  I don't see any way that this pmd can
> become a non-pgtable-page.
> 
> I meant, AFAIU tearing down pgtable in whatever sane way will need to at
> least take both mmap write lock and i_mmap write lock (in this case, a file
> mapping), no?

Skimming over [1], where I still owe a review, I think we can now do it
purely under the read locks, with the PMD lock held.

I think this is also what collapse_pte_mapped_thp() ends up doing: 
replace a PTE table that maps a folio by a PMD (present or none, 
depends) that maps a folio only while holding the mmap lock in read 
mode. Of course, here the table is not empty but we need similar ways of 
making PT walkers aware of concurrent page table retraction.

IIRC, that was the magic added to __pte_offset_map(), such that 
pte_offset_map_nolock/pte_offset_map_lock can fail on races.


But if we hold the PMD lock, nothing should actually change (as far as I
understand) -- we cannot suddenly rip out a page table.

[1] https://lkml.kernel.org/r/cover.1719570849.git.zhengqi.arch@bytedance.com

> 
>>
>> But yes, the PMD cannot get modified while we hold the PMD lock, otherwise
>> we'd be in trouble
>>
>>>
>>> I wonder an open coded "ptlock_ptr(page_ptdesc(pmd_page(*pmd)))" would be
>>> nicer here, but only if my understanding is correct.
>>
>> I really don't like open-coding that. Fortunately we were able to limit the
>> use of ptlock_ptr to a single user outside of arch/x86/xen/mmu_pv.c so far.
> 
> I'm fine if you prefer like that; I don't see it a huge deal to me.

Let's keep it like that, unless we can come up with something neater. At
least it makes the code more consistent with similar code in that file,
and the overhead should be minimal.

I was briefly thinking about actually testing if the PT is full of 
pte_none(), either as a debugging check or to also handle what is 
currently handled via:

if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) {

It seems wasteful to let all other parts of a VMA suffer just because some
part might have a private page mapped / uffd-wp active.

Will think about whether that is really worth it.
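
As a rough idea, such a check could look something like the following
hypothetical helper (not part of this patch), to be called with the PTE
table lock held:

```
/* Hypothetical sketch: return true if the PTE table contains only pte_none(). */
static bool pte_table_empty(pte_t *pte)
{
	int i;

	for (i = 0; i < PTRS_PER_PTE; i++) {
		if (!pte_none(ptep_get(pte + i)))
			return false;
	}
	return true;
}
```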

... also because I still want to understand why the PTL of the PMD table 
is required at all. What if we lock it first and somebody else wants to 
lock it after us while we already ripped it out? Sure there must be some 
reason for the lock, I just don't understand it yet :/.
Qi Zheng July 29, 2024, 6:19 a.m. UTC | #5
Hi David,

On 2024/7/27 05:48, David Hildenbrand wrote:
> On 26.07.24 23:28, Peter Xu wrote:
>> On Fri, Jul 26, 2024 at 06:02:17PM +0200, David Hildenbrand wrote:
>>> On 26.07.24 17:36, Peter Xu wrote:
>>>> On Thu, Jul 25, 2024 at 08:39:54PM +0200, David Hildenbrand wrote:
>>>>> pte_lockptr() is the only *_lockptr() function that doesn't consume
>>>>> what would be expected: it consumes a pmd_t pointer instead of a pte_t
>>>>> pointer.
>>>>>
>>>>> Let's change that. The two callers in pgtable-generic.c are easily
>>>>> adjusted. Adjust khugepaged.c:retract_page_tables() to simply do a
>>>>> pte_offset_map_nolock() to obtain the lock, even though we won't 
>>>>> actually
>>>>> be traversing the page table.
>>>>>
>>>>> This makes the code more similar to the other variants and avoids 
>>>>> other
>>>>> hacks to make the new pte_lockptr() version happy. pte_lockptr() users
>>>>> reside now only in  pgtable-generic.c.
>>>>>
>>>>> Maybe, using pte_offset_map_nolock() is the right thing to do because
>>>>> the PTE table could have been removed in the meantime? At least it 
>>>>> sounds
>>>>> more future proof if we ever have other means of page table reclaim.
>>>>
>>>> I think it can't change, because anyone who wants to race against this
>>>> should try to take the pmd lock first (which was held already)?
>>>
>>> That doesn't explain why it is safe for us to assume that after we 
>>> took the
>>> PMD lock that the PMD actually still points at a completely empty page
>>> table. Likely it currently works by accident, because we only have a 
>>> single
>>> such user that makes this assumption. It might certainly be a 
>>> different once
>>> we asynchronously reclaim page tables.
>>
>> I think it's safe because find_pmd_or_thp_or_none() returned SUCCEED, and
>> we're holding i_mmap lock for read.  I don't see any way that this pmd 
>> can
>> become a non-pgtable-page.
>>
>> I meant, AFAIU tearing down pgtable in whatever sane way will need to at
>> least take both mmap write lock and i_mmap write lock (in this case, a 
>> file
>> mapping), no?
> 
> Skimming over [1] where I still owe a review I think we can now do it 
> now purely under the read locks, with the PMD lock held.

Yes.

> 
> I think this is also what collapse_pte_mapped_thp() ends up doing: 
> replace a PTE table that maps a folio by a PMD (present or none, 
> depends) that maps a folio only while holding the mmap lock in read 
> mode. Of course, here the table is not empty but we need similar ways of 
> making PT walkers aware of concurrent page table retraction.
> 
> IIRC, that was the magic added to __pte_offset_map(), such that 
> pte_offset_map_nolock/pte_offset_map_lock can fail on races.
> 
> 
> But if we hold the PMD lock, nothing should actually change (so far my 
> understanding) -- we cannot suddenly rip out a page table.
> 
> [1] 
> https://lkml.kernel.org/r/cover.1719570849.git.zhengqi.arch@bytedance.com
> 
>>
>>>
>>> But yes, the PMD cannot get modified while we hold the PMD lock, 
>>> otherwise
>>> we'd be in trouble
>>>
>>>>
>>>> I wonder an open coded "ptlock_ptr(page_ptdesc(pmd_page(*pmd)))" 
>>>> would be
>>>> nicer here, but only if my understanding is correct.
>>>
>>> I really don't like open-coding that. Fortunately we were able to 
>>> limit the
>>> use of ptlock_ptr to a single user outside of arch/x86/xen/mmu_pv.c 
>>> so far.
>>
>> I'm fine if you prefer like that; I don't see it a huge deal to me.
> 
> Let's keep it like that, unless we can come up with something neater. At 
> least it makes the code also more consistent with similar code in that 
> file and the overhead should be  minimal.
> 
> I was briefly thinking about actually testing if the PT is full of 
> pte_none(), either as a debugging check or to also handle what is
> currently handled via:
> 
> if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) {
> 
> Seems wasteful just because some part of a VMA might have a private page 
> mapped / uffd-wp active to let all other parts suffer.
> 
> Will think about if that is really worth it.
> 
> ... also because I still want to understand why the PTL of the PMD table 
> is required at all. What if we lock it first and somebody else wants to 
> lock it after us while we already ripped it out? Sure there must be some 
> reason for the lock, I just don't understand it yet :/.

For the pmd lock, I think this is needed to clear the pmd entry
(pmdp_collapse_flush()). For the pte lock, there should be the following two
reasons:

1. We release it after clearing the pmd entry, so that the changed pmd can
    then be caught in pte_offset_map_lock() etc. after taking this pte lock.
    (This is also what I did in my patchset; see the sketch below.)

2. As mentioned in the comments, we may be concurrent with
    userfaultfd_ioctl(), but we do not hold the read lock of mmap (or the
    read lock of the vma), so VM_UFFD_WP may get set. Therefore, we need
    to hold the pte lock to check whether a new pte entry has been
    inserted.
    (See commit [1] for more details.)

[1]. https://github.com/torvalds/linux/commit/a98460494b16db9c377e55bc13e5407a0eb79fe8
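
To illustrate reason 1, the pattern a concurrent page table walker relies on
is roughly the following sketch, based on __pte_offset_map_lock() above:

```
	pte = __pte_offset_map(pmd, addr, &pmdval);
	ptl = pte_lockptr(mm, pte);
	spin_lock(ptl);
	/* The pmd may have been cleared by retract_page_tables() in the meantime. */
	if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
		pte_unmap_unlock(pte, ptl);
		/* bail out / retry */
	}
```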

Thanks,
Qi

>
Qi Zheng July 29, 2024, 7:48 a.m. UTC | #6
On 2024/7/26 02:39, David Hildenbrand wrote:
> pte_lockptr() is the only *_lockptr() function that doesn't consume
> what would be expected: it consumes a pmd_t pointer instead of a pte_t
> pointer.
> 
> Let's change that. The two callers in pgtable-generic.c are easily
> adjusted. Adjust khugepaged.c:retract_page_tables() to simply do a
> pte_offset_map_nolock() to obtain the lock, even though we won't actually
> be traversing the page table.
> 
> This makes the code more similar to the other variants and avoids other
> hacks to make the new pte_lockptr() version happy. pte_lockptr() users
> reside now only in  pgtable-generic.c.
> 
> Maybe, using pte_offset_map_nolock() is the right thing to do because
> the PTE table could have been removed in the meantime? At least it sounds
> more future proof if we ever have other means of page table reclaim.

Agree, this helps us recheck the pmd entry.

> 
> It's not quite clear if holding the PTE table lock is really required:
> what if someone else obtains the lock just after we unlock it? But we'll
> leave that as is for now, maybe there are good reasons.
> 
> This is a preparation for adapting hugetlb page table locking logic to
> take the same locks as core-mm page table walkers would.
> 
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>   include/linux/mm.h   |  7 ++++---
>   mm/khugepaged.c      | 21 +++++++++++++++------
>   mm/pgtable-generic.c |  4 ++--
>   3 files changed, 21 insertions(+), 11 deletions(-)

Since pte_lockptr() no longer has a pmd parameter, it is best to modify
the comments above __pte_offset_map_lock() as well:

```
This helps the caller to avoid a later pte_lockptr(mm, *pmd), which 
might by that time act on a changed *pmd ...
```

Otherwise:
Reviewed-by: Qi Zheng <zhengqi.arch@bytedance.com>

> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 2c6ccf088c7be..0472a5090b180 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2873,9 +2873,10 @@ static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
>   }
>   #endif /* ALLOC_SPLIT_PTLOCKS */
>   
> -static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
> +static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pte_t *pte)
>   {
> -	return ptlock_ptr(page_ptdesc(pmd_page(*pmd)));
> +	/* PTE page tables don't currently exceed a single page. */
> +	return ptlock_ptr(virt_to_ptdesc(pte));
>   }
>   
>   static inline bool ptlock_init(struct ptdesc *ptdesc)
> @@ -2898,7 +2899,7 @@ static inline bool ptlock_init(struct ptdesc *ptdesc)
>   /*
>    * We use mm->page_table_lock to guard all pagetable pages of the mm.
>    */
> -static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
> +static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pte_t *pte)
>   {
>   	return &mm->page_table_lock;
>   }
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index cdd1d8655a76b..f3b3db1046155 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1697,12 +1697,13 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>   	i_mmap_lock_read(mapping);
>   	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
>   		struct mmu_notifier_range range;
> +		bool retracted = false;
>   		struct mm_struct *mm;
>   		unsigned long addr;
>   		pmd_t *pmd, pgt_pmd;
>   		spinlock_t *pml;
>   		spinlock_t *ptl;
> -		bool skipped_uffd = false;
> +		pte_t *pte;
>   
>   		/*
>   		 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
> @@ -1739,9 +1740,17 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>   		mmu_notifier_invalidate_range_start(&range);
>   
>   		pml = pmd_lock(mm, pmd);
> -		ptl = pte_lockptr(mm, pmd);
> +
> +		/*
> +		 * No need to check the PTE table content, but we'll grab the
> +		 * PTE table lock while we zap it.
> +		 */
> +		pte = pte_offset_map_nolock(mm, pmd, addr, &ptl);
> +		if (!pte)
> +			goto unlock_pmd;
>   		if (ptl != pml)
>   			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
> +		pte_unmap(pte);
>   
>   		/*
>   		 * Huge page lock is still held, so normally the page table
> @@ -1752,20 +1761,20 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>   		 * repeating the anon_vma check protects from one category,
>   		 * and repeating the userfaultfd_wp() check from another.
>   		 */
> -		if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) {
> -			skipped_uffd = true;
> -		} else {
> +		if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) {
>   			pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
>   			pmdp_get_lockless_sync();
> +			retracted = true;
>   		}
>   
>   		if (ptl != pml)
>   			spin_unlock(ptl);
> +unlock_pmd:
>   		spin_unlock(pml);
>   
>   		mmu_notifier_invalidate_range_end(&range);
>   
> -		if (!skipped_uffd) {
> +		if (retracted) {
>   			mm_dec_nr_ptes(mm);
>   			page_table_check_pte_clear_range(mm, addr, pgt_pmd);
>   			pte_free_defer(mm, pmd_pgtable(pgt_pmd));
> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
> index a78a4adf711ac..13a7705df3f87 100644
> --- a/mm/pgtable-generic.c
> +++ b/mm/pgtable-generic.c
> @@ -313,7 +313,7 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
>   
>   	pte = __pte_offset_map(pmd, addr, &pmdval);
>   	if (likely(pte))
> -		*ptlp = pte_lockptr(mm, &pmdval);
> +		*ptlp = pte_lockptr(mm, pte);
>   	return pte;
>   }
>   
> @@ -371,7 +371,7 @@ pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
>   	pte = __pte_offset_map(pmd, addr, &pmdval);
>   	if (unlikely(!pte))
>   		return pte;
> -	ptl = pte_lockptr(mm, &pmdval);
> +	ptl = pte_lockptr(mm, pte);
>   	spin_lock(ptl);
>   	if (likely(pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
>   		*ptlp = ptl;
David Hildenbrand July 29, 2024, 8:46 a.m. UTC | #7
On 29.07.24 09:48, Qi Zheng wrote:
> 
> 
> On 2024/7/26 02:39, David Hildenbrand wrote:
>> pte_lockptr() is the only *_lockptr() function that doesn't consume
>> what would be expected: it consumes a pmd_t pointer instead of a pte_t
>> pointer.
>>
>> Let's change that. The two callers in pgtable-generic.c are easily
>> adjusted. Adjust khugepaged.c:retract_page_tables() to simply do a
>> pte_offset_map_nolock() to obtain the lock, even though we won't actually
>> be traversing the page table.
>>
>> This makes the code more similar to the other variants and avoids other
>> hacks to make the new pte_lockptr() version happy. pte_lockptr() users
>> reside now only in  pgtable-generic.c.
>>
>> Maybe, using pte_offset_map_nolock() is the right thing to do because
>> the PTE table could have been removed in the meantime? At least it sounds
>> more future proof if we ever have other means of page table reclaim.
> 
> Agree, this helps us recheck the pmd entry.
> 
>>
>> It's not quite clear if holding the PTE table lock is really required:
>> what if someone else obtains the lock just after we unlock it? But we'll
>> leave that as is for now, maybe there are good reasons.
>>
>> This is a preparation for adapting hugetlb page table locking logic to
>> take the same locks as core-mm page table walkers would.
>>
>> Signed-off-by: David Hildenbrand <david@redhat.com>
>> ---
>>    include/linux/mm.h   |  7 ++++---
>>    mm/khugepaged.c      | 21 +++++++++++++++------
>>    mm/pgtable-generic.c |  4 ++--
>>    3 files changed, 21 insertions(+), 11 deletions(-)
> 
> Since pte_lockptr() no longer has a pmd parameter, it is best to modify
> the comments above __pte_offset_map_lock() as well:
> 
> ```
> This helps the caller to avoid a later pte_lockptr(mm, *pmd), which
> might by that time act on a changed *pmd ...
> ```

Right, thanks a lot for the review!

The following on top:


 From a46b16aa9bfa02ffb425d364d7f00129a8e803ad Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Mon, 29 Jul 2024 10:43:34 +0200
Subject: [PATCH] fixup: mm: let pte_lockptr() consume a pte_t pointer

Let's adjust the comment, passing a pte to pte_lockptr() and dropping
a detail about changed *pmd, which no longer applies.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
  mm/pgtable-generic.c | 10 +++++-----
  1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 13a7705df3f87..f17465b43d344 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -350,11 +350,11 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
   * pte_offset_map_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map();
   * but when successful, it also outputs a pointer to the spinlock in ptlp - as
   * pte_offset_map_lock() does, but in this case without locking it.  This helps
- * the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time
- * act on a changed *pmd: pte_offset_map_nolock() provides the correct spinlock
- * pointer for the page table that it returns.  In principle, the caller should
- * recheck *pmd once the lock is taken; in practice, no callsite needs that -
- * either the mmap_lock for write, or pte_same() check on contents, is enough.
+ * the caller to avoid a later pte_lockptr(mm, pte): pte_offset_map_nolock()
+ * provides the correct spinlock pointer for the page table that it returns.
+ * In principle, the caller should recheck *pmd once the lock is taken; in
+ * practice, no callsite needs that - either the mmap_lock for write, or
+ * pte_same() check on contents, is enough.
   *
   * Note that free_pgtables(), used after unmapping detached vmas, or when
   * exiting the whole mm, does not take page table lock before freeing a page
Qi Zheng July 29, 2024, 8:52 a.m. UTC | #8
On 2024/7/29 16:46, David Hildenbrand wrote:
> On 29.07.24 09:48, Qi Zheng wrote:
>>
>>
>> On 2024/7/26 02:39, David Hildenbrand wrote:
>>> pte_lockptr() is the only *_lockptr() function that doesn't consume
>>> what would be expected: it consumes a pmd_t pointer instead of a pte_t
>>> pointer.
>>>
>>> Let's change that. The two callers in pgtable-generic.c are easily
>>> adjusted. Adjust khugepaged.c:retract_page_tables() to simply do a
>>> pte_offset_map_nolock() to obtain the lock, even though we won't 
>>> actually
>>> be traversing the page table.
>>>
>>> This makes the code more similar to the other variants and avoids other
>>> hacks to make the new pte_lockptr() version happy. pte_lockptr() users
>>> reside now only in  pgtable-generic.c.
>>>
>>> Maybe, using pte_offset_map_nolock() is the right thing to do because
>>> the PTE table could have been removed in the meantime? At least it 
>>> sounds
>>> more future proof if we ever have other means of page table reclaim.
>>
>> Agree, this helps us recheck the pmd entry.
>>
>>>
>>> It's not quite clear if holding the PTE table lock is really required:
>>> what if someone else obtains the lock just after we unlock it? But we'll
>>> leave that as is for now, maybe there are good reasons.
>>>
>>> This is a preparation for adapting hugetlb page table locking logic to
>>> take the same locks as core-mm page table walkers would.
>>>
>>> Signed-off-by: David Hildenbrand <david@redhat.com>
>>> ---
>>>    include/linux/mm.h   |  7 ++++---
>>>    mm/khugepaged.c      | 21 +++++++++++++++------
>>>    mm/pgtable-generic.c |  4 ++--
>>>    3 files changed, 21 insertions(+), 11 deletions(-)
>>
>> Since pte_lockptr() no longer has a pmd parameter, it is best to modify
>> the comments above __pte_offset_map_lock() as well:
>>
>> ```
>> This helps the caller to avoid a later pte_lockptr(mm, *pmd), which
>> might by that time act on a changed *pmd ...
>> ```
> 
> Right, thanks a lot for the review!
> 
> The following on top;
> 
> 
>  From a46b16aa9bfa02ffb425d364d7f00129a8e803ad Mon Sep 17 00:00:00 2001
> From: David Hildenbrand <david@redhat.com>
> Date: Mon, 29 Jul 2024 10:43:34 +0200
> Subject: [PATCH] fixup: mm: let pte_lockptr() consume a pte_t pointer
> 
> Let's adjust the comment, passing a pte to pte_lockptr() and dropping
> a detail about changed *pmd, which no longer applies.
> 
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>   mm/pgtable-generic.c | 10 +++++-----
>   1 file changed, 5 insertions(+), 5 deletions(-)
> 
> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
> index 13a7705df3f87..f17465b43d344 100644
> --- a/mm/pgtable-generic.c
> +++ b/mm/pgtable-generic.c
> @@ -350,11 +350,11 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
>    * pte_offset_map_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map();
>    * but when successful, it also outputs a pointer to the spinlock in ptlp - as
>    * pte_offset_map_lock() does, but in this case without locking it.  This helps
> - * the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time
> - * act on a changed *pmd: pte_offset_map_nolock() provides the correct spinlock
> - * pointer for the page table that it returns.  In principle, the caller should
> - * recheck *pmd once the lock is taken; in practice, no callsite needs that -
> - * either the mmap_lock for write, or pte_same() check on contents, is enough.
> + * the caller to avoid a later pte_lockptr(mm, pte): pte_offset_map_nolock()
> + * provides the correct spinlock pointer for the page table that it returns.
> + * In principle, the caller should recheck *pmd once the lock is taken; in
> + * practice, no callsite needs that - either the mmap_lock for write, or
> + * pte_same() check on contents, is enough.
>    *
>    * Note that free_pgtables(), used after unmapping detached vmas, or when
>    * exiting the whole mm, does not take page table lock before freeing a page

LGTM. Thanks!
Peter Xu July 29, 2024, 4:26 p.m. UTC | #9
On Fri, Jul 26, 2024 at 11:48:01PM +0200, David Hildenbrand wrote:
> On 26.07.24 23:28, Peter Xu wrote:
> > On Fri, Jul 26, 2024 at 06:02:17PM +0200, David Hildenbrand wrote:
> > > On 26.07.24 17:36, Peter Xu wrote:
> > > > On Thu, Jul 25, 2024 at 08:39:54PM +0200, David Hildenbrand wrote:
> > > > > pte_lockptr() is the only *_lockptr() function that doesn't consume
> > > > > what would be expected: it consumes a pmd_t pointer instead of a pte_t
> > > > > pointer.
> > > > > 
> > > > > Let's change that. The two callers in pgtable-generic.c are easily
> > > > > adjusted. Adjust khugepaged.c:retract_page_tables() to simply do a
> > > > > pte_offset_map_nolock() to obtain the lock, even though we won't actually
> > > > > be traversing the page table.
> > > > > 
> > > > > This makes the code more similar to the other variants and avoids other
> > > > > hacks to make the new pte_lockptr() version happy. pte_lockptr() users
> > > > > reside now only in  pgtable-generic.c.
> > > > > 
> > > > > Maybe, using pte_offset_map_nolock() is the right thing to do because
> > > > > the PTE table could have been removed in the meantime? At least it sounds
> > > > > more future proof if we ever have other means of page table reclaim.
> > > > 
> > > > I think it can't change, because anyone who wants to race against this
> > > > should try to take the pmd lock first (which was held already)?
> > > 
> > > That doesn't explain why it is safe for us to assume that after we took the
> > > PMD lock that the PMD actually still points at a completely empty page
> > > table. Likely it currently works by accident, because we only have a single
> > > such user that makes this assumption. It might certainly be a different once
> > > we asynchronously reclaim page tables.
> > 
> > I think it's safe because find_pmd_or_thp_or_none() returned SUCCEED, and
> > we're holding i_mmap lock for read.  I don't see any way that this pmd can
> > become a non-pgtable-page.
> > 
> > I meant, AFAIU tearing down pgtable in whatever sane way will need to at
> > least take both mmap write lock and i_mmap write lock (in this case, a file
> > mapping), no?
> 
> Skimming over [1] where I still owe a review I think we can now do it now
> purely under the read locks, with the PMD lock held.

Err, how did I miss that... yeah, you're definitely right, and that's the
context here where we're collapsing.  I think I somehow forgot all of Hugh's
work when I replied there, sorry.

> 
> I think this is also what collapse_pte_mapped_thp() ends up doing: replace a
> PTE table that maps a folio by a PMD (present or none, depends) that maps a
> folio only while holding the mmap lock in read mode. Of course, here the
> table is not empty but we need similar ways of making PT walkers aware of
> concurrent page table retraction.
> 
> IIRC, that was the magic added to __pte_offset_map(), such that
> pte_offset_map_nolock/pte_offset_map_lock can fail on races.

That said, I still think the current code (before this patch) is safe, the
same as a hard-coded line taking the pte pgtable lock.  Again, I'm fine if you
prefer pte_offset_map_nolock(), but I just think the rcu read lock and stuff
can be avoided.

I think that's because such a collapse so far can only happen in a path
where we need to hold the large folio (PMD-level) lock first.  It means
anyone who could change this pmd entry into something that is not a pte
pgtable is blocked already, hence it must keep being a pte pgtable page
even if we don't take any rcu.

> 
> 
> But if we hold the PMD lock, nothing should actually change (so far my
> understanding) -- we cannot suddenly rip out a page table.
> 
> [1]
> https://lkml.kernel.org/r/cover.1719570849.git.zhengqi.arch@bytedance.com
> 
> > 
> > > 
> > > But yes, the PMD cannot get modified while we hold the PMD lock, otherwise
> > > we'd be in trouble
> > > 
> > > > 
> > > > I wonder an open coded "ptlock_ptr(page_ptdesc(pmd_page(*pmd)))" would be
> > > > nicer here, but only if my understanding is correct.
> > > 
> > > I really don't like open-coding that. Fortunately we were able to limit the
> > > use of ptlock_ptr to a single user outside of arch/x86/xen/mmu_pv.c so far.
> > 
> > I'm fine if you prefer like that; I don't see it a huge deal to me.
> 
> Let's keep it like that, unless we can come up with something neater. At
> least it makes the code also more consistent with similar code in that file
> and the overhead should be  minimal.
> 
> I was briefly thinking about actually testing if the PT is full of
> pte_none(), either as a debugging check or to also handle what is currently
> handled via:
> 
> if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) {
> 
> Seems wasteful just because some part of a VMA might have a private page
> mapped / uffd-wp active to let all other parts suffer.
> 
> Will think about if that is really worth it.
> 
> ... also because I still want to understand why the PTL of the PMD table is
> required at all. What if we lock it first and somebody else wants to lock it
> after us while we already ripped it out? Sure there must be some reason for
> the lock, I just don't understand it yet :/.

IIUC the pte pgtable lock will be needed for checking anon_vma safely.

E.g., consider the case where we don't take the pte pgtable lock: I think it's
possible some thread tries to inject a private pte (and prepares the anon_vma
before that) concurrently with this thread trying to collapse the pgtable into
a huge pmd.  I mean, without the pte pgtable lock held, I think it's
racy to check this line:

        if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) {
                ...
        }

On the 1st condition.

Thanks,
Peter Xu July 29, 2024, 4:39 p.m. UTC | #10
On Mon, Jul 29, 2024 at 12:26:00PM -0400, Peter Xu wrote:
> On Fri, Jul 26, 2024 at 11:48:01PM +0200, David Hildenbrand wrote:
> > On 26.07.24 23:28, Peter Xu wrote:
> > > On Fri, Jul 26, 2024 at 06:02:17PM +0200, David Hildenbrand wrote:
> > > > On 26.07.24 17:36, Peter Xu wrote:
> > > > > On Thu, Jul 25, 2024 at 08:39:54PM +0200, David Hildenbrand wrote:
> > > > > > pte_lockptr() is the only *_lockptr() function that doesn't consume
> > > > > > what would be expected: it consumes a pmd_t pointer instead of a pte_t
> > > > > > pointer.
> > > > > > 
> > > > > > Let's change that. The two callers in pgtable-generic.c are easily
> > > > > > adjusted. Adjust khugepaged.c:retract_page_tables() to simply do a
> > > > > > pte_offset_map_nolock() to obtain the lock, even though we won't actually
> > > > > > be traversing the page table.
> > > > > > 
> > > > > > This makes the code more similar to the other variants and avoids other
> > > > > > hacks to make the new pte_lockptr() version happy. pte_lockptr() users
> > > > > > reside now only in  pgtable-generic.c.
> > > > > > 
> > > > > > Maybe, using pte_offset_map_nolock() is the right thing to do because
> > > > > > the PTE table could have been removed in the meantime? At least it sounds
> > > > > > more future proof if we ever have other means of page table reclaim.
> > > > > 
> > > > > I think it can't change, because anyone who wants to race against this
> > > > > should try to take the pmd lock first (which was held already)?
> > > > 
> > > > That doesn't explain why it is safe for us to assume that after we took the
> > > > PMD lock that the PMD actually still points at a completely empty page
> > > > table. Likely it currently works by accident, because we only have a single
> > > > such user that makes this assumption. It might certainly be a different once
> > > > we asynchronously reclaim page tables.
> > > 
> > > I think it's safe because find_pmd_or_thp_or_none() returned SUCCEED, and
> > > we're holding i_mmap lock for read.  I don't see any way that this pmd can
> > > become a non-pgtable-page.
> > > 
> > > I meant, AFAIU tearing down pgtable in whatever sane way will need to at
> > > least take both mmap write lock and i_mmap write lock (in this case, a file
> > > mapping), no?
> > 
> > Skimming over [1] where I still owe a review I think we can now do it now
> > purely under the read locks, with the PMD lock held.
> 
> Err, how I missed that.. yeah you're definitely right, and that's the
> context here where we're collapsing.  I think I somehow forgot all Hugh's
> work when I replied there, sorry.
> 
> > 
> > I think this is also what collapse_pte_mapped_thp() ends up doing: replace a
> > PTE table that maps a folio by a PMD (present or none, depends) that maps a
> > folio only while holding the mmap lock in read mode. Of course, here the
> > table is not empty but we need similar ways of making PT walkers aware of
> > concurrent page table retraction.
> > 
> > IIRC, that was the magic added to __pte_offset_map(), such that
> > pte_offset_map_nolock/pte_offset_map_lock can fail on races.
> 
> Said that, I still think current code (before this patch) is safe, same to
> a hard-coded line to lock the pte pgtable lock.  Again, I'm fine if you
> prefer pte_offset_map_nolock() but I just think the rcu read lock and stuff
> can be avoided.
> 
> I think it's because such collapse so far can only happen in such path
> where we need to hold the large folio (PMD-level) lock first.  It means
> anyone who could change this pmd entry to be not a pte pgtable is blocked
> already, hence it must keeping being a pte pgtable page even if we don't
> take any rcu.
> 
> > 
> > 
> > But if we hold the PMD lock, nothing should actually change (so far my
> > understanding) -- we cannot suddenly rip out a page table.
> > 
> > [1]
> > https://lkml.kernel.org/r/cover.1719570849.git.zhengqi.arch@bytedance.com
> > 
> > > 
> > > > 
> > > > But yes, the PMD cannot get modified while we hold the PMD lock, otherwise
> > > > we'd be in trouble
> > > > 
> > > > > 
> > > > > I wonder an open coded "ptlock_ptr(page_ptdesc(pmd_page(*pmd)))" would be
> > > > > nicer here, but only if my understanding is correct.
> > > > 
> > > > I really don't like open-coding that. Fortunately we were able to limit the
> > > > use of ptlock_ptr to a single user outside of arch/x86/xen/mmu_pv.c so far.
> > > 
> > > I'm fine if you prefer like that; I don't see it a huge deal to me.
> > 
> > Let's keep it like that, unless we can come up with something neater. At
> > least it makes the code also more consistent with similar code in that file
> > and the overhead should be  minimal.
> > 
> > I was briefly thinking about actually testing if the PT is full of
> > pte_none(), either as a debugging check or to also handle what is currently
> > handled via:
> > 
> > if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) {
> > 
> > Seems wasteful just because some part of a VMA might have a private page
> > mapped / uffd-wp active to let all other parts suffer.
> > 
> > Will think about if that is really worth it.
> > 
> > ... also because I still want to understand why the PTL of the PMD table is
> > required at all. What if we lock it first and somebody else wants to lock it
> > after us while we already ripped it out? Sure there must be some reason for
> > the lock, I just don't understand it yet :/.
> 
> IIUC the pte pgtable lock will be needed for checking anon_vma safely.
> 
> e.g., consider if we don't take the pte pgtable lock, I think it's possible
> some thread tries to inject a private pte (and prepare anon_vma before
> that) concurrently with this thread trying to collapse the pgtable into a
> huge pmd.  I mean, when without the pte pgtable lock held, I think it's
> racy to check this line:
> 
>         if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) {
>                 ...
>         }
> 
> On the 1st condition.

Hmm, right after I replied, I think it also guarantees safety on the 2nd
condition..

Note that one reason I still prefer a hard-coded line over
pte_offset_map_nolock() is that the new code seems to say we can treat the
pte pgtable page differently from the pte pgtable lock.  But I think
they're really in the same realm.

In short, AFAIU the rcu lock not only protects the pte pgtable's existence,
but also protects the pte lock.

From that POV, consider the new code below in this patch:

-               ptl = pte_lockptr(mm, pmd);
+
+               /*
+                * No need to check the PTE table content, but we'll grab the
+                * PTE table lock while we zap it.
+                */
+               pte = pte_offset_map_nolock(mm, pmd, addr, &ptl);
+               if (!pte)
+                       goto unlock_pmd;
                if (ptl != pml)
                        spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+               pte_unmap(pte);

This could be very misleading: it seems to say that it's fine to use the
pte pgtable lock even after the rcu unlock.  It might make the code harder to
understand.

Thanks,
David Hildenbrand July 29, 2024, 5:46 p.m. UTC | #11
On 29.07.24 18:39, Peter Xu wrote:
> On Mon, Jul 29, 2024 at 12:26:00PM -0400, Peter Xu wrote:
>> On Fri, Jul 26, 2024 at 11:48:01PM +0200, David Hildenbrand wrote:
>>> On 26.07.24 23:28, Peter Xu wrote:
>>>> On Fri, Jul 26, 2024 at 06:02:17PM +0200, David Hildenbrand wrote:
>>>>> On 26.07.24 17:36, Peter Xu wrote:
>>>>>> On Thu, Jul 25, 2024 at 08:39:54PM +0200, David Hildenbrand wrote:
>>>>>>> pte_lockptr() is the only *_lockptr() function that doesn't consume
>>>>>>> what would be expected: it consumes a pmd_t pointer instead of a pte_t
>>>>>>> pointer.
>>>>>>>
>>>>>>> Let's change that. The two callers in pgtable-generic.c are easily
>>>>>>> adjusted. Adjust khugepaged.c:retract_page_tables() to simply do a
>>>>>>> pte_offset_map_nolock() to obtain the lock, even though we won't actually
>>>>>>> be traversing the page table.
>>>>>>>
>>>>>>> This makes the code more similar to the other variants and avoids other
>>>>>>> hacks to make the new pte_lockptr() version happy. pte_lockptr() users
>>>>>>> reside now only in  pgtable-generic.c.
>>>>>>>
>>>>>>> Maybe, using pte_offset_map_nolock() is the right thing to do because
>>>>>>> the PTE table could have been removed in the meantime? At least it sounds
>>>>>>> more future proof if we ever have other means of page table reclaim.
>>>>>>
>>>>>> I think it can't change, because anyone who wants to race against this
>>>>>> should try to take the pmd lock first (which was held already)?
>>>>>
>>>>> That doesn't explain why it is safe for us to assume that after we took the
>>>>> PMD lock that the PMD actually still points at a completely empty page
>>>>> table. Likely it currently works by accident, because we only have a single
>>>>> such user that makes this assumption. It might certainly be a different once
>>>>> we asynchronously reclaim page tables.
>>>>
>>>> I think it's safe because find_pmd_or_thp_or_none() returned SUCCEED, and
>>>> we're holding i_mmap lock for read.  I don't see any way that this pmd can
>>>> become a non-pgtable-page.
>>>>
>>>> I meant, AFAIU tearing down pgtable in whatever sane way will need to at
>>>> least take both mmap write lock and i_mmap write lock (in this case, a file
>>>> mapping), no?
>>>
>>> Skimming over [1] where I still owe a review I think we can now do it now
>>> purely under the read locks, with the PMD lock held.
>>
>> Err, how I missed that.. yeah you're definitely right, and that's the
>> context here where we're collapsing.  I think I somehow forgot all Hugh's
>> work when I replied there, sorry.
>>
>>>
>>> I think this is also what collapse_pte_mapped_thp() ends up doing: replace a
>>> PTE table that maps a folio by a PMD (present or none, depends) that maps a
>>> folio only while holding the mmap lock in read mode. Of course, here the
>>> table is not empty but we need similar ways of making PT walkers aware of
>>> concurrent page table retraction.
>>>
>>> IIRC, that was the magic added to __pte_offset_map(), such that
>>> pte_offset_map_nolock/pte_offset_map_lock can fail on races.
>>
>> Said that, I still think current code (before this patch) is safe, same to
>> a hard-coded line to lock the pte pgtable lock.  Again, I'm fine if you
>> prefer pte_offset_map_nolock() but I just think the rcu read lock and stuff
>> can be avoided.
>>
>> I think it's because such collapse so far can only happen in such path
>> where we need to hold the large folio (PMD-level) lock first.  It means
>> anyone who could change this pmd entry to be not a pte pgtable is blocked
>> already, hence it must keeping being a pte pgtable page even if we don't
>> take any rcu.
>>
>>>
>>>
>>> But if we hold the PMD lock, nothing should actually change (so far my
>>> understanding) -- we cannot suddenly rip out a page table.
>>>
>>> [1]
>>> https://lkml.kernel.org/r/cover.1719570849.git.zhengqi.arch@bytedance.com
>>>
>>>>
>>>>>
>>>>> But yes, the PMD cannot get modified while we hold the PMD lock, otherwise
>>>>> we'd be in trouble
>>>>>
>>>>>>
>>>>>> I wonder an open coded "ptlock_ptr(page_ptdesc(pmd_page(*pmd)))" would be
>>>>>> nicer here, but only if my understanding is correct.
>>>>>
>>>>> I really don't like open-coding that. Fortunately we were able to limit the
>>>>> use of ptlock_ptr to a single user outside of arch/x86/xen/mmu_pv.c so far.
>>>>
>>>> I'm fine if you prefer like that; I don't see it a huge deal to me.
>>>
>>> Let's keep it like that, unless we can come up with something neater. At
>>> least it makes the code also more consistent with similar code in that file
>>> and the overhead should be  minimal.
>>>
>>> I was briefly thinking about actually testing if the PT is full of
>>> pte_none(), either as a debugging check or to also handle what is currently
>>> handled via:
>>>
>>> if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) {
>>>
>>> Seems wasteful just because some part of a VMA might have a private page
>>> mapped / uffd-wp active to let all other parts suffer.
>>>
>>> Will think about if that is really worth it.
>>>
>>> ... also because I still want to understand why the PTL of the PMD table is
>>> required at all. What if we lock it first and somebody else wants to lock it
>>> after us while we already ripped it out? Sure there must be some reason for
>>> the lock, I just don't understand it yet :/.
>>
>> IIUC the pte pgtable lock will be needed for checking anon_vma safely.
>>
>> e.g., consider if we don't take the pte pgtable lock, I think it's possible
>> some thread tries to inject a private pte (and prepare anon_vma before
>> that) concurrently with this thread trying to collapse the pgtable into a
>> huge pmd.  I mean, when without the pte pgtable lock held, I think it's
>> racy to check this line:
>>
>>          if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) {
>>                  ...
>>          }
>>
>> On the 1st condition.
> 
> Hmm, right after I replied, I think it also guarantees safety on the 2nd
> condition..
> 
> Note that one thing I still prefer a hard-coded line over
> pte_offset_map_nolock() is that, the new code seems to say we can treat the
> pte pgtable page differently from the pte pgtable lock.  But I think
> they're really in the same realm.
> 
> In short, AFAIU the rcu lock not only protects the pte pgtable's existance,
> but also protects the pte lock.
> 
>  From that POV, below new code in this patch:
> 
> -               ptl = pte_lockptr(mm, pmd);
> +
> +               /*
> +                * No need to check the PTE table content, but we'll grab the
> +                * PTE table lock while we zap it.
> +                */
> +               pte = pte_offset_map_nolock(mm, pmd, addr, &ptl);
> +               if (!pte)
> +                       goto unlock_pmd;
>                  if (ptl != pml)
>                          spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
> +               pte_unmap(pte);
> 
> Could be very misleading, where it seems to say that it's fine to use the
> pte pgtable lock even after rcu unlock.  It might make the code harder to
> understand.

I see what you mean, but this is a very similar pattern to the one used in
collapse_pte_mapped_thp(), no? There we have

start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
...
if (!pml)
	spin_lock(ptl);
...
pte_unmap(start_pte);
if (!pml)
	spin_unlock(ptl);


Again, I don't have a strong opinion on this, but doing it more similarly
to collapse_pte_mapped_thp() when obtaining the locks makes it clearer to me.
But if I am missing something obvious, please shout and I'll change it.
David Hildenbrand July 30, 2024, 8:40 a.m. UTC | #12
>> ... also because I still want to understand why the PTL of the PMD table
>> is required at all. What if we lock it first and somebody else wants to
>> lock it after us while we already ripped it out? Sure there must be some
>> reason for the lock, I just don't understand it yet :/.
> 
> For pmd lock, I think this is needed to clear the pmd entry
> (pmdp_collapse_flush()). For pte lock, there should be the following two
> reasons:
> 

Thanks for the details.

Is my current understanding correct that removing *empty* page tables is
currently only possible if we can guarantee that nothing would try modifying the
page tables after we drop the PTE page table lock, but that we would be happy if
someone walked an empty page table while we remove it, as long as the access is
read-only?

In retract_page_tables() I thought that would be guaranteed as we prevent refaults
from the page cache and exclude any uffd + anon handling using the big hammer
(if any could be active, disallow zapping the page table).

What I am still not quite getting is what happens if someone would grab the PTE page
table lock after we released it -- and how that really protects us here.


I assume it's the

spin_lock(ptl);
if (likely(pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
...

handling in __pte_offset_map_lock() that guarantees that.


That indeed means that pte_offset_map_nolock() requires similar checks after
obtaining the PTL (for the cases where we are not holding the PMD table lock
and cannot be sure that we are the one ripping out that table right now).


> 1. release it after clearing pmd entry, then we can capture the changed
>      pmd in pte_offset_map_lock() etc after holding this pte lock.
>      (This is also what I did in my patchset)

Yes, I get it now.

> 
> 2. As mentioned in the comments, we may be concurrent with
>      userfaultfd_ioctl(), but we do not hold the read lock of mmap (or
>      read lock of vma), so the VM_UFFD_WP may be set. Therefore, we need
>      to hold the pte lock to check whether a new pte entry has been
>      inserted.
>      (See commit[1] for more details)


Yes, I see we tried to document that magic and it is still absolutely confusing :)

But at least now it's clearer to me why retract_page_tables() uses slightly different
locking than collapse_pte_mapped_thp().

Maybe we should look into letting collapse_pte_mapped_thp() use a similar approach to
retract_page_tables(), to at least make it more consistent. That topic is also touched
on in a98460494b16.
Qi Zheng July 30, 2024, 9:10 a.m. UTC | #13
Hi David,

On 2024/7/30 16:40, David Hildenbrand wrote:
>>> ... also because I still want to understand why the PTL of the PMD table
>>> is required at all. What if we lock it first and somebody else wants to
>>> lock it after us while we already ripped it out? Sure there must be some
>>> reason for the lock, I just don't understand it yet :/.
>>
>> For pmd lock, I think this is needed to clear the pmd entry
>> (pmdp_collapse_flush()). For pte lock, there should be the following two
>> reasons:
>>
> 
> Thanks for the details.
> 
> My current understanding correct that removing *empty* page tables is
> currently only possible if we can guarantee that nothing would try 
> modifying the
> page tables after we drop the PTE page table lock, but we would be happy 
> if someone
> would walk an empty page table while we remove it as long as the access 
> is read-only.
> 
> In retract_page_tables() I thought that would be guaranteed as we 
> prevent refaults
> from the page cache and exclude any uffd + anon handling using the big 
> hammer
> (if any could be active, disallow zapping the page table).
> 
> What I am still not quite getting is what happens if someone would grab 
> the PTE page
> table lock after we released it -- and how that really protects us here.
> 
> 
> I assume it's the
> 
> spin_lock(ptl);
> if (likely(pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
> ...
> 
> handling in __pte_offset_map_lock() that guarantees that.

Yes.

> 
> 
> That indeed means that pte_offset_map_nolock() requires similar checks 
> after
> obtaining the PTL (for the cases where we are not holding the PMD table 
> lock
> and can be sure that we are the one ripping out that table right now).

Yes, if the PTE page can be freed concurrently, then the pmd entry
needs to be rechecked after acquiring the PTL. So I made
pte_offset_map_nolock() return pmdval in my patch[1], and then we
can do the following:

start_pte = pte_offset_map_nolock(mm, pmd, &pmdval, addr, &ptl);
spin_lock(ptl)
if (likely(pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
...


[1]. 
https://lore.kernel.org/lkml/7f5233f9f612c7f58abf218852fb1042d764940b.1719570849.git.zhengqi.arch@bytedance.com/
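
To make it concrete, a caller could then look roughly like this (just a
simplified sketch of the recheck pattern, not code copied from the patch):

pmd_t pmdval;
spinlock_t *ptl;
pte_t *start_pte;

start_pte = pte_offset_map_nolock(mm, pmd, &pmdval, addr, &ptl);
if (!start_pte)
	return;
spin_lock(ptl);
if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
	/* The PTE table was freed or replaced under us, bail out. */
	pte_unmap_unlock(start_pte, ptl);
	return;
}
/* ... safe to walk the PTEs under ptl ... */
pte_unmap_unlock(start_pte, ptl);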

> 
> 
>> 1. release it after clearing pmd entry, then we can capture the changed
>>      pmd in pte_offset_map_lock() etc after holding this pte lock.
>>      (This is also what I did in my patchset)
> 
> Yes, I get it now.
> 
>>
>> 2. As mentioned in the comments, we may be concurrent with
>>      userfaultfd_ioctl(), but we do not hold the read lock of mmap (or
>>      read lock of vma), so the VM_UFFD_WP may be set. Therefore, we need
>>      to hold the pte lock to check whether a new pte entry has been
>>      inserted.
>>      (See commit[1] for more details)
> 
> 
> Yes, I see we tried to document that magic and it is still absolutely 
> confusing :)
> 
> But at least now it's clearer to me why retract_page_tables() uses 
> slightly different
> locking than collapse_pte_mapped_thp().
> 
> Maybe we should look into letting collapse_pte_mapped_thp() use a 
> similar approach as
> retract_page_tables() to at least make it more consistent. That topic is 
> also touched
> in a98460494b16.

Agree, more consistency is better.

Thanks,
Qi

>
Marek Szyprowski July 30, 2024, 3:30 p.m. UTC | #14
On 25.07.2024 20:39, David Hildenbrand wrote:
> pte_lockptr() is the only *_lockptr() function that doesn't consume
> what would be expected: it consumes a pmd_t pointer instead of a pte_t
> pointer.
>
> Let's change that. The two callers in pgtable-generic.c are easily
> adjusted. Adjust khugepaged.c:retract_page_tables() to simply do a
> pte_offset_map_nolock() to obtain the lock, even though we won't actually
> be traversing the page table.
>
> This makes the code more similar to the other variants and avoids other
> hacks to make the new pte_lockptr() version happy. pte_lockptr() users
> reside now only in  pgtable-generic.c.
>
> Maybe, using pte_offset_map_nolock() is the right thing to do because
> the PTE table could have been removed in the meantime? At least it sounds
> more future proof if we ever have other means of page table reclaim.
>
> It's not quite clear if holding the PTE table lock is really required:
> what if someone else obtains the lock just after we unlock it? But we'll
> leave that as is for now, maybe there are good reasons.
>
> This is a preparation for adapting hugetlb page table locking logic to
> take the same locks as core-mm page table walkers would.
>
> Signed-off-by: David Hildenbrand <david@redhat.com>

This patch landed in today's linux-next as commit e98970a1d2d4 ("mm: let 
pte_lockptr() consume a pte_t pointer"). Unfortunately it causes the 
following issue on most of my ARM 32bit based test boards:

Run /sbin/init as init process
8<--- cut here ---
Unable to handle kernel NULL pointer dereference at virtual address 
00000010 when read
[00000010] *pgd=00000000
Internal error: Oops: 5 [#1] PREEMPT SMP ARM
Modules linked in:
CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 
6.10.0-12930-ge98970a1d2d4 #15137
Hardware name: Samsung Exynos (Flattened Device Tree)
PC is at __lock_acquire+0x20/0x1604
LR is at lock_acquire+0x21c/0x394
pc : [<c01990e4>]    lr : [<c019b2e8>]    psr: 20000093
sp : f082dc80  ip : 00000001  fp : c1d20000
r10: 00000000  r9 : c13096c8  r8 : 00000001
r7 : 00000000  r6 : c1d20000  r5 : c1278594  r4 : c1278594
r3 : 00000000  r2 : 00000000  r1 : 00000000  r0 : 00000010
Flags: nzCv  IRQs off  FIQs on  Mode SVC_32  ISA ARM  Segment none
Control: 10c5387d  Table: 4000404a  DAC: 00000051
...
Process swapper/0 (pid: 1, stack limit = 0x(ptrval))
Stack: (0xf082dc80 to 0xf082e000)
...
Call trace:
  __lock_acquire from lock_acquire+0x21c/0x394
  lock_acquire from _raw_spin_lock+0x3c/0x4c
  _raw_spin_lock from __pte_offset_map_lock+0x64/0x118
  __pte_offset_map_lock from handle_mm_fault+0xee0/0x13c8
  handle_mm_fault from __get_user_pages+0x18c/0x848
  __get_user_pages from get_user_pages_remote+0xdc/0x474
  get_user_pages_remote from get_arg_page+0x90/0x2a4
  get_arg_page from copy_string_kernel+0x14c/0x184
  copy_string_kernel from kernel_execve+0x94/0x174
  kernel_execve from try_to_run_init_process+0xc/0x3c
  try_to_run_init_process from kernel_init+0xcc/0x12c
  kernel_init from ret_from_fork+0x14/0x28
Exception stack(0xf082dfb0 to 0xf082dff8)
...
---[ end trace 0000000000000000 ]---
note: swapper/0[1] exited with irqs disabled
note: swapper/0[1] exited with preempt_count 1
Kernel panic - not syncing: Attempted to kill init! exitcode=0x0000000b
---[ end Kernel panic - not syncing: Attempted to kill init! 
exitcode=0x0000000b ]---

Reverting $subject on top of next-20240730 fixes/hides this issue.


> ---
>   include/linux/mm.h   |  7 ++++---
>   mm/khugepaged.c      | 21 +++++++++++++++------
>   mm/pgtable-generic.c |  4 ++--
>   3 files changed, 21 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 2c6ccf088c7be..0472a5090b180 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2873,9 +2873,10 @@ static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
>   }
>   #endif /* ALLOC_SPLIT_PTLOCKS */
>   
> -static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
> +static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pte_t *pte)
>   {
> -	return ptlock_ptr(page_ptdesc(pmd_page(*pmd)));
> +	/* PTE page tables don't currently exceed a single page. */
> +	return ptlock_ptr(virt_to_ptdesc(pte));
>   }
>   
>   static inline bool ptlock_init(struct ptdesc *ptdesc)
> @@ -2898,7 +2899,7 @@ static inline bool ptlock_init(struct ptdesc *ptdesc)
>   /*
>    * We use mm->page_table_lock to guard all pagetable pages of the mm.
>    */
> -static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
> +static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pte_t *pte)
>   {
>   	return &mm->page_table_lock;
>   }
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index cdd1d8655a76b..f3b3db1046155 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1697,12 +1697,13 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>   	i_mmap_lock_read(mapping);
>   	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
>   		struct mmu_notifier_range range;
> +		bool retracted = false;
>   		struct mm_struct *mm;
>   		unsigned long addr;
>   		pmd_t *pmd, pgt_pmd;
>   		spinlock_t *pml;
>   		spinlock_t *ptl;
> -		bool skipped_uffd = false;
> +		pte_t *pte;
>   
>   		/*
>   		 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
> @@ -1739,9 +1740,17 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>   		mmu_notifier_invalidate_range_start(&range);
>   
>   		pml = pmd_lock(mm, pmd);
> -		ptl = pte_lockptr(mm, pmd);
> +
> +		/*
> +		 * No need to check the PTE table content, but we'll grab the
> +		 * PTE table lock while we zap it.
> +		 */
> +		pte = pte_offset_map_nolock(mm, pmd, addr, &ptl);
> +		if (!pte)
> +			goto unlock_pmd;
>   		if (ptl != pml)
>   			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
> +		pte_unmap(pte);
>   
>   		/*
>   		 * Huge page lock is still held, so normally the page table
> @@ -1752,20 +1761,20 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
>   		 * repeating the anon_vma check protects from one category,
>   		 * and repeating the userfaultfd_wp() check from another.
>   		 */
> -		if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) {
> -			skipped_uffd = true;
> -		} else {
> +		if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) {
>   			pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
>   			pmdp_get_lockless_sync();
> +			retracted = true;
>   		}
>   
>   		if (ptl != pml)
>   			spin_unlock(ptl);
> +unlock_pmd:
>   		spin_unlock(pml);
>   
>   		mmu_notifier_invalidate_range_end(&range);
>   
> -		if (!skipped_uffd) {
> +		if (retracted) {
>   			mm_dec_nr_ptes(mm);
>   			page_table_check_pte_clear_range(mm, addr, pgt_pmd);
>   			pte_free_defer(mm, pmd_pgtable(pgt_pmd));
> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
> index a78a4adf711ac..13a7705df3f87 100644
> --- a/mm/pgtable-generic.c
> +++ b/mm/pgtable-generic.c
> @@ -313,7 +313,7 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
>   
>   	pte = __pte_offset_map(pmd, addr, &pmdval);
>   	if (likely(pte))
> -		*ptlp = pte_lockptr(mm, &pmdval);
> +		*ptlp = pte_lockptr(mm, pte);
>   	return pte;
>   }
>   
> @@ -371,7 +371,7 @@ pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
>   	pte = __pte_offset_map(pmd, addr, &pmdval);
>   	if (unlikely(!pte))
>   		return pte;
> -	ptl = pte_lockptr(mm, &pmdval);
> +	ptl = pte_lockptr(mm, pte);
>   	spin_lock(ptl);
>   	if (likely(pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
>   		*ptlp = ptl;

Best regards
David Hildenbrand July 30, 2024, 3:45 p.m. UTC | #15
On 30.07.24 17:30, Marek Szyprowski wrote:
> On 25.07.2024 20:39, David Hildenbrand wrote:
>> pte_lockptr() is the only *_lockptr() function that doesn't consume
>> what would be expected: it consumes a pmd_t pointer instead of a pte_t
>> pointer.
>>
>> Let's change that. The two callers in pgtable-generic.c are easily
>> adjusted. Adjust khugepaged.c:retract_page_tables() to simply do a
>> pte_offset_map_nolock() to obtain the lock, even though we won't actually
>> be traversing the page table.
>>
>> This makes the code more similar to the other variants and avoids other
>> hacks to make the new pte_lockptr() version happy. pte_lockptr() users
>> reside now only in  pgtable-generic.c.
>>
>> Maybe, using pte_offset_map_nolock() is the right thing to do because
>> the PTE table could have been removed in the meantime? At least it sounds
>> more future proof if we ever have other means of page table reclaim.
>>
>> It's not quite clear if holding the PTE table lock is really required:
>> what if someone else obtains the lock just after we unlock it? But we'll
>> leave that as is for now, maybe there are good reasons.
>>
>> This is a preparation for adapting hugetlb page table locking logic to
>> take the same locks as core-mm page table walkers would.
>>
>> Signed-off-by: David Hildenbrand <david@redhat.com>
> 
> This patch landed in today's linux-next as commit e98970a1d2d4 ("mm: let
> pte_lockptr() consume a pte_t pointer"). Unfortunately it causes the
> following issue on most of my ARM 32bit based test boards:
> 

That is ... rather surprising.

The issue below seems to point at __pte_offset_map_lock(), where we 
essentially convert from

ptlock_ptr(page_ptdesc(pmd_page(*pmd)));

to

ptlock_ptr(virt_to_ptdesc(pte));


So we would get a NULL ptr from the ptdesc. Either the lock would 
actually not be allocated or virt_to_ptdesc() does something unexpected.
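
For reference, with split PTE ptlocks the lookup boils down to the following
(quoting include/linux/mm.h from memory; the lockdep angle is my guess --
with lockdep enabled spinlock_t grows, ALLOC_SPLIT_PTLOCKS kicks in, and the
ptlock becomes a separately allocated pointer):

#if ALLOC_SPLIT_PTLOCKS
static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
{
	return ptdesc->ptl;	/* garbage pointer if ptdesc itself is bogus */
}
#else
static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
{
	return &ptdesc->ptl;
}
#endif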

Leaf page tables on arm are also a single page, so we cannot possibly be 
staring at the wrong page-table-subpage.

Do we maybe have to special-case init-mm? But I don't see how that 
special-casing would have happened for now :/

What's your kernel config value of SPLIT_PTLOCK_CPUS?
David Hildenbrand July 30, 2024, 3:49 p.m. UTC | #16
On 30.07.24 17:45, David Hildenbrand wrote:
> On 30.07.24 17:30, Marek Szyprowski wrote:
>> On 25.07.2024 20:39, David Hildenbrand wrote:
>>> pte_lockptr() is the only *_lockptr() function that doesn't consume
>>> what would be expected: it consumes a pmd_t pointer instead of a pte_t
>>> pointer.
>>>
>>> Let's change that. The two callers in pgtable-generic.c are easily
>>> adjusted. Adjust khugepaged.c:retract_page_tables() to simply do a
>>> pte_offset_map_nolock() to obtain the lock, even though we won't actually
>>> be traversing the page table.
>>>
>>> This makes the code more similar to the other variants and avoids other
>>> hacks to make the new pte_lockptr() version happy. pte_lockptr() users
>>> reside now only in  pgtable-generic.c.
>>>
>>> Maybe, using pte_offset_map_nolock() is the right thing to do because
>>> the PTE table could have been removed in the meantime? At least it sounds
>>> more future proof if we ever have other means of page table reclaim.
>>>
>>> It's not quite clear if holding the PTE table lock is really required:
>>> what if someone else obtains the lock just after we unlock it? But we'll
>>> leave that as is for now, maybe there are good reasons.
>>>
>>> This is a preparation for adapting hugetlb page table locking logic to
>>> take the same locks as core-mm page table walkers would.
>>>
>>> Signed-off-by: David Hildenbrand <david@redhat.com>
>>
>> This patch landed in today's linux-next as commit e98970a1d2d4 ("mm: let
>> pte_lockptr() consume a pte_t pointer"). Unfortunately it causes the
>> following issue on most of my ARM 32bit based test boards:
>>
> 
> That is ... rather surprising.
> 
> The issue below seems to point at __pte_offset_map_lock(), where we
> essentially convert from
> 
> ptlock_ptr(page_ptdesc(pmd_page(*pmd)));
> 
> to
> 
> ptlock_ptr(virt_to_ptdesc(pte));

I'm wondering: is highmem involved here, such that the PTE would be 
kmap'ed and virt_to_page() would not do what we expect it to do?
Marek Szyprowski July 30, 2024, 4:08 p.m. UTC | #17
On 30.07.2024 17:49, David Hildenbrand wrote:
> On 30.07.24 17:45, David Hildenbrand wrote:
>> On 30.07.24 17:30, Marek Szyprowski wrote:
>>> On 25.07.2024 20:39, David Hildenbrand wrote:
>>>> pte_lockptr() is the only *_lockptr() function that doesn't consume
>>>> what would be expected: it consumes a pmd_t pointer instead of a pte_t
>>>> pointer.
>>>>
>>>> Let's change that. The two callers in pgtable-generic.c are easily
>>>> adjusted. Adjust khugepaged.c:retract_page_tables() to simply do a
>>>> pte_offset_map_nolock() to obtain the lock, even though we won't 
>>>> actually
>>>> be traversing the page table.
>>>>
>>>> This makes the code more similar to the other variants and avoids 
>>>> other
>>>> hacks to make the new pte_lockptr() version happy. pte_lockptr() users
>>>> reside now only in  pgtable-generic.c.
>>>>
>>>> Maybe, using pte_offset_map_nolock() is the right thing to do because
>>>> the PTE table could have been removed in the meantime? At least it 
>>>> sounds
>>>> more future proof if we ever have other means of page table reclaim.
>>>>
>>>> It's not quite clear if holding the PTE table lock is really required:
>>>> what if someone else obtains the lock just after we unlock it? But 
>>>> we'll
>>>> leave that as is for now, maybe there are good reasons.
>>>>
>>>> This is a preparation for adapting hugetlb page table locking logic to
>>>> take the same locks as core-mm page table walkers would.
>>>>
>>>> Signed-off-by: David Hildenbrand <david@redhat.com>
>>>
>>> This patch landed in today's linux-next as commit e98970a1d2d4 ("mm: 
>>> let
>>> pte_lockptr() consume a pte_t pointer"). Unfortunately it causes the
>>> following issue on most of my ARM 32bit based test boards:
>>>
>>
>> That is ... rather surprising.
>>
>> The issue below seems to point at __pte_offset_map_lock(), where we
>> essentially convert from
>>
>> ptlock_ptr(page_ptdesc(pmd_page(*pmd)));
>>
>> to
>>
>> ptlock_ptr(virt_to_ptdesc(pte));
>
> I'm wondering, is highmem involved here such that the PTE would be 
> kmap'ed and virt_to_page() would not do what we would expect it to do?

Yes, highmem is enabled on those boards and all of them have 1GB+ of 
RAM. For other kernel configuration options see 
arch/arm/configs/exynos_defconfig.


Best regards
David Hildenbrand July 30, 2024, 4:10 p.m. UTC | #18
On 30.07.24 18:08, Marek Szyprowski wrote:
> On 30.07.2024 17:49, David Hildenbrand wrote:
>> On 30.07.24 17:45, David Hildenbrand wrote:
>>> On 30.07.24 17:30, Marek Szyprowski wrote:
>>>> On 25.07.2024 20:39, David Hildenbrand wrote:
>>>>> pte_lockptr() is the only *_lockptr() function that doesn't consume
>>>>> what would be expected: it consumes a pmd_t pointer instead of a pte_t
>>>>> pointer.
>>>>>
>>>>> Let's change that. The two callers in pgtable-generic.c are easily
>>>>> adjusted. Adjust khugepaged.c:retract_page_tables() to simply do a
>>>>> pte_offset_map_nolock() to obtain the lock, even though we won't
>>>>> actually
>>>>> be traversing the page table.
>>>>>
>>>>> This makes the code more similar to the other variants and avoids
>>>>> other
>>>>> hacks to make the new pte_lockptr() version happy. pte_lockptr() users
>>>>> reside now only in  pgtable-generic.c.
>>>>>
>>>>> Maybe, using pte_offset_map_nolock() is the right thing to do because
>>>>> the PTE table could have been removed in the meantime? At least it
>>>>> sounds
>>>>> more future proof if we ever have other means of page table reclaim.
>>>>>
>>>>> It's not quite clear if holding the PTE table lock is really required:
>>>>> what if someone else obtains the lock just after we unlock it? But
>>>>> we'll
>>>>> leave that as is for now, maybe there are good reasons.
>>>>>
>>>>> This is a preparation for adapting hugetlb page table locking logic to
>>>>> take the same locks as core-mm page table walkers would.
>>>>>
>>>>> Signed-off-by: David Hildenbrand <david@redhat.com>
>>>>
>>>> This patch landed in today's linux-next as commit e98970a1d2d4 ("mm:
>>>> let
>>>> pte_lockptr() consume a pte_t pointer"). Unfortunately it causes the
>>>> following issue on most of my ARM 32bit based test boards:
>>>>
>>>
>>> That is ... rather surprising.
>>>
>>> The issue below seems to point at __pte_offset_map_lock(), where we
>>> essentially convert from
>>>
>>> ptlock_ptr(page_ptdesc(pmd_page(*pmd)));
>>>
>>> to
>>>
>>> ptlock_ptr(virt_to_ptdesc(pte));
>>
>> I'm wondering, is highmem involved here such that the PTE would be
>> kmap'ed and virt_to_page() would not do what we would expect it to do?
> 
> Yes, highmem is enabled on those boards and all of them have 1GB+ of
> RAM. For other kernel configuration options see
> arch/arm/configs/exynos_defconfig.

Yes, pretty sure that's it. virt_to_page() won't work on kmap'ed pages.
So it looks like we cannot easily do the conversion in this patch. :(

We'll have to get hacky in patch #2 instead.
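
Roughly what goes wrong, as I understand it (simplified, and the
CONFIG_HIGHPTE details are from memory): the mapped PTE pointer is a
temporary kmap_local address rather than a linear-map address, so
virt_to_page()/virt_to_ptdesc() on it computes a bogus ptdesc:

pte_t *pte = pte_offset_map(pmd, addr);	/* kmap_local_page() under CONFIG_HIGHPTE */
spinlock_t *ptl = ptlock_ptr(virt_to_ptdesc(pte));	/* bogus: pte is not a lowmem address */
...
pte_unmap(pte);	/* kunmap_local() */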

@Andrew, can you drop both patches for now? I'll send a v2 that
essentially does something like the following on top:
  

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index da800e56fe590..c2e330b1eee21 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -963,7 +963,13 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
          * will give us the same result: the per-MM PT lock.
          */
         if (huge_page_size(h) < PMD_SIZE)
-               return pte_lockptr(mm, pte);
+               /*
+                * pte_lockptr() needs the PMD, which we don't have. Because of
+                * highmem we cannot convert pte_lockptr() to consume a pte.
+                * But as we never have highmem page tables in hugetlb, we can
+                * safely use virt_to_ptdesc() here.
+                */
+               return ptlock_ptr(virt_to_ptdesc(pte));
         else if (huge_page_size(h) < PUD_SIZE)
                 return pmd_lockptr(mm, (pmd_t *) pte);
         return pud_lockptr(mm, (pud_t *) pte);
Peter Xu July 30, 2024, 6:44 p.m. UTC | #19
On Mon, Jul 29, 2024 at 07:46:26PM +0200, David Hildenbrand wrote:
> I see what you mean but this is a very similar pattern as used in
> collapse_pte_mapped_thp(), no? There we have
> 
> start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
> ...
> if (!pml)
> 	spin_lock(ptl);
> ...
> pte_unmap(start_pte);
> if (!pml)
> 	spin_unlock(ptl);
> 
> 
> Again, I don't have a strong opinion on this, but doing it more similar to
> collapse_pte_mapped_thp() to obtain locks makes it clearer to me. But if I
> am missing something obvious please shout and I'll change it.

Right.. I don't think that path can change the pte pgtable either, and
there is even the line Hugh left showing it's impossible:

	if (!start_pte)		/* mmap_lock + page lock should prevent this */
		goto abort;

I was thinking maybe the page lock is the critical one, irrespective of the
mmap lock.

No strong opinion either.  Not sure whether Hugh has some thoughts.  But
maybe, if we stick with pte_offset_map_nolock() and there'll be a repost
anyway, we could add a comment similar to this one showing that the pte
pgtable is actually as stable as the ptlock.

Thanks,
David Hildenbrand July 30, 2024, 7:49 p.m. UTC | #20
On 30.07.24 20:44, Peter Xu wrote:
> On Mon, Jul 29, 2024 at 07:46:26PM +0200, David Hildenbrand wrote:
>> I see what you mean but this is a very similar pattern as used in
>> collapse_pte_mapped_thp(), no? There we have
>>
>> start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
>> ...
>> if (!pml)
>> 	spin_lock(ptl);
>> ...
>> pte_unmap(start_pte);
>> if (!pml)
>> 	spin_unlock(ptl);
>>
>>
>> Again, I don't have a strong opinion on this, but doing it more similar to
>> collapse_pte_mapped_thp() to obtain locks makes it clearer to me. But if I
>> am missing something obvious please shout and I'll change it.
> 
> Right.. I don't think that path can change the pte pgtable either, and
> there is even the line Hugh left showing it's impossible:
> 
> 	if (!start_pte)		/* mmap_lock + page lock should prevent this */
> 		goto abort;
> 
> I was thinking maybe the page lock is the critical one, irrelevant of mmap
> lock.
> 
> No strong opinion either.  Not sure whether Hugh has some thoughts.  But
> maybe if we stick with pte_offset_map_nolock() and if there'll be a repost
> anyway, we could add a similar comment like this one showing that the pte
> pgtable should be actually as stable as the ptlock.

I think this all deserves some future cleanups :)

... for the time being I'll have to drop this patch completely. I should 
have known that virt_to_page() does not work on kmap'ed pages, but part 
of me didn't want to believe it.

CONFIG_HIGHPTE wants to make my life challenging :)

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2c6ccf088c7be..0472a5090b180 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2873,9 +2873,10 @@  static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
 }
 #endif /* ALLOC_SPLIT_PTLOCKS */
 
-static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
+static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pte_t *pte)
 {
-	return ptlock_ptr(page_ptdesc(pmd_page(*pmd)));
+	/* PTE page tables don't currently exceed a single page. */
+	return ptlock_ptr(virt_to_ptdesc(pte));
 }
 
 static inline bool ptlock_init(struct ptdesc *ptdesc)
@@ -2898,7 +2899,7 @@  static inline bool ptlock_init(struct ptdesc *ptdesc)
 /*
  * We use mm->page_table_lock to guard all pagetable pages of the mm.
  */
-static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
+static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pte_t *pte)
 {
 	return &mm->page_table_lock;
 }
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index cdd1d8655a76b..f3b3db1046155 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1697,12 +1697,13 @@  static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 	i_mmap_lock_read(mapping);
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 		struct mmu_notifier_range range;
+		bool retracted = false;
 		struct mm_struct *mm;
 		unsigned long addr;
 		pmd_t *pmd, pgt_pmd;
 		spinlock_t *pml;
 		spinlock_t *ptl;
-		bool skipped_uffd = false;
+		pte_t *pte;
 
 		/*
 		 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
@@ -1739,9 +1740,17 @@  static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 		mmu_notifier_invalidate_range_start(&range);
 
 		pml = pmd_lock(mm, pmd);
-		ptl = pte_lockptr(mm, pmd);
+
+		/*
+		 * No need to check the PTE table content, but we'll grab the
+		 * PTE table lock while we zap it.
+		 */
+		pte = pte_offset_map_nolock(mm, pmd, addr, &ptl);
+		if (!pte)
+			goto unlock_pmd;
 		if (ptl != pml)
 			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+		pte_unmap(pte);
 
 		/*
 		 * Huge page lock is still held, so normally the page table
@@ -1752,20 +1761,20 @@  static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 		 * repeating the anon_vma check protects from one category,
 		 * and repeating the userfaultfd_wp() check from another.
 		 */
-		if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) {
-			skipped_uffd = true;
-		} else {
+		if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) {
 			pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
 			pmdp_get_lockless_sync();
+			retracted = true;
 		}
 
 		if (ptl != pml)
 			spin_unlock(ptl);
+unlock_pmd:
 		spin_unlock(pml);
 
 		mmu_notifier_invalidate_range_end(&range);
 
-		if (!skipped_uffd) {
+		if (retracted) {
 			mm_dec_nr_ptes(mm);
 			page_table_check_pte_clear_range(mm, addr, pgt_pmd);
 			pte_free_defer(mm, pmd_pgtable(pgt_pmd));
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index a78a4adf711ac..13a7705df3f87 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -313,7 +313,7 @@  pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
 
 	pte = __pte_offset_map(pmd, addr, &pmdval);
 	if (likely(pte))
-		*ptlp = pte_lockptr(mm, &pmdval);
+		*ptlp = pte_lockptr(mm, pte);
 	return pte;
 }
 
@@ -371,7 +371,7 @@  pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
 	pte = __pte_offset_map(pmd, addr, &pmdval);
 	if (unlikely(!pte))
 		return pte;
-	ptl = pte_lockptr(mm, &pmdval);
+	ptl = pte_lockptr(mm, pte);
 	spin_lock(ptl);
 	if (likely(pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
 		*ptlp = ptl;