
[v6,updated,9/11] mm/mremap: Fix race between mremap and pageout

Message ID 20210524133818.84955-1-aneesh.kumar@linux.ibm.com (mailing list archive)
State New, archived

Commit Message

Aneesh Kumar K.V May 24, 2021, 1:38 p.m. UTC
CPU 1				CPU 2					CPU 3

mremap(old_addr, new_addr)      page_shrinker/try_to_unmap_one

				addr = old_addr
				lock(pte_ptl)
lock(pmd_ptl)
pmd = *old_pmd
pmd_clear(old_pmd)
flush_tlb_range(old_addr)

*new_pmd = pmd
									*new_addr = 10; and fills
									TLB with new addr
									and old pfn

unlock(pmd_ptl)
				ptep_get_and_clear()
				flush_tlb_range(old_addr)

				old pfn is free.
									Stale TLB entry

Avoid the above race with MOVE_PMD by holding the pte ptl in mremap and waiting
for a parallel page table walk to finish operating on the pte before updating new_pmd.

With MOVE_PUD, enable the optimization only if USE_SPLIT_PTE_PTLOCKS is disabled.
In that case both the pte ptl and the pud ptl point to mm->page_table_lock.
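
In code terms, the locking this change adds to move_normal_pmd() boils down to
the following simplified sketch (see the full diff below; early exits and the
unlock path are trimmed):

	old_ptl = pmd_lock(mm, old_pmd);
	new_ptl = pmd_lockptr(mm, new_pmd);
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);

	if (pmd_none(*old_pmd))
		goto unlock_out;

	/* must be read while *old_pmd is still populated */
	pte_ptl = pte_lockptr(mm, old_pmd);

	pmd = *old_pmd;
	pmd_clear(old_pmd);
	flush_pte_tlb_pwc_range(vma, old_addr, old_addr + PMD_SIZE);

	/* wait for a parallel walker (e.g. try_to_unmap_one) holding the pte ptl */
	if (USE_SPLIT_PTE_PTLOCKS)
		spin_lock(pte_ptl);
	pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
	if (USE_SPLIT_PTE_PTLOCKS)
		spin_unlock(pte_ptl);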

Fixes: c49dd3401802 ("mm: speedup mremap on 1GB or larger regions")
Fixes: 2c91bd4a4e2e ("mm: speed up mremap by 20x on large regions")
Link: https://lore.kernel.org/linux-mm/CAHk-=wgXVR04eBNtxQfevontWnP6FDm+oj5vauQXP3S-huwbPw@mail.gmail.com
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
Changes:
* Check for split PTL before taking the pte ptl.

 mm/mremap.c | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

Comments

Linus Torvalds May 24, 2021, 5:16 p.m. UTC | #1
On Mon, May 24, 2021 at 3:38 AM Aneesh Kumar K.V
<aneesh.kumar@linux.ibm.com> wrote:
>
> Avoid the above race with MOVE_PMD by holding the pte ptl in mremap and waiting
> for a parallel page table walk to finish operating on the pte before updating new_pmd.

Ack on the concept.

However, not so much on the patch.

Odd whitespace change:

> @@ -254,6 +254,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
>         if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
>                 return false;
>
> +
>         /*
>          * We don't have to worry about the ordering of src and dst
>          * ptlocks because exclusive mmap_lock prevents deadlock.

And new optimization for empty pmd, which seems unrelated to the
change and should presumably be separate:

> @@ -263,6 +264,10 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
>         if (new_ptl != old_ptl)
>                 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
>
> +       if (pmd_none(*old_pmd))
> +               goto unlock_out;
> +
> +       pte_ptl = pte_lockptr(mm, old_pmd);
>         /* Clear the pmd */
>         pmd = *old_pmd;
>         pmd_clear(old_pmd);

And also, why does the above assign 'pte_ptl' without using it, when
the actual use is ten lines further down?

So I think this patch needs some cleanup.

              Linus
Aneesh Kumar K.V May 25, 2021, 8:44 a.m. UTC | #2
Linus Torvalds <torvalds@linux-foundation.org> writes:

> On Mon, May 24, 2021 at 3:38 AM Aneesh Kumar K.V
> <aneesh.kumar@linux.ibm.com> wrote:
>>
>> Avoid the above race with MOVE_PMD by holding the pte ptl in mremap and waiting
>> for a parallel page table walk to finish operating on the pte before updating new_pmd.
>
> Ack on the concept.


Should we worry about the below race? The window would be small.

CPU 1				CPU 2					CPU 3

mremap(old_addr, new_addr)      page_shrinker/try_to_unmap_one

mmap_write_lock_killable()

				addr = old_addr

lock(pmd_ptl)
pmd = *old_pmd
pmd_clear(old_pmd)
flush_tlb_range(old_addr)

lock(pte_ptl)
*new_pmd = pmd
unlock(pte_ptl)

unlock(pmd_ptl)
				lock(pte_ptl)
									*new_addr = 10; and fills
									TLB with new addr
									and old pfn

				ptep_clear_flush(old_addr)
				old pfn is free.
									Stale TLB entry



>
> However, not so much on the patch.
>
> Odd whitespace change:
>
>> @@ -254,6 +254,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
>>         if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
>>                 return false;
>>
>> +
>>         /*
>>          * We don't have to worry about the ordering of src and dst
>>          * ptlocks because exclusive mmap_lock prevents deadlock.
>
> And new optimization for empty pmd, which seems unrelated to the
> change and should presumably be separate:

That was added so that we can safely do pte_lockptr() below.

>
>> @@ -263,6 +264,10 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
>>         if (new_ptl != old_ptl)
>>                 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
>>
>> +       if (pmd_none(*old_pmd))
>> +               goto unlock_out;
>> +
>> +       pte_ptl = pte_lockptr(mm, old_pmd);
>>         /* Clear the pmd */
>>         pmd = *old_pmd;
>>         pmd_clear(old_pmd);
>
> And also, why does the above assign 'pte_ptl' without using it, when
> the actual use is ten lines further down?

So that we fetch the pte_ptl before we clear old_pmd.
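
For context, with split PTE page table locks pte_lockptr() dereferences *pmd to
find the page that carries the lock, so it has to be called while the pmd is
still populated. Roughly (from include/linux/mm.h, modulo config details):

#if USE_SPLIT_PTE_PTLOCKS
static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
	return ptlock_ptr(pmd_page(*pmd));	/* dereferences *pmd */
}
#else
static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
	return &mm->page_table_lock;
}
#endif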


-aneesh
Linus Torvalds May 25, 2021, 5:22 p.m. UTC | #3
On Mon, May 24, 2021 at 10:44 PM Aneesh Kumar K.V
<aneesh.kumar@linux.ibm.com> wrote:
>
> Should we worry about the below race? The window would be small.
>
> CPU 1                           CPU 2                                   CPU 3
>
> mremap(old_addr, new_addr)      page_shrinker/try_to_unmap_one
>
> mmap_write_lock_killable()
>
>                                 addr = old_addr
>
> lock(pmd_ptl)
> pmd = *old_pmd
> pmd_clear(old_pmd)
> flush_tlb_range(old_addr)
>
> lock(pte_ptl)
> *new_pmd = pmd
> unlock(pte_ptl)
>
> unlock(pmd_ptl)
>                                 lock(pte_ptl)
>                                                                         *new_addr = 10; and fills
>                                                                         TLB with new addr
>                                                                         and old pfn
>
>                                 ptep_clear_flush(old_addr)
>                                 old pfn is free.
>                                                                         Stale TLB entry

Hmm. Do you need a third CPU there? What is done above on CPU3 looks
like it might just be CPU1 accessing the new range immediately.

Which doesn't actually sound at all unlikely - so maybe the window is
small, but it sounds like something that could happen.

This looks nasty. The page shrinker has always been problematic
because it basically avoids the normal full set of locks.

I wonder if we could just make the page shrinker try-lock the mmap_sem
and avoid all this that way. It _is_ allowed to fail, after all, and
the page shrinker is "not normal" and should be less of a performance
issue than all the actual normal VM paths.

Does anybody have any good ideas?

> > And new optimization for empty pmd, which seems unrelated to the
> > change and should presumably be separate:
>
> That was added so that we can safely do pte_lockptr() below

Oh, because pte_lockptr() doesn't actually use the "old_pmd" pointer
value - it actually *dereferences* the pointer.

That looks like a mis-design. Why does it do that? Why don't we pass
it the pmd value, if that's what it wants?

               Linus
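
A value-taking variant along the lines suggested above might look roughly like
the sketch below; the pte_lockptr_val() name is hypothetical and not part of
this series:

#if USE_SPLIT_PTE_PTLOCKS
static inline spinlock_t *pte_lockptr_val(struct mm_struct *mm, pmd_t pmd)
{
	return ptlock_ptr(pmd_page(pmd));	/* uses the pmd value, no dereference */
}
#else
static inline spinlock_t *pte_lockptr_val(struct mm_struct *mm, pmd_t pmd)
{
	return &mm->page_table_lock;
}
#endif

A caller such as move_normal_pmd() would then snapshot *old_pmd once under the
pmd ptl and pass that value, with no need to dereference old_pmd again after
pmd_clear().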

Patch

diff --git a/mm/mremap.c b/mm/mremap.c
index 8967a3707332..2fa3e0cb6176 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -224,7 +224,7 @@  static inline void flush_pte_tlb_pwc_range(struct vm_area_struct *vma,
 static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
 {
-	spinlock_t *old_ptl, *new_ptl;
+	spinlock_t *pte_ptl, *old_ptl, *new_ptl;
 	struct mm_struct *mm = vma->vm_mm;
 	pmd_t pmd;
 
@@ -254,6 +254,7 @@  static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 	if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
 		return false;
 
+
 	/*
 	 * We don't have to worry about the ordering of src and dst
 	 * ptlocks because exclusive mmap_lock prevents deadlock.
@@ -263,6 +264,10 @@  static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 	if (new_ptl != old_ptl)
 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
 
+	if (pmd_none(*old_pmd))
+		goto unlock_out;
+
+	pte_ptl = pte_lockptr(mm, old_pmd);
 	/* Clear the pmd */
 	pmd = *old_pmd;
 	pmd_clear(old_pmd);
@@ -270,9 +275,20 @@  static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 	 * flush the TLB before we move the page table entries.
 	 */
 	flush_pte_tlb_pwc_range(vma, old_addr, old_addr + PMD_SIZE);
+
+	/*
+	 * Take the ptl here so that we wait for parallel page table walk
+	 * and operations (e.g. pageout) using the old addr to finish.
+	 */
+	if (USE_SPLIT_PTE_PTLOCKS)
+		spin_lock(pte_ptl);
+
 	VM_BUG_ON(!pmd_none(*new_pmd));
 	pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
+	if (USE_SPLIT_PTE_PTLOCKS)
+		spin_unlock(pte_ptl);
 
+unlock_out:
 	if (new_ptl != old_ptl)
 		spin_unlock(new_ptl);
 	spin_unlock(old_ptl);
@@ -296,6 +312,14 @@  static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
 	struct mm_struct *mm = vma->vm_mm;
 	pud_t pud;
 
+	/*
+	 * Disable MOVE_PUD until pageout can be handled with the higher
+	 * level page table locks held. Without SPLIT_PTE_PTLOCKS both the
+	 * pte ptl and the pud ptl are mm->page_table_lock, so MOVE_PUD is safe.
+	 */
+	if (USE_SPLIT_PTE_PTLOCKS)
+		return false;
+
 	/*
 	 * The destination pud shouldn't be established, free_pgtables()
 	 * should have released it.