
[v5,7/9] mm/mremap: Move TLB flush outside page table lock

Message ID 20210422054323.150993-8-aneesh.kumar@linux.ibm.com (mailing list archive)
State New, archived
Series Speedup mremap on ppc64

Commit Message

Aneesh Kumar K.V April 22, 2021, 5:43 a.m. UTC
Move the TLB flush outside the page table lock so that the kernel
does less work with the page table lock held. Releasing the ptl
while the old TLB contents are still valid behaves as if those
accesses happened before the level-3 or level-2 entry update.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 mm/mremap.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

Comments

Aneesh Kumar K.V May 20, 2021, 3:26 p.m. UTC | #1
On 4/22/21 11:13 AM, Aneesh Kumar K.V wrote:
> Move TLB flush outside page table lock so that kernel does
> less with page table lock held. Releasing the ptl with old
> TLB contents still valid will behave such that such access
> happened before the level3 or level2 entry update.
> 


Ok, this breaks the page lifetime rule from

commit: eb66ae030829 ("mremap: properly flush TLB before releasing the 
page")

I will respin, dropping this change, and add a comment explaining 
why we need to do the TLB flush before dropping the ptl.

> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>   mm/mremap.c | 8 ++++----
>   1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/mm/mremap.c b/mm/mremap.c
> index 109560977944..9effca76bf17 100644
> --- a/mm/mremap.c
> +++ b/mm/mremap.c
> @@ -258,7 +258,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
>   	 * We don't have to worry about the ordering of src and dst
>   	 * ptlocks because exclusive mmap_lock prevents deadlock.
>   	 */
> -	old_ptl = pmd_lock(vma->vm_mm, old_pmd);
> +	old_ptl = pmd_lock(mm, old_pmd);
>   	new_ptl = pmd_lockptr(mm, new_pmd);
>   	if (new_ptl != old_ptl)
>   		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
> @@ -270,11 +270,11 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
>   	VM_BUG_ON(!pmd_none(*new_pmd));
>   	pmd_populate(mm, new_pmd, (pgtable_t)pmd_page_vaddr(pmd));
>   
> -	flush_pte_tlb_pwc_range(vma, old_addr, old_addr + PMD_SIZE);
>   	if (new_ptl != old_ptl)
>   		spin_unlock(new_ptl);
>   	spin_unlock(old_ptl);
>   
> +	flush_pte_tlb_pwc_range(vma, old_addr, old_addr + PMD_SIZE);
>   	return true;
>   }
>   #else
> @@ -305,7 +305,7 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
>   	 * We don't have to worry about the ordering of src and dst
>   	 * ptlocks because exclusive mmap_lock prevents deadlock.
>   	 */
> -	old_ptl = pud_lock(vma->vm_mm, old_pud);
> +	old_ptl = pud_lock(mm, old_pud);
>   	new_ptl = pud_lockptr(mm, new_pud);
>   	if (new_ptl != old_ptl)
>   		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
> @@ -317,11 +317,11 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
>   	VM_BUG_ON(!pud_none(*new_pud));
>   
>   	pud_populate(mm, new_pud, (pmd_t *)pud_page_vaddr(pud));
> -	flush_pte_tlb_pwc_range(vma, old_addr, old_addr + PUD_SIZE);
>   	if (new_ptl != old_ptl)
>   		spin_unlock(new_ptl);
>   	spin_unlock(old_ptl);
>   
> +	flush_pte_tlb_pwc_range(vma, old_addr, old_addr + PUD_SIZE);
>   	return true;
>   }
>   #else
>
Aneesh Kumar K.V May 20, 2021, 4:57 p.m. UTC | #2
On 5/20/21 8:56 PM, Aneesh Kumar K.V wrote:
> On 4/22/21 11:13 AM, Aneesh Kumar K.V wrote:
>> Move TLB flush outside page table lock so that kernel does
>> less with page table lock held. Releasing the ptl with old
>> TLB contents still valid will behave such that such access
>> happened before the level3 or level2 entry update.
>>
> 
> 
> Ok this break the page lifetime rule
> 
> commit: eb66ae030829 ("mremap: properly flush TLB before releasing the 
> page")
> 
> I will respin dropping this change and add a comment around explaining 
> why we need to do tlb flush before dropping ptl.

Wondering whether this is correct considering we are holding mmap_sem in 
write mode in mremap. Can a parallel free/zap happen?

> 
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>>   mm/mremap.c | 8 ++++----
>>   1 file changed, 4 insertions(+), 4 deletions(-)
>>
>> diff --git a/mm/mremap.c b/mm/mremap.c
>> index 109560977944..9effca76bf17 100644
>> --- a/mm/mremap.c
>> +++ b/mm/mremap.c
>> @@ -258,7 +258,7 @@ static bool move_normal_pmd(struct vm_area_struct 
>> *vma, unsigned long old_addr,
>>        * We don't have to worry about the ordering of src and dst
>>        * ptlocks because exclusive mmap_lock prevents deadlock.
>>        */
>> -    old_ptl = pmd_lock(vma->vm_mm, old_pmd);
>> +    old_ptl = pmd_lock(mm, old_pmd);
>>       new_ptl = pmd_lockptr(mm, new_pmd);
>>       if (new_ptl != old_ptl)
>>           spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
>> @@ -270,11 +270,11 @@ static bool move_normal_pmd(struct 
>> vm_area_struct *vma, unsigned long old_addr,
>>       VM_BUG_ON(!pmd_none(*new_pmd));
>>       pmd_populate(mm, new_pmd, (pgtable_t)pmd_page_vaddr(pmd));
>> -    flush_pte_tlb_pwc_range(vma, old_addr, old_addr + PMD_SIZE);
>>       if (new_ptl != old_ptl)
>>           spin_unlock(new_ptl);
>>       spin_unlock(old_ptl);
>> +    flush_pte_tlb_pwc_range(vma, old_addr, old_addr + PMD_SIZE);
>>       return true;
>>   }
>>   #else
>> @@ -305,7 +305,7 @@ static bool move_normal_pud(struct vm_area_struct 
>> *vma, unsigned long old_addr,
>>        * We don't have to worry about the ordering of src and dst
>>        * ptlocks because exclusive mmap_lock prevents deadlock.
>>        */
>> -    old_ptl = pud_lock(vma->vm_mm, old_pud);
>> +    old_ptl = pud_lock(mm, old_pud);
>>       new_ptl = pud_lockptr(mm, new_pud);
>>       if (new_ptl != old_ptl)
>>           spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
>> @@ -317,11 +317,11 @@ static bool move_normal_pud(struct 
>> vm_area_struct *vma, unsigned long old_addr,
>>       VM_BUG_ON(!pud_none(*new_pud));
>>       pud_populate(mm, new_pud, (pmd_t *)pud_page_vaddr(pud));
>> -    flush_pte_tlb_pwc_range(vma, old_addr, old_addr + PUD_SIZE);
>>       if (new_ptl != old_ptl)
>>           spin_unlock(new_ptl);
>>       spin_unlock(old_ptl);
>> +    flush_pte_tlb_pwc_range(vma, old_addr, old_addr + PUD_SIZE);
>>       return true;
>>   }
>>   #else
>>
>
Linus Torvalds May 21, 2021, 2:40 a.m. UTC | #3
On Thu, May 20, 2021 at 6:57 AM Aneesh Kumar K.V
<aneesh.kumar@linux.ibm.com> wrote:
>
> Wondering whether this is correct considering we are holding mmap_sem in
> write mode in mremap.

Right. So *normally* the rule is to EITHER

 - hold the mmap_sem for writing

OR

 - hold the page table lock

and that the TLB flush needs to happen before you release that lock.

But as that commit message of commit eb66ae030829 ("mremap: properly
flush TLB before releasing the page") says, "mremap()" is a bit
special. It's special because mremap() didn't take ownership of the
page - it only moved it somewhere else. So now the page-out logic -
that relies on the page table lock - can free the page immediately
after we've released the page table lock.

So basically, in order to delay the TLB flush after releasing the page
table lock, it's not really sufficient to _just_ hold the mmap_sem for
writing. You also need to guarantee that the lifetime of the page
itself is held until after the TLB flush.

For normal operations like "munmap()", this happens naturally, because
we remove the page from the page table, and add it to the list of
pages to be freed after the TLB flush.

But mremap never did that "remove the page and add it to a list to be
free'd later". Instead, it just moved the page somewhere else. And
thus there is no guarantee that the page that got moved will continue
to exist until a TLB flush is done.

So mremap does need to flush the TLB before releasing the page table
lock, because that's the lifetime boundary for the page that got
moved.
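
To make that lifetime boundary concrete, here is a minimal sketch of the
two orderings (hypothetical helper names, not the real mmu_gather API;
only the ordering is the point):

	/* munmap()-style teardown: the unmapped page stays alive until
	 * after the TLB flush because it sits on a deferred-free list. */
	lock(ptl);
	page = clear_pte(ptep);		/* remove the mapping */
	defer_free(page);		/* page is NOT freed yet */
	unlock(ptl);
	flush_tlb(old_range);		/* no stale TLB entries remain ... */
	free_deferred();		/* ... so only now is the page freed */

	/* mremap(): the page is only moved, never queued for freeing, so
	 * the TLB flush is what bounds its lifetime -- it has to happen
	 * before the ptl is dropped. */
	lock(ptl);
	move_pte(old_ptep, new_ptep);	/* page stays mapped, elsewhere */
	flush_tlb(old_range);		/* must precede unlock(ptl) */
	unlock(ptl);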

                  Linus
Aneesh Kumar K.V May 21, 2021, 3:03 a.m. UTC | #4
On 5/21/21 8:10 AM, Linus Torvalds wrote:
> On Thu, May 20, 2021 at 6:57 AM Aneesh Kumar K.V
> <aneesh.kumar@linux.ibm.com> wrote:
>>
>> Wondering whether this is correct considering we are holding mmap_sem in
>> write mode in mremap.
> 
> Right. So *normally* the rule is to EITHER
> 
>   - hold the mmap_sem for writing
> 
> OR
> 
>   - hold the page table lock
> 
> and that the TLB flush needs to happen before you release that lock.
> 
> But as that commit message of commit eb66ae030829 ("mremap: properly
> flush TLB before releasing the page") says, "mremap()" is a bit
> special. It's special because mremap() didn't take ownership of the
> page - it only moved it somewhere else. So now the page-out logic -
> that relies on the page table lock - can free the page immediately
> after we've released the page table lock.
> 
> So basically, in order to delay the TLB flush after releasing the page
> table lock, it's not really sufficient to _just_ hold the mmap_sem for
> writing. You also need to guarantee that the lifetime of the page
> itself is held until after the TLB flush.
> 
> For normal operations like "munmap()", this happens naturally, because
> we remove the page from the page table, and add it to the list of
> pages to be freed after the TLB flush.
> 
> But mremap never did that "remove the page and add it to a list to be
> free'd later". Instead, it just moved the page somewhere else. And
> thus there is no guarantee that the page that got moved will continue
> to exist until a TLB flush is done.
> 
> So mremap does need to flush the TLB before releasing the page table
> lock, because that's the lifetime boundary for the page that got
> moved.

How will we avoid that happening with 
c49dd340180260c6239e453263a9a244da9a7c85 / 
2c91bd4a4e2e530582d6fd643ea7b86b27907151? Those commits improve mremap 
performance by moving level-3/level-2 page table entries directly. When 
doing so we are not holding the level-4 ptl (the pte lock), but rather 
the pmd lock or pud lock. So if we move pages around without holding the 
pte lock, won't the above issue happen even if we do a TLB flush while 
holding the pmd/pud lock?
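
(A minimal sketch of the lock levels involved, assuming
USE_SPLIT_PTE_PTLOCKS; the wrapper function below is purely
illustrative, though pte_lockptr()/pmd_lockptr() are the real
accessors:)

	/*
	 * Illustrative only: with split page table locks, the lock rmap
	 * (page_vma_mapped_walk()/try_to_unmap_one()) takes for the PTE page
	 * is a different spinlock from the one move_normal_pmd() takes for
	 * the PMD, so the two walks do not serialize against each other.
	 */
	static void show_lock_levels(struct mm_struct *mm, pmd_t *pmd)
	{
		spinlock_t *pte_ptl = pte_lockptr(mm, pmd);	/* taken by rmap walkers */
		spinlock_t *pmd_ptl = pmd_lockptr(mm, pmd);	/* taken by move_normal_pmd() */

		/* normally distinct locks with split ptlocks */
		(void)pte_ptl;
		(void)pmd_ptl;
	}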

-aneesh
Aneesh Kumar K.V May 21, 2021, 3:28 a.m. UTC | #5
"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:

> On 5/21/21 8:10 AM, Linus Torvalds wrote:
>> On Thu, May 20, 2021 at 6:57 AM Aneesh Kumar K.V
>> <aneesh.kumar@linux.ibm.com> wrote:
>>>
>>> Wondering whether this is correct considering we are holding mmap_sem in
>>> write mode in mremap.
>> 
>> Right. So *normally* the rule is to EITHER
>> 
>>   - hold the mmap_sem for writing
>> 
>> OR
>> 
>>   - hold the page table lock
>> 
>> and that the TLB flush needs to happen before you release that lock.
>> 
>> But as that commit message of commit eb66ae030829 ("mremap: properly
>> flush TLB before releasing the page") says, "mremap()" is a bit
>> special. It's special because mremap() didn't take ownership of the
>> page - it only moved it somewhere else. So now the page-out logic -
>> that relies on the page table lock - can free the page immediately
>> after we've released the page table lock.
>> 
>> So basically, in order to delay the TLB flush after releasing the page
>> table lock, it's not really sufficient to _just_ hold the mmap_sem for
>> writing. You also need to guarantee that the lifetime of the page
>> itself is held until after the TLB flush.
>> 
>> For normal operations like "munmap()", this happens naturally, because
>> we remove the page from the page table, and add it to the list of
>> pages to be freed after the TLB flush.
>> 
>> But mremap never did that "remove the page and add it to a list to be
>> free'd later". Instead, it just moved the page somewhere else. And
>> thus there is no guarantee that the page that got moved will continue
>> to exist until a TLB flush is done.
>> 
>> So mremap does need to flush the TLB before releasing the page table
>> lock, because that's the lifetime boundary for the page that got
>> moved.
>
> How will we avoid that happening with 
> c49dd340180260c6239e453263a9a244da9a7c85 / 
> 2c91bd4a4e2e530582d6fd643ea7b86b27907151 . The commit improves mremap 
> performance by moving level3/level2 page table entries. When doing so we 
> are not holding level 4 ptl lock (pte_lock()). But rather we are holding 
> pmd_lock or pud_lock(). So if we move pages around without holding the 
> pte lock, won't the above issue happen even if we do a tlb flush with 
> holding pmd lock/pud lock?

Would something like this help? i.e., we flush the TLB before we move the
page table entries to the new address:

modified   mm/mremap.c
@@ -277,11 +277,14 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 	/* Clear the pmd */
 	pmd = *old_pmd;
 	pmd_clear(old_pmd);
-
+	/*
+	 * flush the TLB before we move the page table entries.
+	 * TLB flush includes necessary barriers.
+	 */
+	flush_pte_tlb_pwc_range(vma, old_addr, old_addr + PMD_SIZE);
 	VM_BUG_ON(!pmd_none(*new_pmd));
 	pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
 
-	flush_pte_tlb_pwc_range(vma, old_addr, old_addr + PMD_SIZE);
 	if (new_ptl != old_ptl)
 		spin_unlock(new_ptl);
 	spin_unlock(old_ptl);


-aneesh
Linus Torvalds May 21, 2021, 6:13 a.m. UTC | #6
On Thu, May 20, 2021 at 5:03 PM Aneesh Kumar K.V
<aneesh.kumar@linux.ibm.com> wrote:
>
> On 5/21/21 8:10 AM, Linus Torvalds wrote:
> >
> > So mremap does need to flush the TLB before releasing the page table
> > lock, because that's the lifetime boundary for the page that got
> > moved.
>
> How will we avoid that happening with
> c49dd340180260c6239e453263a9a244da9a7c85 /
> 2c91bd4a4e2e530582d6fd643ea7b86b27907151 . The commit improves mremap
> performance by moving level3/level2 page table entries. When doing so we
> are not holding level 4 ptl lock (pte_lock()). But rather we are holding
> pmd_lock or pud_lock(). So if we move pages around without holding the
> pte lock, won't the above issue happen even if we do a tlb flush with
> holding pmd lock/pud lock?

Hmm. Interesting.

Your patch (to flush the TLB after clearing the old location, and
before inserting it into the new one) looks like an "obvious" fix.

But I'm putting that "obvious" in quotes, because I'm now wondering if
it actually fixes anything.

Lookie here:

 - CPU1 does a mremap of a pmd or pud.

    It clears the old pmd/pud, flushes the old TLB range, and then
inserts the pmd/pud at the new location.

 - CPU2 does a page shrinker, which calls try_to_unmap, which calls
try_to_unmap_one.

These are entirely asynchronous, because they have no shared lock. The
mremap uses the pmd lock, the try_to_unmap_one() does the rmap walk,
which does the pte lock.

Now, imagine that the following ordering happens with the two
operations above, and a CPU3 that does accesses:

 - CPU2 follows (and sees) the old page tables in the old location and
then takes the pte lock

 - the mremap on CPU1 starts - cleared the old pmd, flushed the tlb,
*and* inserts in the new place.

 - a user thread on CPU3 accesses the new location and fills the TLB
of the *new* address

 - only now does CPU2 get to the "pte_get_and_clear()" to remove one page

 - CPU2 does a TLB flush and frees the page

End result:

 - both CPU1 _and_ CPU2 have flushed the TLB.

 - but both flushed the *OLD* address

 - the page is freed

 - CPU3 still has the stale TLB entry pointing to the page that is now
free and might be reused for something else

Am I missing something?

               Linus
Aneesh Kumar K.V May 21, 2021, 12:50 p.m. UTC | #7
On 5/21/21 11:43 AM, Linus Torvalds wrote:
> On Thu, May 20, 2021 at 5:03 PM Aneesh Kumar K.V
> <aneesh.kumar@linux.ibm.com> wrote:
>>
>> On 5/21/21 8:10 AM, Linus Torvalds wrote:
>>>
>>> So mremap does need to flush the TLB before releasing the page table
>>> lock, because that's the lifetime boundary for the page that got
>>> moved.
>>
>> How will we avoid that happening with
>> c49dd340180260c6239e453263a9a244da9a7c85 /
>> 2c91bd4a4e2e530582d6fd643ea7b86b27907151 . The commit improves mremap
>> performance by moving level3/level2 page table entries. When doing so we
>> are not holding level 4 ptl lock (pte_lock()). But rather we are holding
>> pmd_lock or pud_lock(). So if we move pages around without holding the
>> pte lock, won't the above issue happen even if we do a tlb flush with
>> holding pmd lock/pud lock?
> 
> Hmm. Interesting.
> 
> Your patch (to flush the TLB after clearing the old location, and
> before inserting it into the new one) looks like an "obvious" fix.
> 
> But I'm putting that "obvious" in quotes, because I'm now wondering if
> it actually fixes anything.
> 
> Lookie here:
> 
>   - CPU1 does a mremap of a pmd or pud.
> 
>      It clears the old pmd/pud, flushes the old TLB range, and then
> inserts the pmd/pud at the new location.
> 
>   - CPU2 does a page shrinker, which calls try_to_unmap, which calls
> try_to_unmap_one.
> 
> These are entirely asynchronous, because they have no shared lock. The
> mremap uses the pmd lock, the try_to_unmap_one() does the rmap walk,
> which does the pte lock.
> 
> Now, imagine that the following ordering happens with the two
> operations above, and a CPU3 that does accesses:
> 
>   - CPU2 follows (and sees) the old page tables in the old location and
> the took the pte lock
> 
>   - the mremap on CPU1 starts - cleared the old pmd, flushed the tlb,
> *and* inserts in the new place.
> 
>   - a user thread on CPU3 accesses the new location and fills the TLB
> of the *new* address
> 
>   - only now does CPU2 get to the "pte_get_and_clear()" to remove one page
> 
>   - CPU2 does a TLB flush and frees the page
> 
> End result:
> 
>   - both CPU1 _and_ CPU2 have flushed the TLB.
> 
>   - but both flushed the *OLD* address
> 
>   - the page is freed
> 
>   - CPU3 still has the stale TLB entry pointing to the page that is now
> free and might be reused for something else
> 
> Am I missing something?
> 

That is a problem. With that, it looks like CONFIG_HAVE_MOVE_PMD/PUD is 
broken. I don't see an easy way to fix this.

-aneesh
Aneesh Kumar K.V May 21, 2021, 1:03 p.m. UTC | #8
"Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> writes:

> On 5/21/21 11:43 AM, Linus Torvalds wrote:
>> On Thu, May 20, 2021 at 5:03 PM Aneesh Kumar K.V
>> <aneesh.kumar@linux.ibm.com> wrote:
>>>
>>> On 5/21/21 8:10 AM, Linus Torvalds wrote:
>>>>
>>>> So mremap does need to flush the TLB before releasing the page table
>>>> lock, because that's the lifetime boundary for the page that got
>>>> moved.
>>>
>>> How will we avoid that happening with
>>> c49dd340180260c6239e453263a9a244da9a7c85 /
>>> 2c91bd4a4e2e530582d6fd643ea7b86b27907151 . The commit improves mremap
>>> performance by moving level3/level2 page table entries. When doing so we
>>> are not holding level 4 ptl lock (pte_lock()). But rather we are holding
>>> pmd_lock or pud_lock(). So if we move pages around without holding the
>>> pte lock, won't the above issue happen even if we do a tlb flush with
>>> holding pmd lock/pud lock?
>> 
>> Hmm. Interesting.
>> 
>> Your patch (to flush the TLB after clearing the old location, and
>> before inserting it into the new one) looks like an "obvious" fix.
>> 
>> But I'm putting that "obvious" in quotes, because I'm now wondering if
>> it actually fixes anything.
>> 
>> Lookie here:
>> 
>>   - CPU1 does a mremap of a pmd or pud.
>> 
>>      It clears the old pmd/pud, flushes the old TLB range, and then
>> inserts the pmd/pud at the new location.
>> 
>>   - CPU2 does a page shrinker, which calls try_to_unmap, which calls
>> try_to_unmap_one.
>> 
>> These are entirely asynchronous, because they have no shared lock. The
>> mremap uses the pmd lock, the try_to_unmap_one() does the rmap walk,
>> which does the pte lock.
>> 
>> Now, imagine that the following ordering happens with the two
>> operations above, and a CPU3 that does accesses:
>> 
>>   - CPU2 follows (and sees) the old page tables in the old location and
>> the took the pte lock
>> 
>>   - the mremap on CPU1 starts - cleared the old pmd, flushed the tlb,
>> *and* inserts in the new place.
>> 
>>   - a user thread on CPU3 accesses the new location and fills the TLB
>> of the *new* address
>> 
>>   - only now does CPU2 get to the "pte_get_and_clear()" to remove one page
>> 
>>   - CPU2 does a TLB flush and frees the page
>> 
>> End result:
>> 
>>   - both CPU1 _and_ CPU2 have flushed the TLB.
>> 
>>   - but both flushed the *OLD* address
>> 
>>   - the page is freed
>> 
>>   - CPU3 still has the stale TLB entry pointing to the page that is now
>> free and might be reused for something else
>> 
>> Am I missing something?
>> 
>
> That is a problem. With that it looks like CONFIG_HAVE_MOVE_PMD/PUD is 
> broken? I don't see an easy way to fix this?

We could do MOVE_PMD with something like the below. An equivalent MOVE_PUD
would be costlier, which makes me wonder whether we should even support it.

diff --git a/mm/mremap.c b/mm/mremap.c
index 0270d6fed1dd..9e1e4392a1d9 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -233,7 +233,7 @@ static inline bool arch_supports_page_table_move(void)
 static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
 {
-	spinlock_t *old_ptl, *new_ptl;
+	spinlock_t *pte_ptl, *old_ptl, *new_ptl;
 	struct mm_struct *mm = vma->vm_mm;
 	pmd_t pmd;
 
@@ -281,8 +281,17 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 	 * flush the TLB before we move the page table entries.
 	 */
 	flush_pte_tlb_pwc_range(vma, old_addr, old_addr + PMD_SIZE);
+
+	/*
+	 * Take the ptl here so that we wait for parallel page table walk
+	 * and operations (eg: pageout) using old addr to finish.
+	 */
+	pte_ptl = pte_lockptr(mm, old_pmd);
+	spin_lock(pte_ptl);
+
 	VM_BUG_ON(!pmd_none(*new_pmd));
 	pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
+	spin_unlock(pte_ptl);
 
 	if (new_ptl != old_ptl)
 		spin_unlock(new_ptl);
Liam R. Howlett May 21, 2021, 3:24 p.m. UTC | #9
* Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> [210521 08:51]:
> On 5/21/21 11:43 AM, Linus Torvalds wrote:
> > On Thu, May 20, 2021 at 5:03 PM Aneesh Kumar K.V
> > <aneesh.kumar@linux.ibm.com> wrote:
> > > 
> > > On 5/21/21 8:10 AM, Linus Torvalds wrote:
> > > > 
> > > > So mremap does need to flush the TLB before releasing the page table
> > > > lock, because that's the lifetime boundary for the page that got
> > > > moved.
> > > 
> > > How will we avoid that happening with
> > > c49dd340180260c6239e453263a9a244da9a7c85 /
> > > 2c91bd4a4e2e530582d6fd643ea7b86b27907151 . The commit improves mremap
> > > performance by moving level3/level2 page table entries. When doing so we
> > > are not holding level 4 ptl lock (pte_lock()). But rather we are holding
> > > pmd_lock or pud_lock(). So if we move pages around without holding the
> > > pte lock, won't the above issue happen even if we do a tlb flush with
> > > holding pmd lock/pud lock?
> > 
> > Hmm. Interesting.
> > 
> > Your patch (to flush the TLB after clearing the old location, and
> > before inserting it into the new one) looks like an "obvious" fix.
> > 
> > But I'm putting that "obvious" in quotes, because I'm now wondering if
> > it actually fixes anything.
> > 
> > Lookie here:
> > 
> >   - CPU1 does a mremap of a pmd or pud.
> > 
> >      It clears the old pmd/pud, flushes the old TLB range, and then
> > inserts the pmd/pud at the new location.
> > 
> >   - CPU2 does a page shrinker, which calls try_to_unmap, which calls
> > try_to_unmap_one.
> > 
> > These are entirely asynchronous, because they have no shared lock. The
> > mremap uses the pmd lock, the try_to_unmap_one() does the rmap walk,
> > which does the pte lock.
> > 
> > Now, imagine that the following ordering happens with the two
> > operations above, and a CPU3 that does accesses:
> > 
> >   - CPU2 follows (and sees) the old page tables in the old location and
> > the took the pte lock
> > 
> >   - the mremap on CPU1 starts - cleared the old pmd, flushed the tlb,
> > *and* inserts in the new place.
> > 
> >   - a user thread on CPU3 accesses the new location and fills the TLB
> > of the *new* address

mremap holds the mmap_sem in write mode as well, doesn't it?  How is the user thread
getting the new location?

> > 
> >   - only now does CPU2 get to the "pte_get_and_clear()" to remove one page
> > 
> >   - CPU2 does a TLB flush and frees the page
> > 
> > End result:
> > 
> >   - both CPU1 _and_ CPU2 have flushed the TLB.
> > 
> >   - but both flushed the *OLD* address
> > 
> >   - the page is freed
> > 
> >   - CPU3 still has the stale TLB entry pointing to the page that is now
> > free and might be reused for something else
> > 
> > Am I missing something?
> > 
> 
> That is a problem. With that it looks like CONFIG_HAVE_MOVE_PMD/PUD is
> broken? I don't see an easy way to fix this?
> 
> -aneesh
> 
>
Aneesh Kumar K.V May 21, 2021, 4:02 p.m. UTC | #10
On 5/21/21 8:54 PM, Liam Howlett wrote:
> * Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com> [210521 08:51]:
>> On 5/21/21 11:43 AM, Linus Torvalds wrote:
>>> On Thu, May 20, 2021 at 5:03 PM Aneesh Kumar K.V
>>> <aneesh.kumar@linux.ibm.com> wrote:
>>>>
>>>> On 5/21/21 8:10 AM, Linus Torvalds wrote:
>>>>>
>>>>> So mremap does need to flush the TLB before releasing the page table
>>>>> lock, because that's the lifetime boundary for the page that got
>>>>> moved.
>>>>
>>>> How will we avoid that happening with
>>>> c49dd340180260c6239e453263a9a244da9a7c85 /
>>>> 2c91bd4a4e2e530582d6fd643ea7b86b27907151 . The commit improves mremap
>>>> performance by moving level3/level2 page table entries. When doing so we
>>>> are not holding level 4 ptl lock (pte_lock()). But rather we are holding
>>>> pmd_lock or pud_lock(). So if we move pages around without holding the
>>>> pte lock, won't the above issue happen even if we do a tlb flush with
>>>> holding pmd lock/pud lock?
>>>
>>> Hmm. Interesting.
>>>
>>> Your patch (to flush the TLB after clearing the old location, and
>>> before inserting it into the new one) looks like an "obvious" fix.
>>>
>>> But I'm putting that "obvious" in quotes, because I'm now wondering if
>>> it actually fixes anything.
>>>
>>> Lookie here:
>>>
>>>    - CPU1 does a mremap of a pmd or pud.
>>>
>>>       It clears the old pmd/pud, flushes the old TLB range, and then
>>> inserts the pmd/pud at the new location.
>>>
>>>    - CPU2 does a page shrinker, which calls try_to_unmap, which calls
>>> try_to_unmap_one.
>>>
>>> These are entirely asynchronous, because they have no shared lock. The
>>> mremap uses the pmd lock, the try_to_unmap_one() does the rmap walk,
>>> which does the pte lock.
>>>
>>> Now, imagine that the following ordering happens with the two
>>> operations above, and a CPU3 that does accesses:
>>>
>>>    - CPU2 follows (and sees) the old page tables in the old location and
>>> the took the pte lock
>>>
>>>    - the mremap on CPU1 starts - cleared the old pmd, flushed the tlb,
>>> *and* inserts in the new place.
>>>
>>>    - a user thread on CPU3 accesses the new location and fills the TLB
>>> of the *new* address
> 
> mremap holds the mmap_sem in write mode as well, doesn't it?  How is the user thread
> getting the new location?
> 

Immediately after CPU1 inserts the new address translation as part of mremap, 
CPU3 can pick up that translation by dereferencing the new address.

-aneesh
Linus Torvalds May 21, 2021, 4:03 p.m. UTC | #11
On Fri, May 21, 2021 at 3:04 AM Aneesh Kumar K.V
<aneesh.kumar@linux.ibm.com> wrote:
>
> We could do MOVE_PMD with something like below? A equivalent MOVE_PUD
> will be costlier which makes me wonder whether we should even support that?

Well, without USE_SPLIT_PTE_PTLOCKS the pud case would be trivial too.
But everybody uses split pte locks in practice.

I get the feeling that the rmap code might have to use
pud_lock/pmd_lock. I wonder how painful that would be.

             Linus
Linus Torvalds May 21, 2021, 4:05 p.m. UTC | #12
On Fri, May 21, 2021 at 5:25 AM Liam Howlett <liam.howlett@oracle.com> wrote:
>
> mremap holds the mmap_sem in write mode as well, doesn't it?  How is the user thread
> getting the new location?

No amount of locking protects against the HW page table walker (or,
indeed, software ones, but they are irrelevant).

And an attacker _knows_ the new address, because that's who would be
doing the mremap() in the first place - to trigger this bug.

             Linus
Aneesh Kumar K.V May 21, 2021, 4:29 p.m. UTC | #13
On 5/21/21 9:33 PM, Linus Torvalds wrote:
> On Fri, May 21, 2021 at 3:04 AM Aneesh Kumar K.V
> <aneesh.kumar@linux.ibm.com> wrote:
>>
>> We could do MOVE_PMD with something like below? A equivalent MOVE_PUD
>> will be costlier which makes me wonder whether we should even support that?
> 
> Well, without USE_SPLIT_PTE_PTLOCKS the pud case would be trivial too.
> But everybody uses split pte locks in practice.
> 

Ok, I can put together a patch series enabling MOVE_PUD only when 
SPLIT_PTE_PTLOCKS is disabled.

> I get the feeling that the rmap code might have to use
> pud_lock/pmd_lock. I wonder how painful that would be.
> 

and work on that long term? The locking/unlocking can get complicated 
because page_vma_mapped_walk() would now need to return all the held locks.


-aneesh
Aneesh Kumar K.V May 24, 2021, 2:24 p.m. UTC | #14
Linus Torvalds <torvalds@linux-foundation.org> writes:

> On Fri, May 21, 2021 at 3:04 AM Aneesh Kumar K.V
> <aneesh.kumar@linux.ibm.com> wrote:
>>
>> We could do MOVE_PMD with something like below? A equivalent MOVE_PUD
>> will be costlier which makes me wonder whether we should even support that?
>
> Well, without USE_SPLIT_PTE_PTLOCKS the pud case would be trivial too.
> But everybody uses split pte locks in practice.
>
> I get the feeling that the rmap code might have to use
> pud_lock/pmd_lock. I wonder how painful that would be.
>

Looking at this further, I guess we need to do the above to close the
race window. We do:

static bool map_pte(struct page_vma_mapped_walk *pvmw)
{
	pvmw->pte = pte_offset_map(pvmw->pmd, pvmw->address);
        ..
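	/* An optimized mremap() holding only the pmd/pud lock can move this
	 * page table here, before the pte lock below is taken. */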
	pvmw->ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd);
	spin_lock(pvmw->ptl);
}

That is, we walk the table without holding the pte ptl. Hence we can
still race with the optimized PMD move.

-aneesh

Patch

diff --git a/mm/mremap.c b/mm/mremap.c
index 109560977944..9effca76bf17 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -258,7 +258,7 @@  static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 	 * We don't have to worry about the ordering of src and dst
 	 * ptlocks because exclusive mmap_lock prevents deadlock.
 	 */
-	old_ptl = pmd_lock(vma->vm_mm, old_pmd);
+	old_ptl = pmd_lock(mm, old_pmd);
 	new_ptl = pmd_lockptr(mm, new_pmd);
 	if (new_ptl != old_ptl)
 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -270,11 +270,11 @@  static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 	VM_BUG_ON(!pmd_none(*new_pmd));
 	pmd_populate(mm, new_pmd, (pgtable_t)pmd_page_vaddr(pmd));
 
-	flush_pte_tlb_pwc_range(vma, old_addr, old_addr + PMD_SIZE);
 	if (new_ptl != old_ptl)
 		spin_unlock(new_ptl);
 	spin_unlock(old_ptl);
 
+	flush_pte_tlb_pwc_range(vma, old_addr, old_addr + PMD_SIZE);
 	return true;
 }
 #else
@@ -305,7 +305,7 @@  static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
 	 * We don't have to worry about the ordering of src and dst
 	 * ptlocks because exclusive mmap_lock prevents deadlock.
 	 */
-	old_ptl = pud_lock(vma->vm_mm, old_pud);
+	old_ptl = pud_lock(mm, old_pud);
 	new_ptl = pud_lockptr(mm, new_pud);
 	if (new_ptl != old_ptl)
 		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -317,11 +317,11 @@  static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
 	VM_BUG_ON(!pud_none(*new_pud));
 
 	pud_populate(mm, new_pud, (pmd_t *)pud_page_vaddr(pud));
-	flush_pte_tlb_pwc_range(vma, old_addr, old_addr + PUD_SIZE);
 	if (new_ptl != old_ptl)
 		spin_unlock(new_ptl);
 	spin_unlock(old_ptl);
 
+	flush_pte_tlb_pwc_range(vma, old_addr, old_addr + PUD_SIZE);
 	return true;
 }
 #else