[RFC,v1,3/4] mm/memory: Use ptep_get_lockless_norecency() for orig_pte

Message ID: 20240215121756.2734131-4-ryan.roberts@arm.com
State: New
Series: Reduce cost of ptep_get_lockless on arm64

Commit Message

Ryan Roberts Feb. 15, 2024, 12:17 p.m. UTC
Let's convert handle_pte_fault()'s use of ptep_get_lockless() to
ptep_get_lockless_norecency() to save orig_pte.

There are a number of places that follow this model:

    orig_pte = ptep_get_lockless(ptep)
    ...
    <lock>
    if (!pte_same(orig_pte, ptep_get(ptep)))
            // RACE!
    ...
    <unlock>

So we need to be careful to convert all of those to use
pte_same_norecency() so that the access and dirty bits are excluded from
the comparison.

Additionally there are a couple of places that genuinely rely on the
access and dirty bits of orig_pte, but with some careful refactoring, we
can use ptep_get() once we are holding the lock to achieve equivalent
logic.
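
For reference, pte_same_norecency() (introduced earlier in this series)
is expected to look something like the following sketch, i.e. it simply
clears the access and dirty bits in both values before comparing:

    static inline int pte_same_norecency(pte_t pte_a, pte_t pte_b)
    {
            return pte_same(pte_mkold(pte_mkclean(pte_a)),
                            pte_mkold(pte_mkclean(pte_b)));
    }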

Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
---
 mm/memory.c | 55 +++++++++++++++++++++++++++++++++--------------------
 1 file changed, 34 insertions(+), 21 deletions(-)

--
2.25.1

Comments

David Hildenbrand March 26, 2024, 5:02 p.m. UTC | #1
On 15.02.24 13:17, Ryan Roberts wrote:
> Let's convert handle_pte_fault()'s use of ptep_get_lockless() to
> ptep_get_lockless_norecency() to save orig_pte.
> 
> There are a number of places that follow this model:
> 
>      orig_pte = ptep_get_lockless(ptep)
>      ...
>      <lock>
>      if (!pte_same(orig_pte, ptep_get(ptep)))
>              // RACE!
>      ...
>      <unlock>
> 
> So we need to be careful to convert all of those to use
> pte_same_norecency() so that the access and dirty bits are excluded from
> the comparison.
> 
> Additionally there are a couple of places that genuinely rely on the
> access and dirty bits of orig_pte, but with some careful refactoring, we
> can use ptep_get() once we are holding the lock to achieve equivalent
> logic.

We really should document that changed behavior somewhere where it can 
be easily found: that orig_pte might have incomplete/stale 
accessed/dirty information.


> @@ -5343,7 +5356,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
>   						 vmf->address, &vmf->ptl);
>   		if (unlikely(!vmf->pte))
>   			return 0;
> -		vmf->orig_pte = ptep_get_lockless(vmf->pte);
> +		vmf->orig_pte = ptep_get_lockless_norecency(vmf->pte);
>   		vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID;
> 
>   		if (pte_none(vmf->orig_pte)) {
> @@ -5363,7 +5376,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
> 
>   	spin_lock(vmf->ptl);
>   	entry = vmf->orig_pte;
> -	if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
> +	if (unlikely(!pte_same_norecency(ptep_get(vmf->pte), entry))) {
>   		update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
>   		goto unlock;

I was wondering about the following:

Assume the PTE is not dirty.

Thread 1 does

vmf->orig_pte = ptep_get_lockless_norecency(vmf->pte)
/* not dirty */

/* Now, thread 2 ends up setting the PTE dirty under PT lock. */

spin_lock(vmf->ptl);
entry = vmf->orig_pte;
if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
	...
}
...
entry = pte_mkyoung(entry);
if (ptep_set_access_flags(vmf->vma, ...)
...
pte_unmap_unlock(vmf->pte, vmf->ptl);


Generic ptep_set_access_flags() will do another pte_same() check and 
realize "hey, there was a change!" let's update the PTE!

set_pte_at(vma->vm_mm, address, ptep, entry);

would overwrite the dirty bit set by thread 2.
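
For reference, the generic implementation (roughly what lives in
mm/pgtable-generic.c) only checks whether anything changed before
blindly writing the new value:

int ptep_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pte_t *ptep,
			  pte_t entry, int dirty)
{
	int changed = !pte_same(ptep_get(ptep), entry);
	if (changed) {
		/*
		 * 'entry' was derived from the stale orig_pte, so this
		 * write discards the dirty bit thread 2 just set.
		 */
		set_pte_at(vma->vm_mm, address, ptep, entry);
		flush_tlb_fix_spurious_fault(vma, address, ptep);
	}
	return changed;
}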
Ryan Roberts March 26, 2024, 5:27 p.m. UTC | #2
On 26/03/2024 17:02, David Hildenbrand wrote:
> On 15.02.24 13:17, Ryan Roberts wrote:
>> Let's convert handle_pte_fault()'s use of ptep_get_lockless() to
>> ptep_get_lockless_norecency() to save orig_pte.
>>
>> There are a number of places that follow this model:
>>
>>      orig_pte = ptep_get_lockless(ptep)
>>      ...
>>      <lock>
>>      if (!pte_same(orig_pte, ptep_get(ptep)))
>>              // RACE!
>>      ...
>>      <unlock>
>>
>> So we need to be careful to convert all of those to use
>> pte_same_norecency() so that the access and dirty bits are excluded from
>> the comparison.
>>
>> Additionally there are a couple of places that genuinely rely on the
>> access and dirty bits of orig_pte, but with some careful refactoring, we
>> can use ptep_get() once we are holding the lock to achieve equivalent
>> logic.
> 
> We really should document that changed behavior somewhere where it can be easily
> found: that orig_pte might have incomplete/stale accessed/dirty information.

I could add it to the orig_pte definition in the `struct vm_fault`?
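
Something like this (sketch):

/*
 * @orig_pte: Value of PTE at the time of fault. Populated with
 *	ptep_get_lockless_norecency(), so the accessed and dirty bits
 *	may be incomplete or stale; never rely on those bits and
 *	compare against the current PTE with pte_same_norecency().
 */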

> 
> 
>> @@ -5343,7 +5356,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
>>                            vmf->address, &vmf->ptl);
>>           if (unlikely(!vmf->pte))
>>               return 0;
>> -        vmf->orig_pte = ptep_get_lockless(vmf->pte);
>> +        vmf->orig_pte = ptep_get_lockless_norecency(vmf->pte);
>>           vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID;
>>
>>           if (pte_none(vmf->orig_pte)) {
>> @@ -5363,7 +5376,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
>>
>>       spin_lock(vmf->ptl);
>>       entry = vmf->orig_pte;
>> -    if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
>> +    if (unlikely(!pte_same_norecency(ptep_get(vmf->pte), entry))) {
>>           update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
>>           goto unlock;
> 
> I was wondering about the following:
> 
> Assume the PTE is not dirty.
> 
> Thread 1 does

Sorry, not sure what threads have to do with this? How is the vmf shared between
threads? What have I misunderstood...

> 
> vmf->orig_pte = ptep_get_lockless_norecency(vmf->pte)
> /* not dirty */
> 
> /* Now, thread 2 ends up setting the PTE dirty under PT lock. */
> 
> spin_lock(vmf->ptl);
> entry = vmf->orig_pte;
> if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
>     ...
> }
> ...
> entry = pte_mkyoung(entry);

Do you mean pte_mkdirty() here? You're talking about dirty everywhere else.

> if (ptep_set_access_flags(vmf->vma, ...)
> ...
> pte_unmap_unlock(vmf->pte, vmf->ptl);
> 
> 
> Generic ptep_set_access_flags() will do another pte_same() check and realize
> "hey, there was a change!" let's update the PTE!
> 
> set_pte_at(vma->vm_mm, address, ptep, entry);

This is called from the generic ptep_set_access_flags() in your example, right?

> 
> would overwrite the dirty bit set by thread 2.

I'm not really sure what you are getting at... Is your concern that there is a
race where the page could become dirty in the meantime and it now gets lost? I
think that's why arm64 overrides ptep_set_access_flags(); since the hw can
update access/dirty we have to deal with the races.
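
For reference, arm64's override (arch/arm64/mm/fault.c, abridged) never
discards a racing update, because it folds the new flags into the
current PTE value with a cmpxchg loop:

int ptep_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pte_t *ptep,
			  pte_t entry, int dirty)
{
	pteval_t old_pteval, pteval;
	pte_t pte = READ_ONCE(*ptep);

	if (pte_same(pte, entry))
		return 0;

	/* only preserve the access flags and write permission */
	pte_val(entry) &= PTE_RDONLY | PTE_AF | PTE_WRITE | PTE_DIRTY;

	/*
	 * OR the new flags in atomically, retrying if the PTE changed
	 * under our feet due to a HW or another SW update.
	 */
	pteval = pte_val(pte);
	do {
		old_pteval = pteval;
		pteval &= ~PTE_RDONLY;
		pteval |= pte_val(entry);
		pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval,
					 pteval);
	} while (pteval != old_pteval);

	/* Invalidate a stale read-only entry */
	if (dirty)
		flush_tlb_page(vma, address);
	return 1;
}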
David Hildenbrand March 26, 2024, 5:38 p.m. UTC | #3
On 26.03.24 18:27, Ryan Roberts wrote:
> On 26/03/2024 17:02, David Hildenbrand wrote:
>> On 15.02.24 13:17, Ryan Roberts wrote:
>>> Let's convert handle_pte_fault()'s use of ptep_get_lockless() to
>>> ptep_get_lockless_norecency() to save orig_pte.
>>>
>>> There are a number of places that follow this model:
>>>
>>>       orig_pte = ptep_get_lockless(ptep)
>>>       ...
>>>       <lock>
>>>       if (!pte_same(orig_pte, ptep_get(ptep)))
>>>               // RACE!
>>>       ...
>>>       <unlock>
>>>
>>> So we need to be careful to convert all of those to use
>>> pte_same_norecency() so that the access and dirty bits are excluded from
>>> the comparison.
>>>
>>> Additionally there are a couple of places that genuinely rely on the
>>> access and dirty bits of orig_pte, but with some careful refactoring, we
>>> can use ptep_get() once we are holding the lock to achieve equivalent
>>> logic.
>>
>> We really should document that changed behavior somewhere where it can be easily
>> found: that orig_pte might have incomplete/stale accessed/dirty information.
> 
> I could add it to the orig_pte definition in the `struct vm_fault`?
> 
>>
>>
>>> @@ -5343,7 +5356,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
>>>                             vmf->address, &vmf->ptl);
>>>            if (unlikely(!vmf->pte))
>>>                return 0;
>>> -        vmf->orig_pte = ptep_get_lockless(vmf->pte);
>>> +        vmf->orig_pte = ptep_get_lockless_norecency(vmf->pte);
>>>            vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID;
>>>
>>>            if (pte_none(vmf->orig_pte)) {
>>> @@ -5363,7 +5376,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
>>>
>>>        spin_lock(vmf->ptl);
>>>        entry = vmf->orig_pte;
>>> -    if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
>>> +    if (unlikely(!pte_same_norecency(ptep_get(vmf->pte), entry))) {
>>>            update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
>>>            goto unlock;
>>
>> I was wondering about the following:
>>
>> Assume the PTE is not dirty.
>>
>> Thread 1 does
> 
> Sorry, not sure what threads have to do with this? How is the vmf shared between
> threads? What have I misunderstood...

Assume we have HW that does not have HW-managed access/dirty bits. One
that ends up using generic ptep_set_access_flags(). Access/dirty bits
are always updated under PT lock.

Then, imagine two threads. One is in the fault path here; another
thread performs some other magic that sets the PTE dirty under PTL.

> 
>>
>> vmf->orig_pte = ptep_get_lockless_norecency(vmf->pte)
>> /* not dirty */
>>
>> /* Now, thread 2 ends up setting the PTE dirty under PT lock. */
>>
>> spin_lock(vmf->ptl);
>> entry = vmf->orig_pte;
>> if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
>>      ...
>> }
>> ...
>> entry = pte_mkyoung(entry);
> 
> Do you mean pte_mkdirty() here? You're talking about dirty everywhere else.

No, that is just thread 1 seeing "oh, nothing to do" and then going ahead
and unconditionally doing that in handle_pte_fault().
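
I.e. roughly this tail of handle_pte_fault() (mm/memory.c, abridged),
where entry was seeded from the possibly stale vmf->orig_pte:

	entry = pte_mkyoung(entry);
	if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
				  vmf->flags & FAULT_FLAG_WRITE))
		update_mmu_cache_range(vmf, vmf->vma, vmf->address,
				       vmf->pte, 1);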

> 
>> if (ptep_set_access_flags(vmf->vma, ...)
>> ...
>> pte_unmap_unlock(vmf->pte, vmf->ptl);
>>
>>
>> Generic ptep_set_access_flags() will do another pte_same() check and realize
>> "hey, there was a change!" let's update the PTE!
>>
>> set_pte_at(vma->vm_mm, address, ptep, entry);
> 
> This is called from the generic ptep_set_access_flags() in your example, right?
> 

Yes.

>>
>> would overwrite the dirty bit set by thread 2.
> 
> I'm not really sure what you are getting at... Is your concern that there is a
> race where the page could become dirty in the meantime and it now gets lost? I
> think that's why arm64 overrides ptep_set_access_flags(); since the hw can
> update access/dirty we have to deal with the races.

My concern is that your patch can in subtle ways lead to us losing PTE
dirty bits on architectures that don't have the HW-managed dirty bit.
They do exist ;)

Arm64 should be fine in that regard.
Ryan Roberts March 26, 2024, 5:48 p.m. UTC | #4
On 26/03/2024 17:38, David Hildenbrand wrote:
> On 26.03.24 18:27, Ryan Roberts wrote:
>> On 26/03/2024 17:02, David Hildenbrand wrote:
>>> On 15.02.24 13:17, Ryan Roberts wrote:
>>>> Let's convert handle_pte_fault()'s use of ptep_get_lockless() to
>>>> ptep_get_lockless_norecency() to save orig_pte.
>>>>
>>>> There are a number of places that follow this model:
>>>>
>>>>       orig_pte = ptep_get_lockless(ptep)
>>>>       ...
>>>>       <lock>
>>>>       if (!pte_same(orig_pte, ptep_get(ptep)))
>>>>               // RACE!
>>>>       ...
>>>>       <unlock>
>>>>
>>>> So we need to be careful to convert all of those to use
>>>> pte_same_norecency() so that the access and dirty bits are excluded from
>>>> the comparison.
>>>>
>>>> Additionally there are a couple of places that genuinely rely on the
>>>> access and dirty bits of orig_pte, but with some careful refactoring, we
>>>> can use ptep_get() once we are holding the lock to achieve equivalent
>>>> logic.
>>>
>>> We really should document that changed behavior somewhere where it can be easily
>>> found: that orig_pte might have incomplete/stale accessed/dirty information.
>>
>> I could add it to the orig_pte definition in the `struct vm_fault`?
>>
>>>
>>>
>>>> @@ -5343,7 +5356,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
>>>>                             vmf->address, &vmf->ptl);
>>>>            if (unlikely(!vmf->pte))
>>>>                return 0;
>>>> -        vmf->orig_pte = ptep_get_lockless(vmf->pte);
>>>> +        vmf->orig_pte = ptep_get_lockless_norecency(vmf->pte);
>>>>            vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID;
>>>>
>>>>            if (pte_none(vmf->orig_pte)) {
>>>> @@ -5363,7 +5376,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
>>>>
>>>>        spin_lock(vmf->ptl);
>>>>        entry = vmf->orig_pte;
>>>> -    if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
>>>> +    if (unlikely(!pte_same_norecency(ptep_get(vmf->pte), entry))) {
>>>>            update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
>>>>            goto unlock;
>>>
>>> I was wondering about the following:
>>>
>>> Assume the PTE is not dirty.
>>>
>>> Thread 1 does
>>
>> Sorry, not sure what threads have to do with this? How is the vmf shared between
>> threads? What have I misunderstood...
> 
> Assume we have HW that does not have HW-managed access/dirty bits. One that
> ends up using generic ptep_set_access_flags(). Access/dirty bits are always
> updated under PT lock.
> 
> Then, imagine two threads. One is in the fault path here; another thread
> performs some other magic that sets the PTE dirty under PTL.
> 
>>
>>>
>>> vmf->orig_pte = ptep_get_lockless_norecency(vmf->pte)
>>> /* not dirty */
>>>
>>> /* Now, thread 2 ends up setting the PTE dirty under PT lock. */

Ahh, this comment about thread 2 is not referring to the code immediately below
it. It all makes much more sense now. :)

>>>
>>> spin_lock(vmf->ptl);
>>> entry = vmf->orig_pte;
>>> if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
>>>      ...
>>> }
>>> ...
>>> entry = pte_mkyoung(entry);
>>
>> Do you mean pte_mkdirty() here? You're talking about dirty everywhere else.
> 
> No, that is just thread 1 seeing "oh, nothing to do" and then going ahead and
> unconditionally doing that in handle_pte_fault().
> 
>>
>>> if (ptep_set_access_flags(vmf->vma, ...)
>>> ...
>>> pte_unmap_unlock(vmf->pte, vmf->ptl);
>>>
>>>
>>> Generic ptep_set_access_flags() will do another pte_same() check and realize
>>> "hey, there was a change!" let's update the PTE!
>>>
>>> set_pte_at(vma->vm_mm, address, ptep, entry);
>>
>> This is called from the generic ptep_set_access_flags() in your example, right?
>>
> 
> Yes.
> 
>>>
>>> would overwrite the dirty bit set by thread 2.
>>
>> I'm not really sure what you are getting at... Is your concern that there is a
>> race where the page could become dirty in the meantime and it now gets lost? I
>> think that's why arm64 overrides ptep_set_access_flags(); since the hw can
>> update access/dirty we have to deal with the races.
> 
> My concern is that your patch can in subtle ways lead to us losing PTE dirty
> bits on architectures that don't have the HW-managed dirty bit. They do exist ;)

But I think the example you give can already happen today? Thread 1 reads
orig_pte = ptep_get_lockless(). So that's already racy: if thread 2 is going to
set dirty just after the get, then thread 1 is going to set the PTE back to (a
modified version of) orig_pte. Isn't it already broken?


> 
> Arm64 should be fine in that regard.
> 

There is plenty of arm64 HW that doesn't do HW access/dirty updates. But our
ptep_set_access_flags() can always deal with a racing update, even if that
update originates from SW.

Why do I have the feeling you're about to explain (very patiently) exactly why
I'm wrong?... :)
David Hildenbrand March 26, 2024, 5:58 p.m. UTC | #5
>>>>
>>>> vmf->orig_pte = ptep_get_lockless_norecency(vmf->pte)
>>>> /* not dirty */
>>>>
>>>> /* Now, thread 2 ends up setting the PTE dirty under PT lock. */
> 
> Ahh, this comment about thread 2 is not referring to the code immediately below
> it. It all makes much more sense now. :)

Sorry :)

> 
>>>>
>>>> spin_lock(vmf->ptl);
>>>> entry = vmf->orig_pte;
>>>> if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
>>>>       ...
>>>> }
>>>> ...
>>>> entry = pte_mkyoung(entry);
>>>
>>> Do you mean pte_mkdirty() here? You're talking about dirty everywhere else.
>>
>> No, that is just thread 1 seeing "oh, nothing to do" and then going ahead and
>> unconditionally doing that in handle_pte_fault().
>>
>>>
>>>> if (ptep_set_access_flags(vmf->vma, ...)
>>>> ...
>>>> pte_unmap_unlock(vmf->pte, vmf->ptl);
>>>>
>>>>
>>>> Generic ptep_set_access_flags() will do another pte_same() check and realize
>>>> "hey, there was a change!" let's update the PTE!
>>>>
>>>> set_pte_at(vma->vm_mm, address, ptep, entry);
>>>
>>> This is called from the generic ptep_set_access_flags() in your example, right?
>>>
>>
>> Yes.
>>
>>>>
>>>> would overwrite the dirty bit set by thread 2.
>>>
>>> I'm not really sure what you are getting at... Is your concern that there is a
>>> race where the page could become dirty in the meantime and it now gets lost? I
>>> think that's why arm64 overrides ptep_set_access_flags(); since the hw can
>>> update access/dirty we have to deal with the races.
>>
>> My concern is that your patch can in subtle ways lead to us losing PTE dirty
>> bits on architectures that don't have the HW-managed dirty bit. They do exist ;)
> 
> But I think the example you give can already happen today? Thread 1 reads
> orig_pte = ptep_get_lockless(). So that's already racy: if thread 2 is going to
> set dirty just after the get, then thread 1 is going to set the PTE back to (a
> modified version of) orig_pte. Isn't it already broken?

No, because the pte_same() check under PTL would have detected it, and
we would have backed out. And I think the problem comes to life when we
convert pte_same()->pte_same_norecency(), because we fail to protect PTE
access/dirty changes that happened under PTL from another thread.
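
Spelling out the interleaving (sketch):

T1: vmf->orig_pte = ptep_get_lockless_norecency(vmf->pte)
			/* recency bits stripped; PTE observed clean */
T2: <PTL> sets the PTE dirty </PTL>
T1: <PTL>
T1: pte_same(ptep_get(vmf->pte), vmf->orig_pte)
			/* false -> back out; dirty bit preserved */
T1: pte_same_norecency(ptep_get(vmf->pte), vmf->orig_pte)
			/* true -> continue; the later
			   ptep_set_access_flags() writes back an
			   entry without the dirty bit */
T1: </PTL>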

But could be I am missing something :)

>> Arm64 should be fine in that regard.
>>
> 
> There is plenty of arm64 HW that doesn't do HW access/dirty updates. But our
> ptep_set_access_flags() can always deal with a racing update, even if that
> update originates from SW.
> 
> Why do I have the feeling you're about to explain (very patiently) exactly why
> I'm wrong?... :)

heh ... or you'll tell me (very patiently) why I am wrong :)
Ryan Roberts March 27, 2024, 9:51 a.m. UTC | #6
On 26/03/2024 17:58, David Hildenbrand wrote:
>>>>>
>>>>> vmf->orig_pte = ptep_get_lockless_norecency(vmf->pte)
>>>>> /* not dirty */
>>>>>
>>>>> /* Now, thread 2 ends up setting the PTE dirty under PT lock. */
>>
>> Ahh, this comment about thread 2 is not referring to the code immediately below
>> it. It all makes much more sense now. :)
> 
> Sorry :)
> 
>>
>>>>>
>>>>> spin_lock(vmf->ptl);
>>>>> entry = vmf->orig_pte;
>>>>> if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
>>>>>       ...
>>>>> }
>>>>> ...
>>>>> entry = pte_mkyoung(entry);
>>>>
>>>> Do you mean pte_mkdirty() here? You're talking about dirty everywhere else.
>>>
>>> No, that is just thread 1 seeing "oh, nothing to do" and then going ahead and
>>> unconditionally doing that in handle_pte_fault().
>>>
>>>>
>>>>> if (ptep_set_access_flags(vmf->vma, ...)
>>>>> ...
>>>>> pte_unmap_unlock(vmf->pte, vmf->ptl);
>>>>>
>>>>>
>>>>> Generic ptep_set_access_flags() will do another pte_same() check and realize
>>>>> "hey, there was a change!" let's update the PTE!
>>>>>
>>>>> set_pte_at(vma->vm_mm, address, ptep, entry);
>>>>
>>>> This is called from the generic ptep_set_access_flags() in your example, right?
>>>>
>>>
>>> Yes.
>>>
>>>>>
>>>>> would overwrite the dirty bit set by thread 2.
>>>>
>>>> I'm not really sure what you are getting at... Is your concern that there is a
>>>> race where the page could become dirty in the meantime and it now gets lost? I
>>>> think that's why arm64 overrides ptep_set_access_flags(); since the hw can
>>>> update access/dirty we have to deal with the races.
>>>
>>> My concern is that your patch can in subtle ways lead to us losing PTE dirty
>>> bits on architectures that don't have the HW-managed dirty bit. They do exist ;)
>>
>> But I think the example you give can already happen today? Thread 1 reads
>> orig_pte = ptep_get_lockless(). So that's already racy: if thread 2 is going to
>> set dirty just after the get, then thread 1 is going to set the PTE back to (a
>> modified version of) orig_pte. Isn't it already broken?
> 
> No, because the pte_same() check under PTL would have detected it, and we would
> have backed out. And I think the problem comes to life when we convert
> pte_same()->pte_same_norecency(), because we fail to protect PTE access/dirty
> changes that happened under PTL from another thread.

Ahh yep. Got it. I absolutely knew that you would be correct, but I still walked
right into it!

I think one could argue that the generic ptep_set_access_flags() is not
implementing its own spec:

"
... Only sets the access flags (dirty, accessed), as well as write permission.
Furthermore, we know it always gets set to a "more permissive" setting ...
"

Surely it should be folding the access and dirty bits from *ptep into entry if
they are set?
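
I.e. something like this (sketch, untested):

int ptep_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pte_t *ptep,
			  pte_t entry, int dirty)
{
	pte_t old = ptep_get(ptep);
	int changed = !pte_same(old, entry);

	if (changed) {
		/* Fold in access/dirty set by HW or another thread. */
		if (pte_young(old))
			entry = pte_mkyoung(entry);
		if (pte_dirty(old))
			entry = pte_mkdirty(entry);
		set_pte_at(vma->vm_mm, address, ptep, entry);
		flush_tlb_fix_spurious_fault(vma, address, ptep);
	}
	return changed;
}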

Regardless, I think this example proves that it's fragile and subtle. I'm not
really sure how to fix it more generally/robustly. Any thoughts? If not, perhaps
we are better off keeping ptep_get_lockless() around and only using
ptep_get_lockless_norecency() for the really obviously correct cases?


> 
> But could be I am missing something :)
> 
>>> Arm64 should be fine in that regard.
>>>
>>
>> There is plenty of arm64 HW that doesn't do HW access/dirty updates. But our
>> ptep_set_access_flags() can always deal with a racing update, even if that
>> update originates from SW.
>>
>> Why do I have the feeling you're about to explain (very patiently) exactly why
>> I'm wrong?... :)
> 
> heh ... or you'll tell me (very patiently) why I am wrong :)
>
David Hildenbrand March 27, 2024, 5:05 p.m. UTC | #7
On 27.03.24 10:51, Ryan Roberts wrote:
> On 26/03/2024 17:58, David Hildenbrand wrote:
>>>>>>
>>>>>> vmf->orig_pte = ptep_get_lockless_norecency(vmf->pte)
>>>>>> /* not dirty */
>>>>>>
>>>>>> /* Now, thread 2 ends up setting the PTE dirty under PT lock. */
>>>
>>> Ahh, this comment about thread 2 is not referring to the code immediately below
>>> it. It all makes much more sense now. :)
>>
>> Sorry :)
>>
>>>
>>>>>>
>>>>>> spin_lock(vmf->ptl);
>>>>>> entry = vmf->orig_pte;
>>>>>> if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
>>>>>>        ...
>>>>>> }
>>>>>> ...
>>>>>> entry = pte_mkyoung(entry);
>>>>>
>>>>> Do you mean pte_mkdirty() here? You're talking about dirty everywhere else.
>>>>
>>>> No, that is just thread 1 seeing "oh, nothing to do" and then going ahead and
>>>> unconditionally doing that in handle_pte_fault().
>>>>
>>>>>
>>>>>> if (ptep_set_access_flags(vmf->vma, ...)
>>>>>> ...
>>>>>> pte_unmap_unlock(vmf->pte, vmf->ptl);
>>>>>>
>>>>>>
>>>>>> Generic ptep_set_access_flags() will do another pte_same() check and realize
>>>>>> "hey, there was a change!" let's update the PTE!
>>>>>>
>>>>>> set_pte_at(vma->vm_mm, address, ptep, entry);
>>>>>
>>>>> This is called from the generic ptep_set_access_flags() in your example, right?
>>>>>
>>>>
>>>> Yes.
>>>>
>>>>>>
>>>>>> would overwrite the dirty bit set by thread 2.
>>>>>
>>>>> I'm not really sure what you are getting at... Is your concern that there is a
>>>>> race where the page could become dirty in the meantime and it now gets lost? I
>>>>> think that's why arm64 overrides ptep_set_access_flags(); since the hw can
>>>>> update access/dirty we have to deal with the races.
>>>>
>>>> My concern is that your patch can in subtle ways lead to us losing PTE dirty
>>>> bits on architectures that don't have the HW-managed dirty bit. They do exist ;)
>>>
>>> But I think the example you give can already happen today? Thread 1 reads
>>> orig_pte = ptep_get_lockless(). So that's already racy: if thread 2 is going to
>>> set dirty just after the get, then thread 1 is going to set the PTE back to (a
>>> modified version of) orig_pte. Isn't it already broken?
>>
>> No, because the pte_same() check under PTL would have detected it, and we would
>> have backed out. And I think the problem comes to life when we convert
>> pte_same()->pte_same_norecency(), because we fail to protect PTE access/dirty
>> changes that happened under PTL from another thread.
> 
> Ahh yep. Got it. I absolutely knew that you would be correct, but I still walked
> right into it!
> 
> I think one could argue that the generic ptep_set_access_flags() is not
> implementing its own spec:
> 
> "
> ... Only sets the access flags (dirty, accessed), as well as write permission.
> Furthermore, we know it always gets set to a "more permissive" setting ...
> "
> 
> Surely it should be folding the access and dirty bits from *ptep into entry if
> they are set?

Likely yes. Unless it's also used to clear access/dirty (don't think so, 
and would not be documented).

But the simplification made sense for now, because you previously
checked pte_same(), and nobody can modify the PTE concurrently.

> 
> Regardless, I think this example proves that it's fragile and subtle. I'm not
> really sure how to fix it more generally/robustly. Any thoughts? If not, perhaps
> we are better off keeping ptep_get_lockless() around and only using
> ptep_get_lockless_norecency() for the really obviously correct cases?

Maybe one of the "sources of problems" is that we have a
ptep_get_lockless_norecency() call followed by a pte_same() check, as
done here.

Not the source of all problems, I believe, though ...
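
To spell that out (sketch): once orig_pte was read with
ptep_get_lockless_norecency(), neither comparison under the PTL is
fully satisfying:

	pte_same(ptep_get(ptep), orig_pte)
		/* spuriously fails whenever the live PTE has
		   young/dirty set, because those bits were stripped
		   from orig_pte */

	pte_same_norecency(ptep_get(ptep), orig_pte)
		/* cannot notice a concurrent young/dirty update, so
		   stale recency in 'entry' may later be written back
		   via ptep_set_access_flags()/set_pte_at() */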

Patch

diff --git a/mm/memory.c b/mm/memory.c
index 8e65fa1884f1..075245314ec3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3014,7 +3014,7 @@  static inline int pte_unmap_same(struct vm_fault *vmf)
 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
 	if (sizeof(pte_t) > sizeof(unsigned long)) {
 		spin_lock(vmf->ptl);
-		same = pte_same(ptep_get(vmf->pte), vmf->orig_pte);
+		same = pte_same_norecency(ptep_get(vmf->pte), vmf->orig_pte);
 		spin_unlock(vmf->ptl);
 	}
 #endif
@@ -3062,11 +3062,14 @@  static inline int __wp_page_copy_user(struct page *dst, struct page *src,
 	 * take a double page fault, so mark it accessed here.
 	 */
 	vmf->pte = NULL;
-	if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
+	if (!arch_has_hw_pte_young()) {
 		pte_t entry;

 		vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
-		if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
+		if (likely(vmf->pte))
+			entry = ptep_get(vmf->pte);
+		if (unlikely(!vmf->pte ||
+			     !pte_same_norecency(entry, vmf->orig_pte))) {
 			/*
 			 * Other thread has already handled the fault
 			 * and update local tlb only
@@ -3077,9 +3080,11 @@  static inline int __wp_page_copy_user(struct page *dst, struct page *src,
 			goto pte_unlock;
 		}

-		entry = pte_mkyoung(vmf->orig_pte);
-		if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
-			update_mmu_cache_range(vmf, vma, addr, vmf->pte, 1);
+		if (!pte_young(entry)) {
+			entry = pte_mkyoung(entry);
+			if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
+				update_mmu_cache_range(vmf, vma, addr, vmf->pte, 1);
+		}
 	}

 	/*
@@ -3094,7 +3099,8 @@  static inline int __wp_page_copy_user(struct page *dst, struct page *src,

 		/* Re-validate under PTL if the page is still mapped */
 		vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
-		if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
+		if (unlikely(!vmf->pte ||
+			     !pte_same_norecency(ptep_get(vmf->pte), vmf->orig_pte))) {
 			/* The PTE changed under us, update local tlb */
 			if (vmf->pte)
 				update_mmu_tlb(vma, addr, vmf->pte);
@@ -3369,14 +3375,17 @@  static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 	 * Re-check the pte - we dropped the lock
 	 */
 	vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
-	if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
+	if (likely(vmf->pte))
+		entry = ptep_get(vmf->pte);
+	if (likely(vmf->pte && pte_same_norecency(entry, vmf->orig_pte))) {
 		if (old_folio) {
 			if (!folio_test_anon(old_folio)) {
 				dec_mm_counter(mm, mm_counter_file(old_folio));
 				inc_mm_counter(mm, MM_ANONPAGES);
 			}
 		} else {
-			ksm_might_unmap_zero_page(mm, vmf->orig_pte);
+			/* Needs dirty bit so can't use vmf->orig_pte. */
+			ksm_might_unmap_zero_page(mm, entry);
 			inc_mm_counter(mm, MM_ANONPAGES);
 		}
 		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
@@ -3494,7 +3503,7 @@  static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf, struct folio *folio
 	 * We might have raced with another page fault while we released the
 	 * pte_offset_map_lock.
 	 */
-	if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) {
+	if (!pte_same_norecency(ptep_get(vmf->pte), vmf->orig_pte)) {
 		update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
 		return VM_FAULT_NOPAGE;
@@ -3883,7 +3892,8 @@  static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)

 	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
 				&vmf->ptl);
-	if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
+	if (likely(vmf->pte &&
+		   pte_same_norecency(ptep_get(vmf->pte), vmf->orig_pte)))
 		restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte);

 	if (vmf->pte)
@@ -3928,7 +3938,7 @@  static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
 	 * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED.
 	 * So is_pte_marker() check is not enough to safely drop the pte.
 	 */
-	if (pte_same(vmf->orig_pte, ptep_get(vmf->pte)))
+	if (pte_same_norecency(vmf->orig_pte, ptep_get(vmf->pte)))
 		pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
 	return 0;
@@ -4028,8 +4038,8 @@  vm_fault_t do_swap_page(struct vm_fault *vmf)
 			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
 					vmf->address, &vmf->ptl);
 			if (unlikely(!vmf->pte ||
-				     !pte_same(ptep_get(vmf->pte),
-							vmf->orig_pte)))
+				     !pte_same_norecency(ptep_get(vmf->pte),
+							 vmf->orig_pte)))
 				goto unlock;

 			/*
@@ -4117,7 +4127,8 @@  vm_fault_t do_swap_page(struct vm_fault *vmf)
 			vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
 					vmf->address, &vmf->ptl);
 			if (likely(vmf->pte &&
-				   pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
+				   pte_same_norecency(ptep_get(vmf->pte),
+						      vmf->orig_pte)))
 				ret = VM_FAULT_OOM;
 			goto unlock;
 		}
@@ -4187,7 +4198,8 @@  vm_fault_t do_swap_page(struct vm_fault *vmf)
 	 */
 	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
 			&vmf->ptl);
-	if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
+	if (unlikely(!vmf->pte ||
+		     !pte_same_norecency(ptep_get(vmf->pte), vmf->orig_pte)))
 		goto out_nomap;

 	if (unlikely(!folio_test_uptodate(folio))) {
@@ -4747,7 +4759,7 @@  void set_pte_range(struct vm_fault *vmf, struct folio *folio,
 static bool vmf_pte_changed(struct vm_fault *vmf)
 {
 	if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)
-		return !pte_same(ptep_get(vmf->pte), vmf->orig_pte);
+		return !pte_same_norecency(ptep_get(vmf->pte), vmf->orig_pte);

 	return !pte_none(ptep_get(vmf->pte));
 }
@@ -5125,7 +5137,7 @@  static vm_fault_t do_numa_page(struct vm_fault *vmf)
 	 * the pfn may be screwed if the read is non atomic.
 	 */
 	spin_lock(vmf->ptl);
-	if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
+	if (unlikely(!pte_same_norecency(ptep_get(vmf->pte), vmf->orig_pte))) {
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
 		goto out;
 	}
@@ -5197,7 +5209,8 @@  static vm_fault_t do_numa_page(struct vm_fault *vmf)
 					       vmf->address, &vmf->ptl);
 		if (unlikely(!vmf->pte))
 			goto out;
-		if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
+		if (unlikely(!pte_same_norecency(ptep_get(vmf->pte),
+							  vmf->orig_pte))) {
 			pte_unmap_unlock(vmf->pte, vmf->ptl);
 			goto out;
 		}
@@ -5343,7 +5356,7 @@  static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 						 vmf->address, &vmf->ptl);
 		if (unlikely(!vmf->pte))
 			return 0;
-		vmf->orig_pte = ptep_get_lockless(vmf->pte);
+		vmf->orig_pte = ptep_get_lockless_norecency(vmf->pte);
 		vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID;

 		if (pte_none(vmf->orig_pte)) {
@@ -5363,7 +5376,7 @@  static vm_fault_t handle_pte_fault(struct vm_fault *vmf)

 	spin_lock(vmf->ptl);
 	entry = vmf->orig_pte;
-	if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
+	if (unlikely(!pte_same_norecency(ptep_get(vmf->pte), entry))) {
 		update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
 		goto unlock;
 	}