
[v2,1/2] mm/migrate_device.c: Copy pte dirty bit to page

Message ID 6e77914685ede036c419fa65b6adc27f25a6c3e9.1660635033.git-series.apopple@nvidia.com (mailing list archive)
State New
Series: [v2,1/2] mm/migrate_device.c: Copy pte dirty bit to page

Commit Message

Alistair Popple Aug. 16, 2022, 7:39 a.m. UTC
migrate_vma_setup() has a fast path in migrate_vma_collect_pmd() that
installs migration entries directly if it can lock the migrating page.
When removing a dirty pte the dirty bit is supposed to be carried over
to the underlying page to prevent it being lost.

Currently migrate_vma_*() can only be used for private anonymous
mappings. That means loss of the dirty bit usually doesn't result in
data loss because these pages are typically not file-backed. However
pages may be backed by swap storage which can result in data loss if an
attempt is made to migrate a dirty page that doesn't yet have the
PageDirty flag set.

In this case migration will fail due to unexpected references but the
dirty pte bit will be lost. If the page is subsequently reclaimed data
won't be written back to swap storage as it is considered uptodate,
resulting in data loss if the page is subsequently accessed.

Prevent this by copying the dirty bit to the page when removing the pte
to match what try_to_migrate_one() does.

Signed-off-by: Alistair Popple <apopple@nvidia.com>
Acked-by: Peter Xu <peterx@redhat.com>
Reported-by: Huang Ying <ying.huang@intel.com>
Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
Cc: stable@vger.kernel.org

---

Changes for v2:

 - Fixed up Reported-by tag.
 - Added Peter's Acked-by.
 - Atomically read and clear the pte to prevent the dirty bit getting
   set after reading it.
 - Added fixes tag
---
 mm/migrate_device.c | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)


base-commit: ffcf9c5700e49c0aee42dcba9a12ba21338e8136
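
For reference, the shape of the change (condensed from the diff quoted in
the replies below; not a standalone patch) is:

    /* in migrate_vma_collect_pmd(), with the PTL held and the page locked */
    flush_cache_page(vma, addr, pte_pfn(*ptep));
    pte = ptep_clear_flush(vma, addr, ptep);   /* read and clear atomically */

    if (anon_exclusive && page_try_share_anon_rmap(page)) {
            set_pte_at(mm, addr, ptep, pte);   /* restore pte, give up */
            /* ... unlock page, drop reference, goto next ... */
    }

    migrate->cpages++;

    /* Set the dirty flag on the folio now the pte is gone. */
    if (pte_dirty(pte))
            folio_mark_dirty(page_folio(page));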

Comments

huang ying Aug. 16, 2022, 8:10 a.m. UTC | #1
On Tue, Aug 16, 2022 at 3:39 PM Alistair Popple <apopple@nvidia.com> wrote:
>
> migrate_vma_setup() has a fast path in migrate_vma_collect_pmd() that
> installs migration entries directly if it can lock the migrating page.
> When removing a dirty pte the dirty bit is supposed to be carried over
> to the underlying page to prevent it being lost.
>
> Currently migrate_vma_*() can only be used for private anonymous
> mappings. That means loss of the dirty bit usually doesn't result in
> data loss because these pages are typically not file-backed. However
> pages may be backed by swap storage which can result in data loss if an
> attempt is made to migrate a dirty page that doesn't yet have the
> PageDirty flag set.
>
> In this case migration will fail due to unexpected references but the
> dirty pte bit will be lost. If the page is subsequently reclaimed data
> won't be written back to swap storage as it is considered uptodate,
> resulting in data loss if the page is subsequently accessed.
>
> Prevent this by copying the dirty bit to the page when removing the pte
> to match what try_to_migrate_one() does.
>
> Signed-off-by: Alistair Popple <apopple@nvidia.com>
> Acked-by: Peter Xu <peterx@redhat.com>
> Reported-by: Huang Ying <ying.huang@intel.com>
> Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
> Cc: stable@vger.kernel.org
>
> ---
>
> Changes for v2:
>
>  - Fixed up Reported-by tag.
>  - Added Peter's Acked-by.
>  - Atomically read and clear the pte to prevent the dirty bit getting
>    set after reading it.
>  - Added fixes tag
> ---
>  mm/migrate_device.c | 21 ++++++++-------------
>  1 file changed, 8 insertions(+), 13 deletions(-)
>
> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
> index 27fb37d..e2d09e5 100644
> --- a/mm/migrate_device.c
> +++ b/mm/migrate_device.c
> @@ -7,6 +7,7 @@
>  #include <linux/export.h>
>  #include <linux/memremap.h>
>  #include <linux/migrate.h>
> +#include <linux/mm.h>
>  #include <linux/mm_inline.h>
>  #include <linux/mmu_notifier.h>
>  #include <linux/oom.h>
> @@ -61,7 +62,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>         struct migrate_vma *migrate = walk->private;
>         struct vm_area_struct *vma = walk->vma;
>         struct mm_struct *mm = vma->vm_mm;
> -       unsigned long addr = start, unmapped = 0;
> +       unsigned long addr = start;
>         spinlock_t *ptl;
>         pte_t *ptep;
>
> @@ -193,11 +194,10 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>                         bool anon_exclusive;
>                         pte_t swp_pte;
>
> +                       flush_cache_page(vma, addr, pte_pfn(*ptep));
> +                       pte = ptep_clear_flush(vma, addr, ptep);

Although I think it's possible to batch the TLB flushing just before
unlocking PTL.  The current code looks correct.

Reviewed-by: "Huang, Ying" <ying.huang@intel.com>

Best Regards,
Huang, Ying

>                         anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
>                         if (anon_exclusive) {
> -                               flush_cache_page(vma, addr, pte_pfn(*ptep));
> -                               ptep_clear_flush(vma, addr, ptep);
> -
>                                 if (page_try_share_anon_rmap(page)) {
>                                         set_pte_at(mm, addr, ptep, pte);
>                                         unlock_page(page);
> @@ -205,12 +205,14 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>                                         mpfn = 0;
>                                         goto next;
>                                 }
> -                       } else {
> -                               ptep_get_and_clear(mm, addr, ptep);
>                         }
>
>                         migrate->cpages++;
>
> +                       /* Set the dirty flag on the folio now the pte is gone. */
> +                       if (pte_dirty(pte))
> +                               folio_mark_dirty(page_folio(page));
> +
>                         /* Setup special migration page table entry */
>                         if (mpfn & MIGRATE_PFN_WRITE)
>                                 entry = make_writable_migration_entry(
> @@ -242,9 +244,6 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>                          */
>                         page_remove_rmap(page, vma, false);
>                         put_page(page);
> -
> -                       if (pte_present(pte))
> -                               unmapped++;
>                 } else {
>                         put_page(page);
>                         mpfn = 0;
> @@ -257,10 +256,6 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>         arch_leave_lazy_mmu_mode();
>         pte_unmap_unlock(ptep - 1, ptl);
>
> -       /* Only flush the TLB if we actually modified any entries */
> -       if (unmapped)
> -               flush_tlb_range(walk->vma, start, end);
> -
>         return 0;
>  }
>
>
> base-commit: ffcf9c5700e49c0aee42dcba9a12ba21338e8136
> --
> git-series 0.9.1
>
Peter Xu Aug. 16, 2022, 8:35 p.m. UTC | #2
On Tue, Aug 16, 2022 at 04:10:29PM +0800, huang ying wrote:
> > @@ -193,11 +194,10 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> >                         bool anon_exclusive;
> >                         pte_t swp_pte;
> >
> > +                       flush_cache_page(vma, addr, pte_pfn(*ptep));
> > +                       pte = ptep_clear_flush(vma, addr, ptep);
> 
> Although I think it's possible to batch the TLB flushing just before
> unlocking PTL.  The current code looks correct.

If we're with unconditionally ptep_clear_flush(), does it mean we should
probably drop the "unmapped" and the last flush_tlb_range() already since
they'll be redundant?

If that'll need to be dropped, it looks indeed better to still keep the
batch to me but just move it earlier (before unlock iiuc then it'll be
safe), then we can keep using ptep_get_and_clear() afaiu but keep "pte"
updated.

Thanks,
Alistair Popple Aug. 17, 2022, 1:38 a.m. UTC | #3
huang ying <huang.ying.caritas@gmail.com> writes:

> On Tue, Aug 16, 2022 at 3:39 PM Alistair Popple <apopple@nvidia.com> wrote:
>>
>> migrate_vma_setup() has a fast path in migrate_vma_collect_pmd() that
>> installs migration entries directly if it can lock the migrating page.
>> When removing a dirty pte the dirty bit is supposed to be carried over
>> to the underlying page to prevent it being lost.
>>
>> Currently migrate_vma_*() can only be used for private anonymous
>> mappings. That means loss of the dirty bit usually doesn't result in
>> data loss because these pages are typically not file-backed. However
>> pages may be backed by swap storage which can result in data loss if an
>> attempt is made to migrate a dirty page that doesn't yet have the
>> PageDirty flag set.
>>
>> In this case migration will fail due to unexpected references but the
>> dirty pte bit will be lost. If the page is subsequently reclaimed data
>> won't be written back to swap storage as it is considered uptodate,
>> resulting in data loss if the page is subsequently accessed.
>>
>> Prevent this by copying the dirty bit to the page when removing the pte
>> to match what try_to_migrate_one() does.
>>
>> Signed-off-by: Alistair Popple <apopple@nvidia.com>
>> Acked-by: Peter Xu <peterx@redhat.com>
>> Reported-by: Huang Ying <ying.huang@intel.com>
>> Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages")
>> Cc: stable@vger.kernel.org
>>
>> ---
>>
>> Changes for v2:
>>
>>  - Fixed up Reported-by tag.
>>  - Added Peter's Acked-by.
>>  - Atomically read and clear the pte to prevent the dirty bit getting
>>    set after reading it.
>>  - Added fixes tag
>> ---
>>  mm/migrate_device.c | 21 ++++++++-------------
>>  1 file changed, 8 insertions(+), 13 deletions(-)
>>
>> diff --git a/mm/migrate_device.c b/mm/migrate_device.c
>> index 27fb37d..e2d09e5 100644
>> --- a/mm/migrate_device.c
>> +++ b/mm/migrate_device.c
>> @@ -7,6 +7,7 @@
>>  #include <linux/export.h>
>>  #include <linux/memremap.h>
>>  #include <linux/migrate.h>
>> +#include <linux/mm.h>
>>  #include <linux/mm_inline.h>
>>  #include <linux/mmu_notifier.h>
>>  #include <linux/oom.h>
>> @@ -61,7 +62,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>>         struct migrate_vma *migrate = walk->private;
>>         struct vm_area_struct *vma = walk->vma;
>>         struct mm_struct *mm = vma->vm_mm;
>> -       unsigned long addr = start, unmapped = 0;
>> +       unsigned long addr = start;
>>         spinlock_t *ptl;
>>         pte_t *ptep;
>>
>> @@ -193,11 +194,10 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>>                         bool anon_exclusive;
>>                         pte_t swp_pte;
>>
>> +                       flush_cache_page(vma, addr, pte_pfn(*ptep));
>> +                       pte = ptep_clear_flush(vma, addr, ptep);
>
> Although I think it's possible to batch the TLB flushing just before
> unlocking PTL.  The current code looks correct.

I think you might be right but I'd rather deal with batch TLB flushing
as a separate change that implements it for normal migration as well
given we don't seem to do it there either.

> Reviewed-by: "Huang, Ying" <ying.huang@intel.com>

Thanks.

> Best Regards,
> Huang, Ying
>
>>                         anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
>>                         if (anon_exclusive) {
>> -                               flush_cache_page(vma, addr, pte_pfn(*ptep));
>> -                               ptep_clear_flush(vma, addr, ptep);
>> -
>>                                 if (page_try_share_anon_rmap(page)) {
>>                                         set_pte_at(mm, addr, ptep, pte);
>>                                         unlock_page(page);
>> @@ -205,12 +205,14 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>>                                         mpfn = 0;
>>                                         goto next;
>>                                 }
>> -                       } else {
>> -                               ptep_get_and_clear(mm, addr, ptep);
>>                         }
>>
>>                         migrate->cpages++;
>>
>> +                       /* Set the dirty flag on the folio now the pte is gone. */
>> +                       if (pte_dirty(pte))
>> +                               folio_mark_dirty(page_folio(page));
>> +
>>                         /* Setup special migration page table entry */
>>                         if (mpfn & MIGRATE_PFN_WRITE)
>>                                 entry = make_writable_migration_entry(
>> @@ -242,9 +244,6 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>>                          */
>>                         page_remove_rmap(page, vma, false);
>>                         put_page(page);
>> -
>> -                       if (pte_present(pte))
>> -                               unmapped++;
>>                 } else {
>>                         put_page(page);
>>                         mpfn = 0;
>> @@ -257,10 +256,6 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>>         arch_leave_lazy_mmu_mode();
>>         pte_unmap_unlock(ptep - 1, ptl);
>>
>> -       /* Only flush the TLB if we actually modified any entries */
>> -       if (unmapped)
>> -               flush_tlb_range(walk->vma, start, end);
>> -
>>         return 0;
>>  }
>>
>>
>> base-commit: ffcf9c5700e49c0aee42dcba9a12ba21338e8136
>> --
>> git-series 0.9.1
>>
Alistair Popple Aug. 17, 2022, 1:49 a.m. UTC | #4
Peter Xu <peterx@redhat.com> writes:

> On Tue, Aug 16, 2022 at 04:10:29PM +0800, huang ying wrote:
>> > @@ -193,11 +194,10 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>> >                         bool anon_exclusive;
>> >                         pte_t swp_pte;
>> >
>> > +                       flush_cache_page(vma, addr, pte_pfn(*ptep));
>> > +                       pte = ptep_clear_flush(vma, addr, ptep);
>>
>> Although I think it's possible to batch the TLB flushing just before
>> unlocking PTL.  The current code looks correct.
>
> If we're with unconditionally ptep_clear_flush(), does it mean we should
> probably drop the "unmapped" and the last flush_tlb_range() already since
> they'll be redundant?

This patch does that, unless I missed something?

> If that'll need to be dropped, it looks indeed better to still keep the
> batch to me but just move it earlier (before unlock iiuc then it'll be
> safe), then we can keep using ptep_get_and_clear() afaiu but keep "pte"
> updated.

I think we would also need to check should_defer_flush(). Looking at
try_to_unmap_one() there is this comment:

			if (should_defer_flush(mm, flags) && !anon_exclusive) {
				/*
				 * We clear the PTE but do not flush so potentially
				 * a remote CPU could still be writing to the folio.
				 * If the entry was previously clean then the
				 * architecture must guarantee that a clear->dirty
				 * transition on a cached TLB entry is written through
				 * and traps if the PTE is unmapped.
				 */

And as I understand it we'd need the same guarantee here. Given
try_to_migrate_one() doesn't do batched TLB flushes either I'd rather
keep the code as consistent as possible between
migrate_vma_collect_pmd() and try_to_migrate_one(). I could look at
introducing TLB flushing for both in some future patch series.

 - Alistair

> Thanks,
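
For reference, the comment quoted above sits in try_to_unmap_one(), in a
branch that looks roughly like this (paraphrased from mm/rmap.c of that
era; details may differ between kernel versions):

    if (should_defer_flush(mm, flags) && !anon_exclusive) {
            /* clear the pte, leave the TLB flush to the batching code */
            pteval = ptep_get_and_clear(mm, address, pvmw.pte);
            set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
    } else {
            /* clear the pte and flush this mapping immediately */
            pteval = ptep_clear_flush(vma, address, pvmw.pte);
    }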
Peter Xu Aug. 17, 2022, 2:45 a.m. UTC | #5
On Wed, Aug 17, 2022 at 11:49:03AM +1000, Alistair Popple wrote:
> 
> Peter Xu <peterx@redhat.com> writes:
> 
> > On Tue, Aug 16, 2022 at 04:10:29PM +0800, huang ying wrote:
> >> > @@ -193,11 +194,10 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
> >> >                         bool anon_exclusive;
> >> >                         pte_t swp_pte;
> >> >
> >> > +                       flush_cache_page(vma, addr, pte_pfn(*ptep));
> >> > +                       pte = ptep_clear_flush(vma, addr, ptep);
> >>
> >> Although I think it's possible to batch the TLB flushing just before
> >> unlocking PTL.  The current code looks correct.
> >
> > If we're with unconditionally ptep_clear_flush(), does it mean we should
> > probably drop the "unmapped" and the last flush_tlb_range() already since
> > they'll be redundant?
> 
> This patch does that, unless I missed something?

Yes it does.  Somehow I didn't read into the real v2 patch, sorry!

> 
> > If that'll need to be dropped, it looks indeed better to still keep the
> > batch to me but just move it earlier (before unlock iiuc then it'll be
> > safe), then we can keep using ptep_get_and_clear() afaiu but keep "pte"
> > updated.
> 
> I think we would also need to check should_defer_flush(). Looking at
> try_to_unmap_one() there is this comment:
> 
> 			if (should_defer_flush(mm, flags) && !anon_exclusive) {
> 				/*
> 				 * We clear the PTE but do not flush so potentially
> 				 * a remote CPU could still be writing to the folio.
> 				 * If the entry was previously clean then the
> 				 * architecture must guarantee that a clear->dirty
> 				 * transition on a cached TLB entry is written through
> 				 * and traps if the PTE is unmapped.
> 				 */
> 
> And as I understand it we'd need the same guarantee here. Given
> try_to_migrate_one() doesn't do batched TLB flushes either I'd rather
> keep the code as consistent as possible between
> migrate_vma_collect_pmd() and try_to_migrate_one(). I could look at
> introducing TLB flushing for both in some future patch series.

should_defer_flush() is TTU-specific code?

IIUC the caller sets TTU_BATCH_FLUSH showing that tlb can be omitted since
the caller will be responsible for doing it.  In migrate_vma_collect_pmd()
iiuc we don't need that hint because it'll be flushed within the same
function but just only after the loop of modifying the ptes.  Also it'll be
with the pgtable lock held.

Indeed try_to_migrate_one() doesn't do batching either, but IMHO it's just
harder to do due to using the vma walker (e.g., the lock is released in
not_found() implicitly so iiuc it's hard to do tlb flush batching safely in
the loop of page_vma_mapped_walk).  Also that's less a concern since the
loop will only operate upon >1 ptes only if it's a thp page mapped in ptes.
OTOH migrate_vma_collect_pmd() operates on all ptes on a pmd always.

No strong opinion anyway, it's just a bit of a pity because fundamentally
this patch is removing the batching tlb flush.  I also don't know whether
there'll be an observable perf degradation for migrate_vma_collect_pmd(),
especially on large machines.

Thanks,
Alistair Popple Aug. 17, 2022, 5:41 a.m. UTC | #6
Peter Xu <peterx@redhat.com> writes:

> On Wed, Aug 17, 2022 at 11:49:03AM +1000, Alistair Popple wrote:
>>
>> Peter Xu <peterx@redhat.com> writes:
>>
>> > On Tue, Aug 16, 2022 at 04:10:29PM +0800, huang ying wrote:
>> >> > @@ -193,11 +194,10 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>> >> >                         bool anon_exclusive;
>> >> >                         pte_t swp_pte;
>> >> >
>> >> > +                       flush_cache_page(vma, addr, pte_pfn(*ptep));
>> >> > +                       pte = ptep_clear_flush(vma, addr, ptep);
>> >>
>> >> Although I think it's possible to batch the TLB flushing just before
>> >> unlocking PTL.  The current code looks correct.
>> >
>> > If we're with unconditionally ptep_clear_flush(), does it mean we should
>> > probably drop the "unmapped" and the last flush_tlb_range() already since
>> > they'll be redundant?
>>
>> This patch does that, unless I missed something?
>
> Yes it does.  Somehow I didn't read into the real v2 patch, sorry!
>
>>
>> > If that'll need to be dropped, it looks indeed better to still keep the
>> > batch to me but just move it earlier (before unlock iiuc then it'll be
>> > safe), then we can keep using ptep_get_and_clear() afaiu but keep "pte"
>> > updated.
>>
>> I think we would also need to check should_defer_flush(). Looking at
>> try_to_unmap_one() there is this comment:
>>
>> 			if (should_defer_flush(mm, flags) && !anon_exclusive) {
>> 				/*
>> 				 * We clear the PTE but do not flush so potentially
>> 				 * a remote CPU could still be writing to the folio.
>> 				 * If the entry was previously clean then the
>> 				 * architecture must guarantee that a clear->dirty
>> 				 * transition on a cached TLB entry is written through
>> 				 * and traps if the PTE is unmapped.
>> 				 */
>>
>> And as I understand it we'd need the same guarantee here. Given
>> try_to_migrate_one() doesn't do batched TLB flushes either I'd rather
>> keep the code as consistent as possible between
>> migrate_vma_collect_pmd() and try_to_migrate_one(). I could look at
>> introducing TLB flushing for both in some future patch series.
>
> should_defer_flush() is TTU-specific code?

I'm not sure, but I think we need the same guarantee here as mentioned
in the comment otherwise we wouldn't see a subsequent CPU write that
could dirty the PTE after we have cleared it but before the TLB flush.

My assumption was should_defer_flush() would ensure we have that
guarantee from the architecture, but maybe there are alternate/better
ways of enforcing that?

> IIUC the caller sets TTU_BATCH_FLUSH showing that tlb can be omitted since
> the caller will be responsible for doing it.  In migrate_vma_collect_pmd()
> iiuc we don't need that hint because it'll be flushed within the same
> function but just only after the loop of modifying the ptes.  Also it'll be
> with the pgtable lock held.

Right, but the pgtable lock doesn't protect against HW PTE changes such
as setting the dirty bit so we need to ensure the HW does the right
thing here and I don't know if all HW does.

> Indeed try_to_migrate_one() doesn't do batching either, but IMHO it's just
> harder to do due to using the vma walker (e.g., the lock is released in
> not_found() implicitly so iiuc it's hard to do tlb flush batching safely in
> the loop of page_vma_mapped_walk).  Also that's less a concern since the
> loop will only operate upon >1 ptes only if it's a thp page mapped in ptes.
> OTOH migrate_vma_collect_pmd() operates on all ptes on a pmd always.

Yes, I had forgotten we loop over multiple ptes under the same PTL so
didn't think removing the batching/range flush would cause all that much
of a problem.

> No strong opinion anyway, it's just a bit of a pity because fundamentally
> this patch is removing the batching tlb flush.  I also don't know whether
> there'll be observe-able perf degrade for migrate_vma_collect_pmd(),
> especially on large machines.

I agree it's a pity. OTOH the original code isn't correct, and it's not
entirely clear to me that just moving it under the PTL is entirely
correct either. So unless someone is confident and can convince me that
just moving it under the PTL is fine I'd rather stick with this fix
which we all agree is at least correct.

My primary concern with batching is ensuring a CPU write after clearing
a clean PTE but before flushing the TLB does the "right thing" (ie. faults
if the PTE is not present).

> Thanks,
Huang, Ying Aug. 17, 2022, 7:17 a.m. UTC | #7
Alistair Popple <apopple@nvidia.com> writes:

> Peter Xu <peterx@redhat.com> writes:
>
>> On Wed, Aug 17, 2022 at 11:49:03AM +1000, Alistair Popple wrote:
>>>
>>> Peter Xu <peterx@redhat.com> writes:
>>>
>>> > On Tue, Aug 16, 2022 at 04:10:29PM +0800, huang ying wrote:
>>> >> > @@ -193,11 +194,10 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>>> >> >                         bool anon_exclusive;
>>> >> >                         pte_t swp_pte;
>>> >> >
>>> >> > +                       flush_cache_page(vma, addr, pte_pfn(*ptep));
>>> >> > +                       pte = ptep_clear_flush(vma, addr, ptep);
>>> >>
>>> >> Although I think it's possible to batch the TLB flushing just before
>>> >> unlocking PTL.  The current code looks correct.
>>> >
>>> > If we're with unconditionally ptep_clear_flush(), does it mean we should
>>> > probably drop the "unmapped" and the last flush_tlb_range() already since
>>> > they'll be redundant?
>>>
>>> This patch does that, unless I missed something?
>>
>> Yes it does.  Somehow I didn't read into the real v2 patch, sorry!
>>
>>>
>>> > If that'll need to be dropped, it looks indeed better to still keep the
>>> > batch to me but just move it earlier (before unlock iiuc then it'll be
>>> > safe), then we can keep using ptep_get_and_clear() afaiu but keep "pte"
>>> > updated.
>>>
>>> I think we would also need to check should_defer_flush(). Looking at
>>> try_to_unmap_one() there is this comment:
>>>
>>> 			if (should_defer_flush(mm, flags) && !anon_exclusive) {
>>> 				/*
>>> 				 * We clear the PTE but do not flush so potentially
>>> 				 * a remote CPU could still be writing to the folio.
>>> 				 * If the entry was previously clean then the
>>> 				 * architecture must guarantee that a clear->dirty
>>> 				 * transition on a cached TLB entry is written through
>>> 				 * and traps if the PTE is unmapped.
>>> 				 */
>>>
>>> And as I understand it we'd need the same guarantee here. Given
>>> try_to_migrate_one() doesn't do batched TLB flushes either I'd rather
>>> keep the code as consistent as possible between
>>> migrate_vma_collect_pmd() and try_to_migrate_one(). I could look at
>>> introducing TLB flushing for both in some future patch series.
>>
>> should_defer_flush() is TTU-specific code?
>
> I'm not sure, but I think we need the same guarantee here as mentioned
> in the comment otherwise we wouldn't see a subsequent CPU write that
> could dirty the PTE after we have cleared it but before the TLB flush.
>
> My assumption was should_defer_flush() would ensure we have that
> guarantee from the architecture, but maybe there are alternate/better
> ways of enforcing that?
>> IIUC the caller sets TTU_BATCH_FLUSH showing that tlb can be omitted since
>> the caller will be responsible for doing it.  In migrate_vma_collect_pmd()
>> iiuc we don't need that hint because it'll be flushed within the same
>> function but just only after the loop of modifying the ptes.  Also it'll be
>> with the pgtable lock held.
>
> Right, but the pgtable lock doesn't protect against HW PTE changes such
> as setting the dirty bit so we need to ensure the HW does the right
> thing here and I don't know if all HW does.

This sounds sensible.  But I took a look at zap_pte_range(), and it appears
that the implementation requires the PTE dirty bit to be write-through.  Am
I missing something?

Hi, Nadav, Can you help?

Best Regards,
Huang, Ying

[snip]
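
For context, the zap_pte_range() pattern Ying refers to is roughly the
following (paraphrased; the flush is deferred to the mmu_gather, so only
the dirty bit sampled at clear time is ever consulted):

    ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
    tlb_remove_tlb_entry(tlb, pte, addr);  /* flush is batched, not immediate */
    ...
    if (!PageAnon(page)) {
            if (pte_dirty(ptent)) {
                    force_flush = 1;       /* flush before the page is freed */
                    set_page_dirty(page);
            }
            ...
    }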
Nadav Amit Aug. 17, 2022, 9:41 a.m. UTC | #8
On Aug 17, 2022, at 12:17 AM, Huang, Ying <ying.huang@intel.com> wrote:

> Alistair Popple <apopple@nvidia.com> writes:
> 
>> Peter Xu <peterx@redhat.com> writes:
>> 
>>> On Wed, Aug 17, 2022 at 11:49:03AM +1000, Alistair Popple wrote:
>>>> Peter Xu <peterx@redhat.com> writes:
>>>> 
>>>>> On Tue, Aug 16, 2022 at 04:10:29PM +0800, huang ying wrote:
>>>>>>> @@ -193,11 +194,10 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>>>>>>>                        bool anon_exclusive;
>>>>>>>                        pte_t swp_pte;
>>>>>>> 
>>>>>>> +                       flush_cache_page(vma, addr, pte_pfn(*ptep));
>>>>>>> +                       pte = ptep_clear_flush(vma, addr, ptep);
>>>>>> 
>>>>>> Although I think it's possible to batch the TLB flushing just before
>>>>>> unlocking PTL.  The current code looks correct.
>>>>> 
>>>>> If we're with unconditionally ptep_clear_flush(), does it mean we should
>>>>> probably drop the "unmapped" and the last flush_tlb_range() already since
>>>>> they'll be redundant?
>>>> 
>>>> This patch does that, unless I missed something?
>>> 
>>> Yes it does.  Somehow I didn't read into the real v2 patch, sorry!
>>> 
>>>>> If that'll need to be dropped, it looks indeed better to still keep the
>>>>> batch to me but just move it earlier (before unlock iiuc then it'll be
>>>>> safe), then we can keep using ptep_get_and_clear() afaiu but keep "pte"
>>>>> updated.
>>>> 
>>>> I think we would also need to check should_defer_flush(). Looking at
>>>> try_to_unmap_one() there is this comment:
>>>> 
>>>> 			if (should_defer_flush(mm, flags) && !anon_exclusive) {
>>>> 				/*
>>>> 				 * We clear the PTE but do not flush so potentially
>>>> 				 * a remote CPU could still be writing to the folio.
>>>> 				 * If the entry was previously clean then the
>>>> 				 * architecture must guarantee that a clear->dirty
>>>> 				 * transition on a cached TLB entry is written through
>>>> 				 * and traps if the PTE is unmapped.
>>>> 				 */
>>>> 
>>>> And as I understand it we'd need the same guarantee here. Given
>>>> try_to_migrate_one() doesn't do batched TLB flushes either I'd rather
>>>> keep the code as consistent as possible between
>>>> migrate_vma_collect_pmd() and try_to_migrate_one(). I could look at
>>>> introducing TLB flushing for both in some future patch series.
>>> 
>>> should_defer_flush() is TTU-specific code?
>> 
>> I'm not sure, but I think we need the same guarantee here as mentioned
>> in the comment otherwise we wouldn't see a subsequent CPU write that
>> could dirty the PTE after we have cleared it but before the TLB flush.
>> 
>> My assumption was should_defer_flush() would ensure we have that
>> guarantee from the architecture, but maybe there are alternate/better
>> ways of enforcing that?
>>> IIUC the caller sets TTU_BATCH_FLUSH showing that tlb can be omitted since
>>> the caller will be responsible for doing it.  In migrate_vma_collect_pmd()
>>> iiuc we don't need that hint because it'll be flushed within the same
>>> function but just only after the loop of modifying the ptes.  Also it'll be
>>> with the pgtable lock held.
>> 
>> Right, but the pgtable lock doesn't protect against HW PTE changes such
>> as setting the dirty bit so we need to ensure the HW does the right
>> thing here and I don't know if all HW does.
> 
> This sounds sensible.  But I take a look at zap_pte_range(), and find
> that it appears that the implementation requires the PTE dirty bit to be
> write-through.  Do I miss something?
> 
> Hi, Nadav, Can you help?

Sorry for joining the discussion late. I read most of this thread and I hope
I understand what you ask me. So at the risk of rehashing or repeating what
you already know - here are my 2 cents. Feel free to ask me again if I did
not understand your questions:

1. ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH is currently x86 specific. There is a
recent patch that wants to use it for arm64 as well [1]. The assumption that
Alistair cited from the code (regarding should_defer_flush()) might not be
applicable to certain architectures (although most likely it is). I tried
to encapsulate the logic on whether an unflushed RO entry can become dirty
in an arch specific function [2].

2. Having said all of that, using the logic of “flush if there are pending
TLB flushes for this mm” as done by UNMAP_TLB_FLUSH makes sense IMHO
(although I would have considered doing it in finer granularity of
VMA/page-table as I proposed before and got somewhat lukewarm response [3]).

3. There is no question that flushing after dropping the ptl is wrong. But
reading the thread, I think that you only focus on whether a dirty
indication might get lost. The problem, I think, is bigger, as it might also
cause correction problems after concurrently removing mappings. What happens
if we get for a clean PTE something like:

  CPU0					CPU1
  ----					----

  migrate_vma_collect_pmd()
  [ defer flush, release ptl ]
					madvise(MADV_DONTNEED)
					-> zap_pte_range()
					[ PTE not present; mmu_gather
					  not updated ]
					
					[ no flush; stale PTE in TLB ]

					[ page is still accessible ]

[ might apply to munmap(); I usually regard MADV_DONTNEED since it does
  not call mmap_write_lock() ]

4. Having multiple TLB flushing infrastructures makes all of these
discussions very complicated and unmaintainable. I need to convince myself
in every occasion (including this one) whether calls to
flush_tlb_batched_pending() and tlb_flush_pending() are needed or not.

What I would like to have [3] is a single infrastructure that gets a
“ticket” (generation when the batching started), the old PTE and the new PTE
and checks whether a TLB flush is needed based on the arch behavior and the
current TLB generation. If needed, it would update the “ticket” to the new
generation. Andy wanted a ring for pending TLB flushes, but I think it is an
overkill with more overhead and complexity than needed.

But the current situation in which every TLB flush is a basis for long
discussions and prone to bugs is impossible.

I hope it helps. Let me know if you want me to revive the patch-set or other
feedback.

[1] https://lore.kernel.org/all/20220711034615.482895-5-21cnbao@gmail.com/
[2] https://lore.kernel.org/all/20220718120212.3180-13-namit@vmware.com/
[3] https://lore.kernel.org/all/20210131001132.3368247-16-namit@vmware.com/
Peter Xu Aug. 17, 2022, 7:07 p.m. UTC | #9
On Wed, Aug 17, 2022 at 03:41:16PM +1000, Alistair Popple wrote:
> My primary concern with batching is ensuring a CPU write after clearing
> a clean PTE but before flushing the TLB does the "right thing" (ie. faults
> if the PTE is not present).

Fair enough.  I have exactly that same concern.  But I think Nadav replied
very recently on this in the previous thread; quoting from him [1]:

  I keep not remembering this erratum correctly. IIRC, the erratum says
  that the access/dirty might be set, but it does not mean that a write is
  possible after the PTE is cleared (i.e., the dirty/access might be set on
  the non-present PTE, but the access itself would fail). So it is not an
  issue in this case - losing A/D would not impact correctness since the
  access should fail.

I don't really know whether he means this, but I really think the hardware
should behave like that or otherwise I can't see how it can go right.

Let's assume that after the pte is cleared the page can still be written;
then afaict ptep_clear_flush() is not safe either, because fundamentally it
is two operations happening in sequence: (1) ptep_get_and_clear(), and (2)
conditionally flush_tlb_page() when needed.

If the page can be written with a TLB entry cached but without the pte
present, what if some process writes to the memory between steps (1) and
(2)?  AFAIU that's the same question as using a raw ptep_get_and_clear()
and a batched tlb flush.

IOW, I don't see how a tlb batched solution can be worse than using per-pte
ptep_clear_flush().  It may enlarge the race window but fundamentally
(iiuc) they're the same thing here as long as there's no atomic way to both
"clear pte and flush tlb".

[1] https://lore.kernel.org/lkml/E37036E0-566E-40C7-AD15-720CDB003227@gmail.com/
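
The "two operations in sequence" point is visible in the generic
implementation (roughly, from mm/pgtable-generic.c):

    pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
                           pte_t *ptep)
    {
            struct mm_struct *mm = (vma)->vm_mm;
            pte_t pte;

            pte = ptep_get_and_clear(mm, address, ptep);    /* step (1) */
            if (pte_accessible(mm, pte))
                    flush_tlb_page(vma, address);           /* step (2) */
            return pte;
    }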
Peter Xu Aug. 17, 2022, 7:27 p.m. UTC | #10
On Wed, Aug 17, 2022 at 02:41:19AM -0700, Nadav Amit wrote:
> 4. Having multiple TLB flushing infrastructures makes all of these
> discussions very complicated and unmaintainable. I need to convince myself
> in every occasion (including this one) whether calls to
> flush_tlb_batched_pending() and tlb_flush_pending() are needed or not.
> 
> What I would like to have [3] is a single infrastructure that gets a
> “ticket” (generation when the batching started), the old PTE and the new PTE
> and checks whether a TLB flush is needed based on the arch behavior and the
> current TLB generation. If needed, it would update the “ticket” to the new
> generation. Andy wanted a ring for pending TLB flushes, but I think it is an
> overkill with more overhead and complexity than needed.
> 
> But the current situation in which every TLB flush is a basis for long
> discussions and prone to bugs is impossible.
> 
> I hope it helps. Let me know if you want me to revive the patch-set or other
> feedback.
> 
> [1] https://lore.kernel.org/all/20220711034615.482895-5-21cnbao@gmail.com/
> [2] https://lore.kernel.org/all/20220718120212.3180-13-namit@vmware.com/
> [3] https://lore.kernel.org/all/20210131001132.3368247-16-namit@vmware.com/

I need more reading on tlb code and also [3] which looks useful to me.
It's definitely sad to make tlb flushing so complicated.  It'll be great if
things can be sorted out someday.

In this specific case, the only way to do safe tlb batching in my mind is:

	pte_offset_map_lock();
	arch_enter_lazy_mmu_mode();
        // If any pending tlb, do it now
        if (mm_tlb_flush_pending())
		flush_tlb_range(vma, start, end);
        else
                flush_tlb_batched_pending();
        loop {
                ...
                pte = ptep_get_and_clear();
                ...
                if (pte_present())
                        unmapped++;
                ...
        }
	if (unmapped)
		flush_tlb_range(walk->vma, start, end);
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock();

I may miss something, but even if not it already doesn't look pretty.

Thanks,
Huang, Ying Aug. 18, 2022, 5:59 a.m. UTC | #11
Nadav Amit <nadav.amit@gmail.com> writes:

> On Aug 17, 2022, at 12:17 AM, Huang, Ying <ying.huang@intel.com> wrote:
>
>> Alistair Popple <apopple@nvidia.com> writes:
>> 
>>> Peter Xu <peterx@redhat.com> writes:
>>> 
>>>> On Wed, Aug 17, 2022 at 11:49:03AM +1000, Alistair Popple wrote:
>>>>> Peter Xu <peterx@redhat.com> writes:
>>>>> 
>>>>>> On Tue, Aug 16, 2022 at 04:10:29PM +0800, huang ying wrote:
>>>>>>>> @@ -193,11 +194,10 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
>>>>>>>>                        bool anon_exclusive;
>>>>>>>>                        pte_t swp_pte;
>>>>>>>> 
>>>>>>>> +                       flush_cache_page(vma, addr, pte_pfn(*ptep));
>>>>>>>> +                       pte = ptep_clear_flush(vma, addr, ptep);
>>>>>>> 
>>>>>>> Although I think it's possible to batch the TLB flushing just before
>>>>>>> unlocking PTL.  The current code looks correct.
>>>>>> 
>>>>>> If we're with unconditionally ptep_clear_flush(), does it mean we should
>>>>>> probably drop the "unmapped" and the last flush_tlb_range() already since
>>>>>> they'll be redundant?
>>>>> 
>>>>> This patch does that, unless I missed something?
>>>> 
>>>> Yes it does.  Somehow I didn't read into the real v2 patch, sorry!
>>>> 
>>>>>> If that'll need to be dropped, it looks indeed better to still keep the
>>>>>> batch to me but just move it earlier (before unlock iiuc then it'll be
>>>>>> safe), then we can keep using ptep_get_and_clear() afaiu but keep "pte"
>>>>>> updated.
>>>>> 
>>>>> I think we would also need to check should_defer_flush(). Looking at
>>>>> try_to_unmap_one() there is this comment:
>>>>> 
>>>>> 			if (should_defer_flush(mm, flags) && !anon_exclusive) {
>>>>> 				/*
>>>>> 				 * We clear the PTE but do not flush so potentially
>>>>> 				 * a remote CPU could still be writing to the folio.
>>>>> 				 * If the entry was previously clean then the
>>>>> 				 * architecture must guarantee that a clear->dirty
>>>>> 				 * transition on a cached TLB entry is written through
>>>>> 				 * and traps if the PTE is unmapped.
>>>>> 				 */
>>>>> 
>>>>> And as I understand it we'd need the same guarantee here. Given
>>>>> try_to_migrate_one() doesn't do batched TLB flushes either I'd rather
>>>>> keep the code as consistent as possible between
>>>>> migrate_vma_collect_pmd() and try_to_migrate_one(). I could look at
>>>>> introducing TLB flushing for both in some future patch series.
>>>> 
>>>> should_defer_flush() is TTU-specific code?
>>> 
>>> I'm not sure, but I think we need the same guarantee here as mentioned
>>> in the comment otherwise we wouldn't see a subsequent CPU write that
>>> could dirty the PTE after we have cleared it but before the TLB flush.
>>> 
>>> My assumption was should_defer_flush() would ensure we have that
>>> guarantee from the architecture, but maybe there are alternate/better
>>> ways of enforcing that?
>>>> IIUC the caller sets TTU_BATCH_FLUSH showing that tlb can be omitted since
>>>> the caller will be responsible for doing it.  In migrate_vma_collect_pmd()
>>>> iiuc we don't need that hint because it'll be flushed within the same
>>>> function but just only after the loop of modifying the ptes.  Also it'll be
>>>> with the pgtable lock held.
>>> 
>>> Right, but the pgtable lock doesn't protect against HW PTE changes such
>>> as setting the dirty bit so we need to ensure the HW does the right
>>> thing here and I don't know if all HW does.
>> 
>> This sounds sensible.  But I take a look at zap_pte_range(), and find
>> that it appears that the implementation requires the PTE dirty bit to be
>> write-through.  Do I miss something?
>> 
>> Hi, Nadav, Can you help?
>
> Sorry for joining the discussion late. I read most ofthis thread and I hope
> I understand what you ask me. So at the risk of rehashing or repeating what
> you already know - here are my 2 cents. Feel free to ask me again if I did
> not understand your questions:
>
> 1. ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH is currently x86 specific. There is a
> recent patch that want to use it for arm64 as well [1]. The assumption that
> Alistair cited from the code (regarding should_defer_flush()) might not be
> applicable to certain architectures (although most likely it is). I tried
> to encapsulate the logic on whether an unflushed RO entry can become dirty
> in an arch specific function [2].
>
> 2. Having said all of that, using the logic of “flush if there are pending
> TLB flushes for this mm” as done by UNMAP_TLB_FLUSH makes sense IMHO
> (although I would have considered doing it in finer granularity of
> VMA/page-table as I proposed before and got somewhat lukewarm response [3]).
>
> 3. There is no question that flushing after dropping the ptl is wrong. But
> reading the thread, I think that you only focus on whether a dirty
> indication might get lost. The problem, I think, is bigger, as it might also
> cause correction problems after concurrently removing mappings. What happens
> if we get for a clean PTE something like:
>
>   CPU0					CPU1
>   ----					----
>
>   migrate_vma_collect_pmd()
>   [ defer flush, release ptl ]
> 					madvise(MADV_DONTNEED)
> 					-> zap_pte_range()
> 					[ PTE not present; mmu_gather
> 					  not updated ]
> 					
> 					[ no flush; stale PTE in TLB ]
>
> 					[ page is still accessible ]
>
> [ might apply to munmap(); I usually regard MADV_DONTNEED since it does
>   not call mmap_write_lock() ]

Yes.  You are right.  Flushing after PTL unlocking can cause more
serious problems.

I also want to check whether the dirty bit can be lost in
zap_pte_range(), where the TLB flush will be delayed if the PTE is
clean.  Per my understanding, the PTE dirty bit must be write-through to
make this work correctly.  And I cannot see what the CPU could do other
than raise a page fault if

- PTE is non-present
- TLB entry is clean
- CPU writes the page

Then, can we assume the PTE dirty bit is always write-through on any
architecture?

> 4. Having multiple TLB flushing infrastructures makes all of these
> discussions very complicated and unmaintainable. I need to convince myself
> in every occasion (including this one) whether calls to
> flush_tlb_batched_pending() and tlb_flush_pending() are needed or not.
>
> What I would like to have [3] is a single infrastructure that gets a
> “ticket” (generation when the batching started), the old PTE and the new PTE
> and checks whether a TLB flush is needed based on the arch behavior and the
> current TLB generation. If needed, it would update the “ticket” to the new
> generation. Andy wanted a ring for pending TLB flushes, but I think it is an
> overkill with more overhead and complexity than needed.
>
> But the current situation in which every TLB flush is a basis for long
> discussions and prone to bugs is impossible.
>
> I hope it helps. Let me know if you want me to revive the patch-set or other
> feedback.
>
> [1] https://lore.kernel.org/all/20220711034615.482895-5-21cnbao@gmail.com/
> [2] https://lore.kernel.org/all/20220718120212.3180-13-namit@vmware.com/
> [3] https://lore.kernel.org/all/20210131001132.3368247-16-namit@vmware.com/

Haven't looked at this in depth yet.  Will do that.

Best Regards,
Huang, Ying
Huang, Ying Aug. 18, 2022, 6:34 a.m. UTC | #12
Peter Xu <peterx@redhat.com> writes:

> On Wed, Aug 17, 2022 at 02:41:19AM -0700, Nadav Amit wrote:
>> 4. Having multiple TLB flushing infrastructures makes all of these
>> discussions very complicated and unmaintainable. I need to convince myself
>> in every occasion (including this one) whether calls to
>> flush_tlb_batched_pending() and tlb_flush_pending() are needed or not.
>> 
>> What I would like to have [3] is a single infrastructure that gets a
>> “ticket” (generation when the batching started), the old PTE and the new PTE
>> and checks whether a TLB flush is needed based on the arch behavior and the
>> current TLB generation. If needed, it would update the “ticket” to the new
>> generation. Andy wanted a ring for pending TLB flushes, but I think it is an
>> overkill with more overhead and complexity than needed.
>> 
>> But the current situation in which every TLB flush is a basis for long
>> discussions and prone to bugs is impossible.
>> 
>> I hope it helps. Let me know if you want me to revive the patch-set or other
>> feedback.
>> 
>> [1] https://lore.kernel.org/all/20220711034615.482895-5-21cnbao@gmail.com/
>> [2] https://lore.kernel.org/all/20220718120212.3180-13-namit@vmware.com/
>> [3] https://lore.kernel.org/all/20210131001132.3368247-16-namit@vmware.com/
>
> I need more reading on tlb code and also [3] which looks useful to me.
> It's definitely sad to make tlb flushing so complicated.  It'll be great if
> things can be sorted out someday.
>
> In this specific case, the only way to do safe tlb batching in my mind is:
>
> 	pte_offset_map_lock();
> 	arch_enter_lazy_mmu_mode();
>         // If any pending tlb, do it now
>         if (mm_tlb_flush_pending())
> 		flush_tlb_range(vma, start, end);
>         else
>                 flush_tlb_batched_pending();

I don't think we need the above 4 lines.  Because we will flush TLB
before we access the pages.  Can you find any issue if we don't use the
above 4 lines?

Best Regards,
Huang, Ying

>         loop {
>                 ...
>                 pte = ptep_get_and_clear();
>                 ...
>                 if (pte_present())
>                         unmapped++;
>                 ...
>         }
> 	if (unmapped)
> 		flush_tlb_range(walk->vma, start, end);
> 	arch_leave_lazy_mmu_mode();
> 	pte_unmap_unlock();
>
> I may miss something, but even if not it already doesn't look pretty.
>
> Thanks,
Peter Xu Aug. 18, 2022, 2:44 p.m. UTC | #13
On Thu, Aug 18, 2022 at 02:34:45PM +0800, Huang, Ying wrote:
> > In this specific case, the only way to do safe tlb batching in my mind is:
> >
> > 	pte_offset_map_lock();
> > 	arch_enter_lazy_mmu_mode();
> >         // If any pending tlb, do it now
> >         if (mm_tlb_flush_pending())
> > 		flush_tlb_range(vma, start, end);
> >         else
> >                 flush_tlb_batched_pending();
> 
> I don't think we need the above 4 lines.  Because we will flush TLB
> before we access the pages.

Could you elaborate?

> Can you find any issue if we don't use the above 4 lines?

It seems okay to me to leave stale tlb entries at least within the scope of
this function. It only collects present ptes and flushes properly for them.
I don't quickly see any other implications for other untouched ptes - unlike
e.g. mprotect(), there's a strong barrier of not allowing further writes
after mprotect() returns.

Still I don't know whether there'll be any side effect of having stale tlbs
for !present ptes because I'm not familiar enough with the private dev swap
migration code.  But I think having them will be safe, even if redundant.

Thanks,
Huang, Ying Aug. 19, 2022, 2:51 a.m. UTC | #14
Peter Xu <peterx@redhat.com> writes:

> On Thu, Aug 18, 2022 at 02:34:45PM +0800, Huang, Ying wrote:
>> > In this specific case, the only way to do safe tlb batching in my mind is:
>> >
>> > 	pte_offset_map_lock();
>> > 	arch_enter_lazy_mmu_mode();
>> >         // If any pending tlb, do it now
>> >         if (mm_tlb_flush_pending())
>> > 		flush_tlb_range(vma, start, end);
>> >         else
>> >                 flush_tlb_batched_pending();
>> 
>> I don't think we need the above 4 lines.  Because we will flush TLB
>> before we access the pages.
>
> Could you elaborate?

As you have said below, we don't use non-present PTEs and flush present
PTEs before we access the pages.

>> Can you find any issue if we don't use the above 4 lines?
>
> It seems okay to me to leave stall tlb at least within the scope of this
> function. It only collects present ptes and flush propoerly for them.  I
> don't quickly see any other implications to other not touched ptes - unlike
> e.g. mprotect(), there's a strong barrier of not allowing further write
> after mprotect() returns.

Yes.  I think so too.

> Still I don't know whether there'll be any side effect of having stall tlbs
> in !present ptes because I'm not familiar enough with the private dev swap
> migration code.  But I think having them will be safe, even if redundant.

I don't think it's a good idea to be redundant.  That may hide the real
issue.

Best Regards,
Huang, Ying
Alistair Popple Aug. 24, 2022, 1:56 a.m. UTC | #15
"Huang, Ying" <ying.huang@intel.com> writes:

> Peter Xu <peterx@redhat.com> writes:
>
>> On Thu, Aug 18, 2022 at 02:34:45PM +0800, Huang, Ying wrote:
>>> > In this specific case, the only way to do safe tlb batching in my mind is:
>>> >
>>> > 	pte_offset_map_lock();
>>> > 	arch_enter_lazy_mmu_mode();
>>> >         // If any pending tlb, do it now
>>> >         if (mm_tlb_flush_pending())
>>> > 		flush_tlb_range(vma, start, end);
>>> >         else
>>> >                 flush_tlb_batched_pending();
>>>
>>> I don't think we need the above 4 lines.  Because we will flush TLB
>>> before we access the pages.

I agree. For migration the TLB flush is only important if the PTE is
present, and in that case we do a TLB flush anyway.

>> Could you elaborate?
>
> As you have said below, we don't use non-present PTEs and flush present
> PTEs before we access the pages.
>
>>> Can you find any issue if we don't use the above 4 lines?
>>
>> It seems okay to me to leave stall tlb at least within the scope of this
>> function. It only collects present ptes and flush propoerly for them.  I
>> don't quickly see any other implications to other not touched ptes - unlike
>> e.g. mprotect(), there's a strong barrier of not allowing further write
>> after mprotect() returns.
>
> Yes.  I think so too.
>
>> Still I don't know whether there'll be any side effect of having stall tlbs
>> in !present ptes because I'm not familiar enough with the private dev swap
>> migration code.  But I think having them will be safe, even if redundant.

What side-effect were you thinking of? I don't see any issue with not
TLB flushing stale device-private TLBs prior to the migration because
they're not accessible anyway and shouldn't be in any TLB.

> I don't think it's a good idea to be redundant.  That may hide the real
> issue.
>
> Best Regards,
> Huang, Ying

Thanks all for the discussion. Having done some more reading I agree
that it's safe to assume HW dirty bits are write-through, so will remove
the ptep_clear_flush() and use ptep_get_and_clear() instead. Will split
out the TLB flushing fix into a separate patch in this series.

 - Alistair
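
As a sketch (not the posted code, just the shape of the follow-up described
above), that direction would look something like:

    /* under the PTL, per pte; anon-exclusive pages still need an
     * immediate ptep_clear_flush() as in the original code */
    pte = ptep_get_and_clear(mm, addr, ptep);
    if (pte_dirty(pte))
            folio_mark_dirty(page_folio(page));
    if (pte_present(pte))
            unmapped++;
    ...
    /* and, in the separate TLB patch, flush before dropping the PTL */
    if (unmapped)
            flush_tlb_range(walk->vma, start, end);
    arch_leave_lazy_mmu_mode();
    pte_unmap_unlock(ptep - 1, ptl);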
Peter Xu Aug. 24, 2022, 8:25 p.m. UTC | #16
On Wed, Aug 24, 2022 at 11:56:25AM +1000, Alistair Popple wrote:
> >> Still I don't know whether there'll be any side effect of having stall tlbs
> >> in !present ptes because I'm not familiar enough with the private dev swap
> >> migration code.  But I think having them will be safe, even if redundant.
> 
> What side-effect were you thinking of? I don't see any issue with not
> TLB flushing stale device-private TLBs prior to the migration because
> they're not accessible anyway and shouldn't be in any TLB.

Sorry to be misleading, I never meant we must add them.  As I said it's
just that I don't know the code well so I don't know whether it's safe to
not have it.

IIUC it's about whether having a stale system-ram tlb entry in another
processor would matter or not here.  E.g. some none pte that this code
collected (boosted both "cpages" and "npages" for a none pte) could have
a stale tlb entry in other cores that makes the page writable there.

When I said I'm not familiar with the code, it's majorly about one thing I
never figured out myself, in that migrate_vma_collect_pmd() has this
optimization to trylock on the page, collect if it succeeded:

  /*
   * Optimize for the common case where page is only mapped once
   * in one process. If we can lock the page, then we can safely
   * set up a special migration page table entry now.
   */
   if (trylock_page(page)) {
          ...
   } else {
          put_page(page);
          mpfn = 0;
   }

But it's kind of against a pure "optimization" in that if trylock failed,
we'll clear the mpfn so the src[i] will be zero at last.  Then will we
directly give up on this page, or will we try to lock_page() again
somewhere?

The future unmap op is also based on this "cpages", not "npages":

	if (args->cpages)
		migrate_vma_unmap(args);

So I never figured out how this code really works.  It'll be great if you
could shed some light on it.

Thanks,
Peter Xu Aug. 24, 2022, 8:48 p.m. UTC | #17
On Wed, Aug 24, 2022 at 04:25:44PM -0400, Peter Xu wrote:
> On Wed, Aug 24, 2022 at 11:56:25AM +1000, Alistair Popple wrote:
> > >> Still I don't know whether there'll be any side effect of having stall tlbs
> > >> in !present ptes because I'm not familiar enough with the private dev swap
> > >> migration code.  But I think having them will be safe, even if redundant.
> > 
> > What side-effect were you thinking of? I don't see any issue with not
> > TLB flushing stale device-private TLBs prior to the migration because
> > they're not accessible anyway and shouldn't be in any TLB.
> 
> Sorry to be misleading, I never meant we must add them.  As I said it's
> just that I don't know the code well so I don't know whether it's safe to
> not have it.
> 
> IIUC it's about whether having stall system-ram stall tlb in other
> processor would matter or not here.  E.g. some none pte that this code
> collected (boosted both "cpages" and "npages" for a none pte) could have
> stall tlb in other cores that makes the page writable there.

For this one, let me give a more detailed example.

It's about whether below could happen:

       thread 1                thread 2                 thread 3
       --------                --------                 --------
                          write to page P (data=P1)
                            (cached TLB writable)
  zap_pte_range()
    pgtable lock
    clear pte for page P
    pgtable unlock
    ...
                                                     migrate_vma_collect
                                                       pte none, npages++, cpages++
                                                       allocate device page
                                                       copy data (with P1)
                                                       map pte as device swap 
                          write to page P again
                          (data updated from P1->P2)
  flush tlb

Then at last from processor side P should have data P2 but actually from
device memory it's P1. Data corrupt.

> 
> When I said I'm not familiar with the code, it's majorly about one thing I
> never figured out myself, in that migrate_vma_collect_pmd() has this
> optimization to trylock on the page, collect if it succeeded:
> 
>   /*
>    * Optimize for the common case where page is only mapped once
>    * in one process. If we can lock the page, then we can safely
>    * set up a special migration page table entry now.
>    */
>    if (trylock_page(page)) {
>           ...
>    } else {
>           put_page(page);
>           mpfn = 0;
>    }
> 
> But it's kind of against a pure "optimization" in that if trylock failed,
> we'll clear the mpfn so the src[i] will be zero at last.  Then will we
> directly give up on this page, or will we try to lock_page() again
> somewhere?
> 
> The future unmap op is also based on this "cpages", not "npages":
> 
> 	if (args->cpages)
> 		migrate_vma_unmap(args);
> 
> So I never figured out how this code really works.  It'll be great if you
> could shed some light to it.
> 
> Thanks,
> 
> -- 
> Peter Xu
Alistair Popple Aug. 25, 2022, 12:42 a.m. UTC | #18
Peter Xu <peterx@redhat.com> writes:

> On Wed, Aug 24, 2022 at 04:25:44PM -0400, Peter Xu wrote:
>> On Wed, Aug 24, 2022 at 11:56:25AM +1000, Alistair Popple wrote:
>> > >> Still I don't know whether there'll be any side effect of having stall tlbs
>> > >> in !present ptes because I'm not familiar enough with the private dev swap
>> > >> migration code.  But I think having them will be safe, even if redundant.
>> >
>> > What side-effect were you thinking of? I don't see any issue with not
>> > TLB flushing stale device-private TLBs prior to the migration because
>> > they're not accessible anyway and shouldn't be in any TLB.
>>
>> Sorry to be misleading, I never meant we must add them.  As I said it's
>> just that I don't know the code well so I don't know whether it's safe to
>> not have it.
>>
>> IIUC it's about whether having stall system-ram stall tlb in other
>> processor would matter or not here.  E.g. some none pte that this code
>> collected (boosted both "cpages" and "npages" for a none pte) could have
>> stall tlb in other cores that makes the page writable there.
>
> For this one, let me give a more detailed example.

Thanks, I would have been completely lost about what you were talking
about without this :-)

> It's about whether below could happen:
>
>        thread 1                thread 2                 thread 3
>        --------                --------                 --------
>                           write to page P (data=P1)
>                             (cached TLB writable)
>   zap_pte_range()
>     pgtable lock
>     clear pte for page P
>     pgtable unlock
>     ...
>                                                      migrate_vma_collect
>                                                        pte none, npages++, cpages++
>                                                        allocate device page
>                                                        copy data (with P1)
>                                                        map pte as device swap
>                           write to page P again
>                           (data updated from P1->P2)
>   flush tlb
>
> Then at the end, from the processor's point of view, P should hold the
> data P2, but the device memory copy still has P1.  Data corruption.

In the above scenario migrate_vma_collect_pmd() will observe pte_none.
This will mark the src_pfn[] array as needing a new zero page which will
be installed by migrate_vma_pages()->migrate_vma_insert_page().

So there is no data to be copied hence there can't be any data
corruption. Remember these are private anonymous pages, so any
zap_pte_range() indicates the data is no longer needed (eg.
MADV_DONTNEED).
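
For reference, the pte_none handling in migrate_vma_collect_pmd() looks
roughly like this (paraphrased, not a verbatim quote of the current code):

	if (pte_none(pte)) {
		if (vma_is_anonymous(vma)) {
			/* empty anon pte: ask for a fresh destination page */
			mpfn = MIGRATE_PFN_MIGRATE;
			migrate->cpages++;
		}
		goto next;
	}

A src entry with MIGRATE_PFN_MIGRATE set but no backing pfn is what later
makes migrate_vma_pages() call migrate_vma_insert_page() for that slot.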

>>
>> When I said I'm not familiar with the code, it's majorly about one thing I
>> never figured out myself, in that migrate_vma_collect_pmd() has this
>> optimization to trylock on the page, collect if it succeeded:
>>
>>   /*
>>    * Optimize for the common case where page is only mapped once
>>    * in one process. If we can lock the page, then we can safely
>>    * set up a special migration page table entry now.
>>    */
>>    if (trylock_page(page)) {
>>           ...
>>    } else {
>>           put_page(page);
>>           mpfn = 0;
>>    }
>>
>> But it's kind of against a pure "optimization" in that if trylock failed,
>> we'll clear the mpfn so the src[i] will be zero at last.  Then will we
>> directly give up on this page, or will we try to lock_page() again
>> somewhere?

That comment is outdated. We used to try locking the page again but
that was removed by ab09243aa95a ("mm/migrate.c: remove
MIGRATE_PFN_LOCKED"). See
https://lkml.kernel.org/r/20211025041608.289017-1-apopple@nvidia.com

Will post a clean-up for it.

>> The future unmap op is also based on this "cpages", not "npages":
>>
>> 	if (args->cpages)
>> 		migrate_vma_unmap(args);
>>
>> So I never figured out how this code really works.  It'll be great if you
>> could shed some light to it.
>>
>> Thanks,
>>
>> --
>> Peter Xu
Alistair Popple Aug. 25, 2022, 1:24 a.m. UTC | #19
Alistair Popple <apopple@nvidia.com> writes:

> Peter Xu <peterx@redhat.com> writes:

>>> But it's kind of against a pure "optimization" in that if trylock failed,
>>> we'll clear the mpfn so the src[i] will be zero at last.  Then will we
>>> directly give up on this page, or will we try to lock_page() again
>>> somewhere?
>
> That comment is outdated. We used to try locking the page again but
> that was removed by ab09243aa95a ("mm/migrate.c: remove
> MIGRATE_PFN_LOCKED"). See
> https://lkml.kernel.org/r/20211025041608.289017-1-apopple@nvidia.com
>
> Will post a clean-up for it.

By the way it's still an optimisation because in most cases we can avoid
calling try_to_migrate() and walking the rmap altogether if we install
the migration entries here. But I agree the comment is misleading.
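
To make the optimisation concrete, the trylock-success path boils down to
something like the following (a trimmed sketch based on the hunk in the
patch below, not verbatim -- the anon-exclusive and soft-dirty/uffd-wp
handling is elided).  Because the migration entry is installed right here,
under the PTE lock, a page mapped only once is already fully unmapped by
the time migrate_vma_unmap() runs, so try_to_migrate() and its rmap walk
are not needed for it:

	if (trylock_page(page)) {
		...
		/* replace the present pte with a migration entry in place */
		if (mpfn & MIGRATE_PFN_WRITE)
			entry = make_writable_migration_entry(page_to_pfn(page));
		else
			entry = make_readable_migration_entry(page_to_pfn(page));
		swp_pte = swp_entry_to_pte(entry);
		set_pte_at(mm, addr, ptep, swp_pte);
		page_remove_rmap(page, vma, false);
		put_page(page);
	} else {
		put_page(page);
		mpfn = 0;
	}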

>>> The future unmap op is also based on this "cpages", not "npages":
>>>
>>> 	if (args->cpages)
>>> 		migrate_vma_unmap(args);
>>>
>>> So I never figured out how this code really works.  It'll be great if you
>>> could shed some light to it.
>>>
>>> Thanks,
>>>
>>> --
>>> Peter Xu
Peter Xu Aug. 25, 2022, 2:40 p.m. UTC | #20
On Thu, Aug 25, 2022 at 10:42:41AM +1000, Alistair Popple wrote:
> 
> Peter Xu <peterx@redhat.com> writes:
> 
> > On Wed, Aug 24, 2022 at 04:25:44PM -0400, Peter Xu wrote:
> >> On Wed, Aug 24, 2022 at 11:56:25AM +1000, Alistair Popple wrote:
> >> > >> Still I don't know whether there'll be any side effect of having stall tlbs
> >> > >> in !present ptes because I'm not familiar enough with the private dev swap
> >> > >> migration code.  But I think having them will be safe, even if redundant.
> >> >
> >> > What side-effect were you thinking of? I don't see any issue with not
> >> > TLB flushing stale device-private TLBs prior to the migration because
> >> > they're not accessible anyway and shouldn't be in any TLB.
> >>
> >> Sorry to be misleading, I never meant we must add them.  As I said it's
> >> just that I don't know the code well so I don't know whether it's safe to
> >> not have it.
> >>
> >> IIUC it's about whether having stall system-ram stall tlb in other
> >> processor would matter or not here.  E.g. some none pte that this code
> >> collected (boosted both "cpages" and "npages" for a none pte) could have
> >> stall tlb in other cores that makes the page writable there.
> >
> > For this one, let me give a more detailed example.
> 
> Thanks, I would have been completely lost about what you were talking
> about without this :-)
> 
> > It's about whether below could happen:
> >
> >        thread 1                thread 2                 thread 3
> >        --------                --------                 --------
> >                           write to page P (data=P1)
> >                             (cached TLB writable)
> >   zap_pte_range()
> >     pgtable lock
> >     clear pte for page P
> >     pgtable unlock
> >     ...
> >                                                      migrate_vma_collect
> >                                                        pte none, npages++, cpages++
> >                                                        allocate device page
> >                                                        copy data (with P1)
> >                                                        map pte as device swap
> >                           write to page P again
> >                           (data updated from P1->P2)
> >   flush tlb
> >
> > Then at the end, from the processor's point of view, P should hold the
> > data P2, but the device memory copy still has P1.  Data corruption.
> 
> In the above scenario migrate_vma_collect_pmd() will observe pte_none.
> This will mark the src_pfn[] array as needing a new zero page which will
> be installed by migrate_vma_pages()->migrate_vma_insert_page().
> 
> So there is no data to be copied hence there can't be any data
> corruption. Remember these are private anonymous pages, so any
> zap_pte_range() indicates the data is no longer needed (eg.
> MADV_DONTNEED).

My bad for having provided an invalid example. :)

So if the trylock in this function is the only way to migrate this page,
then I agree the stale TLB is fine.

> 
> >>
> >> When I said I'm not familiar with the code, it's majorly about one thing I
> >> never figured out myself, in that migrate_vma_collect_pmd() has this
> >> optimization to trylock on the page, collect if it succeeded:
> >>
> >>   /*
> >>    * Optimize for the common case where page is only mapped once
> >>    * in one process. If we can lock the page, then we can safely
> >>    * set up a special migration page table entry now.
> >>    */
> >>    if (trylock_page(page)) {
> >>           ...
> >>    } else {
> >>           put_page(page);
> >>           mpfn = 0;
> >>    }
> >>
> >> But it's kind of against a pure "optimization" in that if trylock failed,
> >> we'll clear the mpfn so the src[i] will be zero at last.  Then will we
> >> directly give up on this page, or will we try to lock_page() again
> >> somewhere?
> 
> That comment is outdated. We used to try locking the page again but
> that was removed by ab09243aa95a ("mm/migrate.c: remove
> MIGRATE_PFN_LOCKED"). See
> https://lkml.kernel.org/r/20211025041608.289017-1-apopple@nvidia.com
> 
> Will post a clean-up for it.

That'll help, thanks.
Peter Xu Aug. 25, 2022, 3:04 p.m. UTC | #21
On Thu, Aug 25, 2022 at 11:24:03AM +1000, Alistair Popple wrote:
> By the way it's still an optimisation because in most cases we can avoid
> calling try_to_migrate() and walking the rmap altogether if we install
> the migration entries here. But I agree the comment is misleading.

There's one follow-up question on the trylock thing that I forgot to ask.  I
figured I should ask it out loud while we're at it.

Since migrate_vma_setup() only ever uses trylock (even before the prepare()
code was dropped), does that mean it can randomly fail?

I looked at some of the callers, and it seems not all of them are ready to
handle that (__kvmppc_svm_page_out() or svm_migrate_vma_to_vram()).  Is it
safe?  Do the callers always need to handle that properly (unless the
migration is only best-effort, but that doesn't seem to always be the case)?

Besides, while reading the old prepare() code I saw this comment:

-		if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) {
-			/*
-			 * Because we are migrating several pages there can be
-			 * a deadlock between 2 concurrent migration where each
-			 * are waiting on each other page lock.
-			 *
-			 * Make migrate_vma() a best effort thing and backoff
-			 * for any page we can not lock right away.
-			 */
-			if (!trylock_page(page)) {
-				migrate->src[i] = 0;
-				migrate->cpages--;
-				put_page(page);
-				continue;
-			}
-			remap = false;
-			migrate->src[i] |= MIGRATE_PFN_LOCKED;
-		}

I'm a bit curious whether the deadlock mentioned in that comment has ever
been observed in reality.

If the pages are scanned within the same address space, the lock order
should logically be guaranteed (if both threads visit pages A and B, they
should lock them in the same order).  I think the order can change if
something explicitly rearranges the mappings (e.g. fork() plus mremap()
for anonymous memory here), but I just want to make sure I get the whole
point of it.

Thanks,
Alistair Popple Aug. 25, 2022, 10:09 p.m. UTC | #22
Peter Xu <peterx@redhat.com> writes:

> On Thu, Aug 25, 2022 at 11:24:03AM +1000, Alistair Popple wrote:
>> By the way it's still an optimisation because in most cases we can avoid
>> calling try_to_migrate() and walking the rmap altogether if we install
>> the migration entries here. But I agree the comment is misleading.
>
> There's one follow-up question on the trylock thing that I forgot to ask.  I
> figured I should ask it out loud while we're at it.
>
> Since migrate_vma_setup() only ever uses trylock (even before the prepare()
> code was dropped), does that mean it can randomly fail?

Yes, migration is always best effort and can randomly fail. For example
it can also fail because there are unexpected page references or pins.
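
To illustrate the "unexpected references" part, here is my own simplified
sketch (illustrative only -- the real check lives in
migrate_vma_check_page()): a page is only treated as migratable when its
refcount matches what the migration path itself expects to hold, so a GUP
pin or an extra mapping is enough to make that page fail to migrate.

	/* Illustrative only; not the kernel's actual logic. */
	static bool page_free_of_extra_refs(struct page *page, int expected_refs)
	{
		/* any pin or additional mapping pushes the count past expected_refs */
		return page_count(page) == expected_refs;
	}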

> I looked at some of the callers, and it seems not all of them are ready to
> handle that (__kvmppc_svm_page_out() or svm_migrate_vma_to_vram()).  Is it
> safe?  Do the callers always need to handle that properly (unless the
> migration is only best-effort, but that doesn't seem to always be the case)?

Migration is always best effort. Callers need to be prepared to handle
failure of a particular page to migrate, but I could believe not all of
them are.
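
As a hypothetical sketch of what "being prepared" looks like on the caller
side (illustrative only, not lifted from any real driver): after
migrate_vma_setup() a driver should skip every src entry that did not get
MIGRATE_PFN_MIGRATE set, and either retry those pages later or keep
servicing them from their current location.

	/* Hypothetical driver-side loop, for illustration only. */
	static void copy_collected_pages(struct migrate_vma *args)
	{
		unsigned long i;

		for (i = 0; i < args->npages; i++) {
			if (!(args->src[i] & MIGRATE_PFN_MIGRATE)) {
				/* not collected: retry later or leave it where it is */
				continue;
			}
			/* safe to set up the copy to args->dst[i] here */
		}
	}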

> Besides, while reading the old prepare() code I saw this comment:
>
> -		if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) {
> -			/*
> -			 * Because we are migrating several pages there can be
> -			 * a deadlock between 2 concurrent migration where each
> -			 * are waiting on each other page lock.
> -			 *
> -			 * Make migrate_vma() a best effort thing and backoff
> -			 * for any page we can not lock right away.
> -			 */
> -			if (!trylock_page(page)) {
> -				migrate->src[i] = 0;
> -				migrate->cpages--;
> -				put_page(page);
> -				continue;
> -			}
> -			remap = false;
> -			migrate->src[i] |= MIGRATE_PFN_LOCKED;
> -		}
>
> I'm a bit curious whether the deadlock mentioned in that comment has ever
> been observed in reality.
>
> If the pages are scanned within the same address space, the lock order
> should logically be guaranteed (if both threads visit pages A and B, they
> should lock them in the same order).  I think the order can change if
> something explicitly rearranges the mappings (e.g. fork() plus mremap()
> for anonymous memory here), but I just want to make sure I get the whole
> point of it.

You seem to have the point of it. The trylock_page() is to avoid
deadlock, and failure is always an option for migration. Drivers can
always retry if they really need the page to migrate, although success
is never guaranteed. For example the page might be pinned (or have
swap-cache allocated to it, but I'm hoping to at least get that fixed).

> Thanks,
Peter Xu Aug. 25, 2022, 11:36 p.m. UTC | #23
On Fri, Aug 26, 2022 at 08:09:28AM +1000, Alistair Popple wrote:
> > I looked at some of the callers, and it seems not all of them are ready to
> > handle that (__kvmppc_svm_page_out() or svm_migrate_vma_to_vram()).  Is it
> > safe?  Do the callers always need to handle that properly (unless the
> > migration is only best-effort, but that doesn't seem to always be the case)?
> 
> Migration is always best effort. Callers need to be prepared to handle
> failure of a particular page to migrate, but I could believe not all of
> them are.

OK, I see that the ppc list is in the loop, so hopefully people are aware of
this issue, since AFAICT ppc will SIGBUS when migrate_vma_setup() fails.
OTOH the svm code just dumps some device error (and I didn't check further
up the stack from there).

> 
> > Besides, while reading the old prepare() code I saw this comment:
> >
> > -		if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) {
> > -			/*
> > -			 * Because we are migrating several pages there can be
> > -			 * a deadlock between 2 concurrent migration where each
> > -			 * are waiting on each other page lock.
> > -			 *
> > -			 * Make migrate_vma() a best effort thing and backoff
> > -			 * for any page we can not lock right away.
> > -			 */
> > -			if (!trylock_page(page)) {
> > -				migrate->src[i] = 0;
> > -				migrate->cpages--;
> > -				put_page(page);
> > -				continue;
> > -			}
> > -			remap = false;
> > -			migrate->src[i] |= MIGRATE_PFN_LOCKED;
> > -		}
> >
> > I'm a bit curious whether the deadlock mentioned in that comment has ever
> > been observed in reality.
> >
> > If the pages are scanned within the same address space, the lock order
> > should logically be guaranteed (if both threads visit pages A and B, they
> > should lock them in the same order).  I think the order can change if
> > something explicitly rearranges the mappings (e.g. fork() plus mremap()
> > for anonymous memory here), but I just want to make sure I get the whole
> > point of it.
> 
> You seem to have the point of it. The trylock_page() is to avoid
> deadlock, and failure is always an option for migration. Drivers can
> always retry if they really need the page to migrate, although success
> is never guaranteed. For example the page might be pinned (or have
> swap-cache allocated to it, but I'm hoping to at least get that fixed).

If so, I'd suggest more straightforward documentation, either at this
trylock() or on the APIs (e.g. for migrate_vma_setup()).  This behavior is
IMHO hidden quite deep and many people may not realize it.  I'll comment on
the comment update patch.

Thanks.
diff mbox series

Patch

diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 27fb37d..e2d09e5 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -7,6 +7,7 @@ 
 #include <linux/export.h>
 #include <linux/memremap.h>
 #include <linux/migrate.h>
+#include <linux/mm.h>
 #include <linux/mm_inline.h>
 #include <linux/mmu_notifier.h>
 #include <linux/oom.h>
@@ -61,7 +62,7 @@  static int migrate_vma_collect_pmd(pmd_t *pmdp,
 	struct migrate_vma *migrate = walk->private;
 	struct vm_area_struct *vma = walk->vma;
 	struct mm_struct *mm = vma->vm_mm;
-	unsigned long addr = start, unmapped = 0;
+	unsigned long addr = start;
 	spinlock_t *ptl;
 	pte_t *ptep;
 
@@ -193,11 +194,10 @@  static int migrate_vma_collect_pmd(pmd_t *pmdp,
 			bool anon_exclusive;
 			pte_t swp_pte;
 
+			flush_cache_page(vma, addr, pte_pfn(*ptep));
+			pte = ptep_clear_flush(vma, addr, ptep);
 			anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
 			if (anon_exclusive) {
-				flush_cache_page(vma, addr, pte_pfn(*ptep));
-				ptep_clear_flush(vma, addr, ptep);
-
 				if (page_try_share_anon_rmap(page)) {
 					set_pte_at(mm, addr, ptep, pte);
 					unlock_page(page);
@@ -205,12 +205,14 @@  static int migrate_vma_collect_pmd(pmd_t *pmdp,
 					mpfn = 0;
 					goto next;
 				}
-			} else {
-				ptep_get_and_clear(mm, addr, ptep);
 			}
 
 			migrate->cpages++;
 
+			/* Set the dirty flag on the folio now the pte is gone. */
+			if (pte_dirty(pte))
+				folio_mark_dirty(page_folio(page));
+
 			/* Setup special migration page table entry */
 			if (mpfn & MIGRATE_PFN_WRITE)
 				entry = make_writable_migration_entry(
@@ -242,9 +244,6 @@  static int migrate_vma_collect_pmd(pmd_t *pmdp,
 			 */
 			page_remove_rmap(page, vma, false);
 			put_page(page);
-
-			if (pte_present(pte))
-				unmapped++;
 		} else {
 			put_page(page);
 			mpfn = 0;
@@ -257,10 +256,6 @@  static int migrate_vma_collect_pmd(pmd_t *pmdp,
 	arch_leave_lazy_mmu_mode();
 	pte_unmap_unlock(ptep - 1, ptl);
 
-	/* Only flush the TLB if we actually modified any entries */
-	if (unmapped)
-		flush_tlb_range(walk->vma, start, end);
-
 	return 0;
 }