
[v2,3/5] mmu_notifiers: Call invalidate_range() when invalidating TLBs

Message ID 8f293bb51a423afa71ddc3ba46e9f323ee9ffbc7.1689768831.git-series.apopple@nvidia.com (mailing list archive)
State New, archived
Series Invalidate secondary IOMMU TLB on permission upgrade

Commit Message

Alistair Popple July 19, 2023, 12:18 p.m. UTC
invalidate_range() is going to become an architecture specific mmu
notifier used to keep the TLB of secondary MMUs such as an IOMMU in
sync with the CPU page tables. Currently it is called from code paths
separate from the main CPU TLB invalidations. This can lead to a
secondary TLB not being invalidated when required and makes it hard
to reason about exactly when the secondary TLB is invalidated.

To fix this, move the notifier call into the architecture specific TLB
maintenance functions for architectures that have secondary MMUs
requiring explicit software invalidations.

This fixes an SMMU bug on ARM64. On ARM64, PTE permission upgrades
require a TLB invalidation. This invalidation is done by the
architecture specific ptep_set_access_flags(), which calls
flush_tlb_page() if required. However, this doesn't call the notifier,
resulting in infinite faults being generated by devices using the SMMU
if it has previously cached a read-only PTE in its TLB: the stale
read-only entry survives the CPU-only flush, so the device immediately
faults again.

Moving the invalidations into the TLB invalidation functions ensures
all invalidations happen at the same time as the CPU invalidation. The
architecture specific flush_tlb_all() routines do not call the
notifier as none of the IOMMUs require this.

Signed-off-by: Alistair Popple <apopple@nvidia.com>
Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
---
 arch/arm64/include/asm/tlbflush.h             | 5 +++++
 arch/powerpc/include/asm/book3s/64/tlbflush.h | 1 +
 arch/powerpc/mm/book3s64/radix_hugetlbpage.c  | 1 +
 arch/powerpc/mm/book3s64/radix_tlb.c          | 6 ++++++
 arch/x86/mm/tlb.c                             | 3 +++
 include/asm-generic/tlb.h                     | 1 -
 6 files changed, 16 insertions(+), 1 deletion(-)
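
For context, a secondary MMU driver consumes these invalidations through
the mmu notifier interface. Below is a minimal sketch of such a consumer
using the invalidate_range() callback this series builds on; the
my_iommu_* driver itself is hypothetical:

#include <linux/mmu_notifier.h>

static void my_iommu_invalidate_range(struct mmu_notifier *mn,
				      struct mm_struct *mm,
				      unsigned long start, unsigned long end)
{
	/* Hypothetical: shoot down the device TLB (IOTLB/ATC) entries
	 * covering [start, end) for the address space backing this mm. */
}

static const struct mmu_notifier_ops my_iommu_mn_ops = {
	.invalidate_range = my_iommu_invalidate_range,
};

/* Registered once per bound address space, e.g. at SVA bind time. */
static int my_iommu_bind_mm(struct mmu_notifier *mn, struct mm_struct *mm)
{
	mn->ops = &my_iommu_mn_ops;
	return mmu_notifier_register(mn, mm);
}

With this patch the callback fires from the same architecture TLB flush
paths that invalidate the CPU TLB, rather than from separate callers.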

Comments

SeongJae Park July 19, 2023, 10:51 p.m. UTC | #1
Hi Alistair,

On Wed, 19 Jul 2023 22:18:44 +1000 Alistair Popple <apopple@nvidia.com> wrote:

> The invalidate_range() is going to become an architecture specific mmu
> notifier used to keep the TLB of secondary MMUs such as an IOMMU in
> sync with the CPU page tables. Currently it is called from separate
> code paths to the main CPU TLB invalidations. This can lead to a
> secondary TLB not getting invalidated when required and makes it hard
> to reason about when exactly the secondary TLB is invalidated.
> 
> To fix this move the notifier call to the architecture specific TLB
> maintenance functions for architectures that have secondary MMUs
> requiring explicit software invalidations.
> 
> This fixes a SMMU bug on ARM64. On ARM64 PTE permission upgrades
> require a TLB invalidation. This invalidation is done by the
> architecutre specific ptep_set_access_flags() which calls
> flush_tlb_page() if required. However this doesn't call the notifier
> resulting in infinite faults being generated by devices using the SMMU
> if it has previously cached a read-only PTE in it's TLB.
> 
> Moving the invalidations into the TLB invalidation functions ensures
> all invalidations happen at the same time as the CPU invalidation. The
> architecture specific flush_tlb_all() routines do not call the
> notifier as none of the IOMMUs require this.
> 
> Signed-off-by: Alistair Popple <apopple@nvidia.com>
> Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>

I found the below kernel NULL-dereference issue on the latest mm-unstable
tree, and bisect points me to the commit of this patch, namely
75c400f82d347af1307010a3e06f3aa5d831d995.

To reproduce, I use 'stress-ng --bigheap $(nproc)'.  The issue happens as soon
as it starts reclaiming memory.  I haven't dived deep into this yet, but am
reporting it first, since you might already have an idea.


[   69.824805] BUG: kernel NULL pointer dereference, address: 0000000000000498
[   69.826983] #PF: supervisor read access in kernel mode
[   69.828716] #PF: error_code(0x0000) - not-present page
[   69.830249] PGD 0 P4D 0
[   69.830784] Oops: 0000 [#4] PREEMPT SMP PTI
[   69.831881] CPU: 2 PID: 201 Comm: kworker/u72:2 Tainted: G      D W          6.5.0-rc1+ #311
[   69.834221] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-pr4
[   69.837459] Workqueue: writeback wb_workfn (flush-251:0)
[   69.839422] RIP: 0010:arch_tlbbatch_flush (include/linux/mmu_notifier.h:497 (discriminator 3) arch/x86/mm/tlb.c:1268 (discriminator 3))
[ 69.841140] Code: e8 c5 dd d2 00 bf 01 00 00 00 e8 0b df 05 00 65 8b 05 cc 90 f9 63 85 c0 74 4b 65 48c

Code starting with the faulting instruction
===========================================
   0:   e8 c5 dd d2 00          callq  0xd2ddca
   5:   bf 01 00 00 00          mov    $0x1,%edi
   a:   e8 0b df 05 00          callq  0x5df1a
   f:   65 8b 05 cc 90 f9 63    mov    %gs:0x63f990cc(%rip),%eax        # 0x63f990e2
  16:   85 c0                   test   %eax,%eax
  18:   74 4b                   je     0x65
  1a:   65                      gs
  1b:   8c                      .byte 0x8c
[   69.846840] RSP: 0000:ffffbf77007fb048 EFLAGS: 00010286
[   69.848464] RAX: ffff9bfd81970000 RBX: 0000000000000002 RCX: 0000000000000000
[   69.851016] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000000000000000
[   69.853070] RBP: ffffbf77007fb068 R08: 0000000000000001 R09: 0000000000000000
[   69.855213] R10: ffff9bfd81970f10 R11: 0000000000000000 R12: ffff9bfd81970f10
[   69.857456] R13: ffff9c1bbd4b28c0 R14: 0000000000000003 R15: ffffe75b88f4a808
[   69.860213] FS:  0000000000000000(0000) GS:ffff9c1bbd480000(0000) knlGS:0000000000000000
[   69.862211] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   69.863624] CR2: 0000000000000498 CR3: 00000001112ca000 CR4: 00000000000006e0
[   69.865907] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   69.868146] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   69.870414] Call Trace:
[   69.871222]  <TASK>
[   69.871823] ? show_regs (arch/x86/kernel/dumpstack.c:479)
[   69.872855] ? __die_body (arch/x86/kernel/dumpstack.c:421)
[   69.873733] ? __die (arch/x86/kernel/dumpstack.c:435)
[   69.874576] ? page_fault_oops (arch/x86/mm/fault.c:707)
[   69.875657] ? search_bpf_extables (kernel/bpf/core.c:751)
[   69.876854] ? arch_tlbbatch_flush (include/linux/mmu_notifier.h:497 (discriminator 3) arch/x86/mm/tlb.c:1268 (discriminator 3))
[   69.878146] ? search_exception_tables (kernel/extable.c:64)
[   69.879931] ? kernelmode_fixup_or_oops (arch/x86/mm/fault.c:762)
[   69.881284] ? __bad_area_nosemaphore (arch/x86/mm/fault.c:860)
[   69.882722] ? bad_area_nosemaphore (arch/x86/mm/fault.c:867)
[   69.884276] ? do_user_addr_fault (arch/x86/mm/fault.c:1458)
[   69.885800] ? check_preemption_disabled (lib/smp_processor_id.c:42)
[   69.887428] ? exc_page_fault (arch/x86/include/asm/paravirt.h:695 arch/x86/mm/fault.c:1495 arch/x86/mm/fault.c:1543)
[   69.888435] ? asm_exc_page_fault (arch/x86/include/asm/idtentry.h:570)
[   69.889955] ? arch_tlbbatch_flush (include/linux/mmu_notifier.h:497 (discriminator 3) arch/x86/mm/tlb.c:1268 (discriminator 3))
[   69.891564] ? arch_tlbbatch_flush (arch/x86/include/asm/preempt.h:104 (discriminator 1) arch/x86/mm/tlb.c:1267 (discriminator 1))
[   69.892779] try_to_unmap_flush_dirty (mm/rmap.c:622 mm/rmap.c:632)
[   69.893970] shrink_folio_list (mm/vmscan.c:2015)
[   69.895243] shrink_inactive_list (include/linux/spinlock.h:376 mm/vmscan.c:2616)
[   69.896816] shrink_lruvec (mm/vmscan.c:2855 mm/vmscan.c:6303)
[   69.897952] ? rmqueue_bulk (mm/page_alloc.c:2228)
[   69.899385] shrink_node (mm/vmscan.c:6491 mm/vmscan.c:6524)
[   69.900447] do_try_to_free_pages (mm/vmscan.c:6705 mm/vmscan.c:6825)
[   69.901548] try_to_free_pages (mm/vmscan.c:7060)
[   69.902540] __alloc_pages_slowpath.constprop.0 (include/linux/sched/mm.h:380 mm/page_alloc.c:3716 mm/page_alloc.c:3735 mm/page_alloc.c:4140)
[   69.904307] ? sched_cpu_deactivate (kernel/sched/core.c:9728)
[   69.905427] __alloc_pages (mm/page_alloc.c:4525)
[   69.906735] alloc_pages (mm/mempolicy.c:2284)
[   69.907604] new_slab (mm/slub.c:1866 mm/slub.c:2009 mm/slub.c:2062)
[   69.908414] ? chksum_update (crypto/crc32c_generic.c:88)
[   69.909348] ___slab_alloc (mm/slub.c:3216 (discriminator 3))
[   69.910093] ? ext4_mb_new_blocks (fs/ext4/mballoc.c:5538 fs/ext4/mballoc.c:6129)
[   69.911885] ? __enqueue_entity (kernel/sched/fair.c:646)
[   69.913085] ? x2apic_send_IPI (arch/x86/include/asm/paravirt.h:196 arch/x86/include/asm/paravirt.h:229 arch/x86/include/asm/apic.h:240 arch/x86/kernel/apic/x2apic_phys.c:126 arch/x86/kernel/apic/x2apic_phys.c:48)
[   69.914143] ? native_smp_send_reschedule (arch/x86/kernel/apic/ipi.c:72)
[   69.915777] ? sbitmap_find_bit (lib/sbitmap.c:146 lib/sbitmap.c:178 lib/sbitmap.c:199)
[   69.917011] ? ext4_mb_new_blocks (fs/ext4/mballoc.c:5538 fs/ext4/mballoc.c:6129)
[   69.917996] __slab_alloc.isra.0 (mm/slub.c:3314)
[   69.919309] kmem_cache_alloc (mm/slub.c:3367 mm/slub.c:3460 mm/slub.c:3478 mm/slub.c:3485 mm/slub.c:3494)
[   69.920239] ? ext4_mb_new_blocks (fs/ext4/mballoc.c:5538 fs/ext4/mballoc.c:6129)
[   69.921291] ext4_mb_new_blocks (fs/ext4/mballoc.c:5538 fs/ext4/mballoc.c:6129)
[   69.922496] ? ext4_cache_extents (fs/ext4/extents.c:543 (discriminator 2))
[   69.923591] ext4_ext_map_blocks (fs/ext4/extents.c:4286)
[   69.925039] ext4_map_blocks (fs/ext4/inode.c:621)
[   69.926478] ? kmem_cache_alloc (arch/x86/include/asm/jump_label.h:55 include/linux/memcontrol.h:1777 mm/slab.h:522 mm/slab.h:770 mm/slub.c:3470 mm/slub.c:3478 mm/slub.c:3485 mm/slub.c:3494)
[   69.927813] ? ext4_alloc_io_end_vec (fs/ext4/page-io.c:61)
[   69.928851] ext4_do_writepages (fs/ext4/inode.c:2159 fs/ext4/inode.c:2212 fs/ext4/inode.c:2677)
[   69.930008] ? update_sd_lb_stats.constprop.0 (kernel/sched/fair.c:9501 kernel/sched/fair.c:10163)
[   69.931662] ext4_writepages (fs/ext4/inode.c:2766)
[   69.932728] do_writepages (mm/page-writeback.c:2553)
[   69.933403] ? __wb_calc_thresh (mm/page-writeback.c:875)
[   69.934168] __writeback_single_inode (fs/fs-writeback.c:1603)
[   69.935101] ? _raw_spin_unlock (arch/x86/include/asm/preempt.h:104 include/linux/spinlock_api_smp.h:143 kernel/locking/spinlock.c:186)
[   69.936219] writeback_sb_inodes (fs/fs-writeback.c:1896)
[   69.937408] __writeback_inodes_wb (fs/fs-writeback.c:1966)
[   69.938612] wb_writeback (fs/fs-writeback.c:2072)
[   69.939548] wb_workfn (fs/fs-writeback.c:2142 fs/fs-writeback.c:2230 fs/fs-writeback.c:2257)
[   69.940439] ? __switch_to (arch/x86/include/asm/paravirt.h:300 arch/x86/kernel/process_64.c:583)
[   69.941694] process_one_work (arch/x86/include/asm/jump_label.h:27 include/linux/jump_label.h:207 include/trace/events/workqueue.h:108 kernel/workqueue.c:2602)
[   69.943189] worker_thread (include/linux/list.h:292 kernel/workqueue.c:2749)
[   69.944403] ? __pfx_worker_thread (kernel/workqueue.c:2691)
[   69.945491] kthread (kernel/kthread.c:389)
[   69.946333] ? __pfx_kthread (kernel/kthread.c:342)
[   69.947109] ret_from_fork (arch/x86/entry/entry_64.S:314)
[   69.948097]  </TASK>
[   69.948944] Modules linked in: ppdev input_leds joydev parport_pc parport serio_raw qemu_fw_cfg mac_hi
[   69.958916] Dumping ftrace buffer:
[   69.961197]    (ftrace buffer empty)
[   69.962797] CR2: 0000000000000498
[   69.964431] ---[ end trace 0000000000000000 ]---
[   69.966440] RIP: 0010:arch_tlbbatch_flush (include/linux/mmu_notifier.h:497 (discriminator 3) arch/x86/mm/tlb.c:1268 (discriminator 3))
[ 69.969546] Code: e8 c5 dd d2 00 bf 01 00 00 00 e8 0b df 05 00 65 8b 05 cc 90 f9 63 85 c0 74 4b 65 48c

Code starting with the faulting instruction
===========================================
   0:   e8 c5 dd d2 00          callq  0xd2ddca
   5:   bf 01 00 00 00          mov    $0x1,%edi
   a:   e8 0b df 05 00          callq  0x5df1a
   f:   65 8b 05 cc 90 f9 63    mov    %gs:0x63f990cc(%rip),%eax        # 0x63f990e2
  16:   85 c0                   test   %eax,%eax
  18:   74 4b                   je     0x65
  1a:   65                      gs
  1b:   8c                      .byte 0x8c
[   69.979017] RSP: 0000:ffffbf770005f6d8 EFLAGS: 00010286
[   69.981894] RAX: ffff9bfd8090af80 RBX: 0000000000000003 RCX: 0000000000000000
[   69.985344] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000000000000000
[   69.988499] RBP: ffffbf770005f6f8 R08: 0000000000000001 R09: 0000000000000000
[   69.991677] R10: ffff9bfd8090be90 R11: 0000000000000000 R12: ffff9bfd8090be90
[   69.994926] R13: ffff9c1bbd4f28c0 R14: 0000000000000004 R15: ffffe75b84499d08
[   69.998152] FS:  0000000000000000(0000) GS:ffff9c1bbd480000(0000) knlGS:0000000000000000
[   70.001874] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   70.004419] CR2: 0000000000000498 CR3: 00000001112ca000 CR4: 00000000000006e0
[   70.007517] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   70.010951] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400



Thanks,
SJ

[...]
Alistair Popple July 20, 2023, 12:52 a.m. UTC | #2
SeongJae Park <sj@kernel.org> writes:

> Hi Alistair,
>
> On Wed, 19 Jul 2023 22:18:44 +1000 Alistair Popple <apopple@nvidia.com> wrote:
>
>> The invalidate_range() is going to become an architecture specific mmu
>> notifier used to keep the TLB of secondary MMUs such as an IOMMU in
>> sync with the CPU page tables. Currently it is called from separate
>> code paths to the main CPU TLB invalidations. This can lead to a
>> secondary TLB not getting invalidated when required and makes it hard
>> to reason about when exactly the secondary TLB is invalidated.
>> 
>> To fix this move the notifier call to the architecture specific TLB
>> maintenance functions for architectures that have secondary MMUs
>> requiring explicit software invalidations.
>> 
>> This fixes a SMMU bug on ARM64. On ARM64 PTE permission upgrades
>> require a TLB invalidation. This invalidation is done by the
>> architecutre specific ptep_set_access_flags() which calls
>> flush_tlb_page() if required. However this doesn't call the notifier
>> resulting in infinite faults being generated by devices using the SMMU
>> if it has previously cached a read-only PTE in it's TLB.
>> 
>> Moving the invalidations into the TLB invalidation functions ensures
>> all invalidations happen at the same time as the CPU invalidation. The
>> architecture specific flush_tlb_all() routines do not call the
>> notifier as none of the IOMMUs require this.
>> 
>> Signed-off-by: Alistair Popple <apopple@nvidia.com>
>> Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
>
> I found below kernel NULL-dereference issue on latest mm-unstable tree, and
> bisect points me to the commit of this patch, namely
> 75c400f82d347af1307010a3e06f3aa5d831d995.
>
> To reproduce, I use 'stress-ng --bigheap $(nproc)'.  The issue happens as soon
> as it starts reclaiming memory.  I didn't dive deep into this yet, but
> reporting this issue first, since you might have an idea already.

Thanks for the report SJ!

I see the problem: current->mm can (obviously!) be NULL, which is what's
leading to the NULL dereference; here the flush runs from a kworker, and
kernel threads have no user mm. Instead I think on x86 I need to call
the notifier when adding the invalidate to the tlbbatch in
arch_tlbbatch_add_pending(), which is equivalent to what ARM64 does.
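
For comparison, the arm64 batching hook already takes the mm being
unmapped when queueing the invalidation. Roughly (a paraphrase of the
arm64 code as of this series, not the exact source):

static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
					     struct mm_struct *mm,
					     unsigned long uaddr)
{
	/* Queues the TLBI and, with this series, notifies secondary
	 * MMUs using the mm passed in rather than current->mm. */
	__flush_tlb_page_nosync(mm, uaddr);
}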

The below should fix it. Will do a respin with this.

---

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 837e4a50281a..79c46da919b9 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -4,6 +4,7 @@
 
 #include <linux/mm_types.h>
 #include <linux/sched.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/processor.h>
 #include <asm/cpufeature.h>
@@ -282,6 +283,7 @@ static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *b
 {
 	inc_mm_tlb_gen(mm);
 	cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
+	mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
 }
 
 static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 0b990fb56b66..2d253919b3e8 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -1265,7 +1265,6 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 
 	put_flush_tlb_info();
 	put_cpu();
-	mmu_notifier_arch_invalidate_secondary_tlbs(current->mm, 0, -1UL);
 }
 
 /*
SeongJae Park July 20, 2023, 1:31 a.m. UTC | #3
On Thu, 20 Jul 2023 10:52:59 +1000 Alistair Popple <apopple@nvidia.com> wrote:

> 
> SeongJae Park <sj@kernel.org> writes:
> 
> > Hi Alistair,
> >
> > On Wed, 19 Jul 2023 22:18:44 +1000 Alistair Popple <apopple@nvidia.com> wrote:
> >
> >> The invalidate_range() is going to become an architecture specific mmu
> >> notifier used to keep the TLB of secondary MMUs such as an IOMMU in
> >> sync with the CPU page tables. Currently it is called from separate
> >> code paths to the main CPU TLB invalidations. This can lead to a
> >> secondary TLB not getting invalidated when required and makes it hard
> >> to reason about when exactly the secondary TLB is invalidated.
> >> 
> >> To fix this move the notifier call to the architecture specific TLB
> >> maintenance functions for architectures that have secondary MMUs
> >> requiring explicit software invalidations.
> >> 
> >> This fixes a SMMU bug on ARM64. On ARM64 PTE permission upgrades
> >> require a TLB invalidation. This invalidation is done by the
> >> architecutre specific ptep_set_access_flags() which calls
> >> flush_tlb_page() if required. However this doesn't call the notifier
> >> resulting in infinite faults being generated by devices using the SMMU
> >> if it has previously cached a read-only PTE in it's TLB.
> >> 
> >> Moving the invalidations into the TLB invalidation functions ensures
> >> all invalidations happen at the same time as the CPU invalidation. The
> >> architecture specific flush_tlb_all() routines do not call the
> >> notifier as none of the IOMMUs require this.
> >> 
> >> Signed-off-by: Alistair Popple <apopple@nvidia.com>
> >> Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
> >
> > I found below kernel NULL-dereference issue on latest mm-unstable tree, and
> > bisect points me to the commit of this patch, namely
> > 75c400f82d347af1307010a3e06f3aa5d831d995.
> >
> > To reproduce, I use 'stress-ng --bigheap $(nproc)'.  The issue happens as soon
> > as it starts reclaiming memory.  I didn't dive deep into this yet, but
> > reporting this issue first, since you might have an idea already.
> 
> Thanks for the report SJ!
> 
> I see the problem - current->mm can (obviously!) be NULL which is what's
> leading to the NULL dereference. Instead I think on x86 I need to call
> the notifier when adding the invalidate to the tlbbatch in
> arch_tlbbatch_add_pending() which is equivalent to what ARM64 does.
> 
> The below should fix it. Will do a respin with this.

Thank you for this quick reply!  I confirm this fixes my issue.


Tested-by: SeongJae Park <sj@kernel.org>

> 
> ---
> 
> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
> index 837e4a50281a..79c46da919b9 100644
> --- a/arch/x86/include/asm/tlbflush.h
> +++ b/arch/x86/include/asm/tlbflush.h
> @@ -4,6 +4,7 @@
>  
>  #include <linux/mm_types.h>
>  #include <linux/sched.h>
> +#include <linux/mmu_notifier.h>

Nit.  How about putting it between mm_types.h and sched.h, so that it looks
alphabetically sorted?
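
That is (illustrative):

 #include <linux/mm_types.h>
+#include <linux/mmu_notifier.h>
 #include <linux/sched.h>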

>  
>  #include <asm/processor.h>
>  #include <asm/cpufeature.h>
> @@ -282,6 +283,7 @@ static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *b
>  {
>  	inc_mm_tlb_gen(mm);
>  	cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
> +	mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
>  }
>  
>  static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index 0b990fb56b66..2d253919b3e8 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -1265,7 +1265,6 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
>  
>  	put_flush_tlb_info();
>  	put_cpu();
> -	mmu_notifier_arch_invalidate_secondary_tlbs(current->mm, 0, -1UL);
>  }
>  
>  /*
> 
> 


Thanks,
SJ
Luis Chamberlain July 24, 2023, 6:18 p.m. UTC | #4
Cc'ing fsdevel + xfs folks, as this fixes a regression seen when
testing XFS with generic/176.

On Thu, Jul 20, 2023 at 10:52:59AM +1000, Alistair Popple wrote:
> 
> SeongJae Park <sj@kernel.org> writes:
> 
> > Hi Alistair,
> >
> > On Wed, 19 Jul 2023 22:18:44 +1000 Alistair Popple <apopple@nvidia.com> wrote:
> >
> >> The invalidate_range() is going to become an architecture specific mmu
> >> notifier used to keep the TLB of secondary MMUs such as an IOMMU in
> >> sync with the CPU page tables. Currently it is called from separate
> >> code paths to the main CPU TLB invalidations. This can lead to a
> >> secondary TLB not getting invalidated when required and makes it hard
> >> to reason about when exactly the secondary TLB is invalidated.
> >> 
> >> To fix this move the notifier call to the architecture specific TLB
> >> maintenance functions for architectures that have secondary MMUs
> >> requiring explicit software invalidations.
> >> 
> >> This fixes a SMMU bug on ARM64. On ARM64 PTE permission upgrades
> >> require a TLB invalidation. This invalidation is done by the
> >> architecutre specific ptep_set_access_flags() which calls
> >> flush_tlb_page() if required. However this doesn't call the notifier
> >> resulting in infinite faults being generated by devices using the SMMU
> >> if it has previously cached a read-only PTE in it's TLB.
> >> 
> >> Moving the invalidations into the TLB invalidation functions ensures
> >> all invalidations happen at the same time as the CPU invalidation. The
> >> architecture specific flush_tlb_all() routines do not call the
> >> notifier as none of the IOMMUs require this.
> >> 
> >> Signed-off-by: Alistair Popple <apopple@nvidia.com>
> >> Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
> >
> > I found below kernel NULL-dereference issue on latest mm-unstable tree, and
> > bisect points me to the commit of this patch, namely
> > 75c400f82d347af1307010a3e06f3aa5d831d995.
> >
> > To reproduce, I use 'stress-ng --bigheap $(nproc)'.  The issue happens as soon
> > as it starts reclaiming memory.  I didn't dive deep into this yet, but
> > reporting this issue first, since you might have an idea already.
> 
> Thanks for the report SJ!
> 
> I see the problem - current->mm can (obviously!) be NULL which is what's
> leading to the NULL dereference. Instead I think on x86 I need to call
> the notifier when adding the invalidate to the tlbbatch in
> arch_tlbbatch_add_pending() which is equivalent to what ARM64 does.
> 
> The below should fix it. Will do a respin with this.
> 
> ---
> 
> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
> index 837e4a50281a..79c46da919b9 100644
> --- a/arch/x86/include/asm/tlbflush.h
> +++ b/arch/x86/include/asm/tlbflush.h
> @@ -4,6 +4,7 @@
>  
>  #include <linux/mm_types.h>
>  #include <linux/sched.h>
> +#include <linux/mmu_notifier.h>
>  
>  #include <asm/processor.h>
>  #include <asm/cpufeature.h>
> @@ -282,6 +283,7 @@ static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *b
>  {
>  	inc_mm_tlb_gen(mm);
>  	cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
> +	mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
>  }
>  
>  static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
> index 0b990fb56b66..2d253919b3e8 100644
> --- a/arch/x86/mm/tlb.c
> +++ b/arch/x86/mm/tlb.c
> @@ -1265,7 +1265,6 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
>  
>  	put_flush_tlb_info();
>  	put_cpu();
> -	mmu_notifier_arch_invalidate_secondary_tlbs(current->mm, 0, -1UL);
>  }
>  
>  /*

This patch also fixes a regression introduced in linux-next: the same
crash in arch_tlbbatch_flush() is reproducible with fstests generic/176
on XFS, and this patch resolves it [0]. It should also close out the
syzbot crash [1].

[0] https://gist.github.com/mcgrof/b37fc8cf7e6e1b3935242681de1a83e2
[1] https://lore.kernel.org/all/0000000000003afcb4060135a664@google.com/

Tested-by: Luis Chamberlain <mcgrof@kernel.org>

  Luis
Alistair Popple July 25, 2023, 12:20 a.m. UTC | #5
Luis Chamberlain <mcgrof@kernel.org> writes:

>> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
>> index 837e4a50281a..79c46da919b9 100644
>> --- a/arch/x86/include/asm/tlbflush.h
>> +++ b/arch/x86/include/asm/tlbflush.h
>> @@ -4,6 +4,7 @@
>>  
>>  #include <linux/mm_types.h>
>>  #include <linux/sched.h>
>> +#include <linux/mmu_notifier.h>
>>  
>>  #include <asm/processor.h>
>>  #include <asm/cpufeature.h>
>> @@ -282,6 +283,7 @@ static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *b
>>  {
>>  	inc_mm_tlb_gen(mm);
>>  	cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
>> +	mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
>>  }
>>  
>>  static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
>> diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
>> index 0b990fb56b66..2d253919b3e8 100644
>> --- a/arch/x86/mm/tlb.c
>> +++ b/arch/x86/mm/tlb.c
>> @@ -1265,7 +1265,6 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
>>  
>>  	put_flush_tlb_info();
>>  	put_cpu();
>> -	mmu_notifier_arch_invalidate_secondary_tlbs(current->mm, 0, -1UL);
>>  }
>>  
>>  /*
>
> This patch also fixes a regression introduced on linux-next, the same
> crash on arch_tlbbatch_flush() is reproducible with fstests generic/176
> on XFS. This patch fixes that regression [0]. This should also close out
> the syzbot crash too [1]
>
> [0] https://gist.github.com/mcgrof/b37fc8cf7e6e1b3935242681de1a83e2
> [1] https://lore.kernel.org/all/0000000000003afcb4060135a664@google.com/
>
> Tested-by: Luis Chamberlain <mcgrof@kernel.org>

Thanks Luis. The above fix/respin is already in yesterday's linux-next
(next-20230724), so hopefully you are no longer seeing issues.

>   Luis
Michael Ellerman July 25, 2023, 3:41 a.m. UTC | #6
Alistair Popple <apopple@nvidia.com> writes:
> The invalidate_range() is going to become an architecture specific mmu
> notifier used to keep the TLB of secondary MMUs such as an IOMMU in
> sync with the CPU page tables. Currently it is called from separate
> code paths to the main CPU TLB invalidations. This can lead to a
> secondary TLB not getting invalidated when required and makes it hard
> to reason about when exactly the secondary TLB is invalidated.
>
> To fix this move the notifier call to the architecture specific TLB
> maintenance functions for architectures that have secondary MMUs
> requiring explicit software invalidations.
>
> This fixes a SMMU bug on ARM64. On ARM64 PTE permission upgrades
> require a TLB invalidation. This invalidation is done by the
> architecutre specific ptep_set_access_flags() which calls
  ^
  architecture
  
> flush_tlb_page() if required. However this doesn't call the notifier
> resulting in infinite faults being generated by devices using the SMMU
> if it has previously cached a read-only PTE in it's TLB.
>
> Moving the invalidations into the TLB invalidation functions ensures
> all invalidations happen at the same time as the CPU invalidation. The
> architecture specific flush_tlb_all() routines do not call the
> notifier as none of the IOMMUs require this.
>
> Signed-off-by: Alistair Popple <apopple@nvidia.com>
> Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
> 
...

> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
> index 0bd4866..9724b26 100644
> --- a/arch/powerpc/mm/book3s64/radix_tlb.c
> +++ b/arch/powerpc/mm/book3s64/radix_tlb.c
> @@ -752,6 +752,8 @@ void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmadd
>  		return radix__local_flush_hugetlb_page(vma, vmaddr);
>  #endif
>  	radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize);
> +	mmu_notifier_invalidate_range(vma->vm_mm, vmaddr,
> +						vmaddr + mmu_virtual_psize);
>  }
>  EXPORT_SYMBOL(radix__local_flush_tlb_page);

I think we can skip calling the notifier there? It's explicitly a local flush.

cheers
Alistair Popple July 25, 2023, 5:51 a.m. UTC | #7
Michael Ellerman <mpe@ellerman.id.au> writes:

> Alistair Popple <apopple@nvidia.com> writes:
>> The invalidate_range() is going to become an architecture specific mmu
>> notifier used to keep the TLB of secondary MMUs such as an IOMMU in
>> sync with the CPU page tables. Currently it is called from separate
>> code paths to the main CPU TLB invalidations. This can lead to a
>> secondary TLB not getting invalidated when required and makes it hard
>> to reason about when exactly the secondary TLB is invalidated.
>>
>> To fix this move the notifier call to the architecture specific TLB
>> maintenance functions for architectures that have secondary MMUs
>> requiring explicit software invalidations.
>>
>> This fixes a SMMU bug on ARM64. On ARM64 PTE permission upgrades
>> require a TLB invalidation. This invalidation is done by the
>> architecutre specific ptep_set_access_flags() which calls
>   ^
>   architecture

Oh. I'd forgotten to apt install codespell ;-)
  
>> flush_tlb_page() if required. However this doesn't call the notifier
>> resulting in infinite faults being generated by devices using the SMMU
>> if it has previously cached a read-only PTE in it's TLB.
>>
>> Moving the invalidations into the TLB invalidation functions ensures
>> all invalidations happen at the same time as the CPU invalidation. The
>> architecture specific flush_tlb_all() routines do not call the
>> notifier as none of the IOMMUs require this.
>>
>> Signed-off-by: Alistair Popple <apopple@nvidia.com>
>> Suggested-by: Jason Gunthorpe <jgg@ziepe.ca>
>> 
> ...
>
>> diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
>> index 0bd4866..9724b26 100644
>> --- a/arch/powerpc/mm/book3s64/radix_tlb.c
>> +++ b/arch/powerpc/mm/book3s64/radix_tlb.c
>> @@ -752,6 +752,8 @@ void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmadd
>>  		return radix__local_flush_hugetlb_page(vma, vmaddr);
>>  #endif
>>  	radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize);
>> +	mmu_notifier_invalidate_range(vma->vm_mm, vmaddr,
>> +						vmaddr + mmu_virtual_psize);
>>  }
>>  EXPORT_SYMBOL(radix__local_flush_tlb_page);
>
> I think we can skip calling the notifier there? It's explicitly a local flush.

I suspect you're correct. It's been a while since I last worked on the
PPC TLB invalidation code though, and it's changed a fair bit since
then, so I was being conservative and would appreciate any comments
there. I was worried I might have missed some clever optimisation that
detects when a local flush is all that's needed, but I see OCXL calls
mm_context_add_copro(), so that should be ok. Will respin and drop it.
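
For reference, the kind of check I had in mind looks like this on radix
(a simplified sketch from memory; the real flush paths open-code it):

/* A purely local flush is only safe when no coprocessor (e.g. OCXL,
 * which calls mm_context_add_copro()) is attached to this mm. */
static inline bool mm_has_copros(struct mm_struct *mm)
{
	return atomic_read(&mm->context.copros) > 0;
}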

> cheers

Patch

diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 3456866..a99349d 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -13,6 +13,7 @@ 
 #include <linux/bitfield.h>
 #include <linux/mm_types.h>
 #include <linux/sched.h>
+#include <linux/mmu_notifier.h>
 #include <asm/cputype.h>
 #include <asm/mmu.h>
 
@@ -252,6 +253,7 @@  static inline void flush_tlb_mm(struct mm_struct *mm)
 	__tlbi(aside1is, asid);
 	__tlbi_user(aside1is, asid);
 	dsb(ish);
+	mmu_notifier_invalidate_range(mm, 0, -1UL);
 }
 
 static inline void __flush_tlb_page_nosync(struct mm_struct *mm,
@@ -263,6 +265,8 @@  static inline void __flush_tlb_page_nosync(struct mm_struct *mm,
 	addr = __TLBI_VADDR(uaddr, ASID(mm));
 	__tlbi(vale1is, addr);
 	__tlbi_user(vale1is, addr);
+	mmu_notifier_invalidate_range(mm, uaddr & PAGE_MASK,
+						(uaddr & PAGE_MASK) + PAGE_SIZE);
 }
 
 static inline void flush_tlb_page_nosync(struct vm_area_struct *vma,
@@ -396,6 +400,7 @@  static inline void __flush_tlb_range(struct vm_area_struct *vma,
 		scale++;
 	}
 	dsb(ish);
+	mmu_notifier_invalidate_range(vma->vm_mm, start, end);
 }
 
 static inline void flush_tlb_range(struct vm_area_struct *vma,
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h
index 0d0c144..dca0477 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
@@ -5,6 +5,7 @@ 
 #define MMU_NO_CONTEXT	~0UL
 
 #include <linux/mm_types.h>
+#include <linux/mmu_notifier.h>
 #include <asm/book3s/64/tlbflush-hash.h>
 #include <asm/book3s/64/tlbflush-radix.h>
 
diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
index 5e31955..f3fb49f 100644
--- a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
+++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
@@ -39,6 +39,7 @@  void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long st
 		radix__flush_tlb_pwc_range_psize(vma->vm_mm, start, end, psize);
 	else
 		radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize);
+	mmu_notifier_invalidate_range(vma->vm_mm, start, end);
 }
 
 void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c
index 0bd4866..9724b26 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -752,6 +752,8 @@  void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmadd
 		return radix__local_flush_hugetlb_page(vma, vmaddr);
 #endif
 	radix__local_flush_tlb_page_psize(vma->vm_mm, vmaddr, mmu_virtual_psize);
+	mmu_notifier_invalidate_range(vma->vm_mm, vmaddr,
+						vmaddr + mmu_virtual_psize);
 }
 EXPORT_SYMBOL(radix__local_flush_tlb_page);
 
@@ -987,6 +989,7 @@  void radix__flush_tlb_mm(struct mm_struct *mm)
 		}
 	}
 	preempt_enable();
+	mmu_notifier_invalidate_range(mm, 0, -1UL);
 }
 EXPORT_SYMBOL(radix__flush_tlb_mm);
 
@@ -1020,6 +1023,7 @@  static void __flush_all_mm(struct mm_struct *mm, bool fullmm)
 			_tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL);
 	}
 	preempt_enable();
+	mmu_notifier_invalidate_range(mm, 0, -1UL);
 }
 
 void radix__flush_all_mm(struct mm_struct *mm)
@@ -1228,6 +1232,7 @@  static inline void __radix__flush_tlb_range(struct mm_struct *mm,
 	}
 out:
 	preempt_enable();
+	mmu_notifier_invalidate_range(mm, start, end);
 }
 
 void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
@@ -1392,6 +1397,7 @@  static void __radix__flush_tlb_range_psize(struct mm_struct *mm,
 	}
 out:
 	preempt_enable();
+	mmu_notifier_invalidate_range(mm, start, end);
 }
 
 void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 267acf2..c30fbcd 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -10,6 +10,7 @@ 
 #include <linux/debugfs.h>
 #include <linux/sched/smt.h>
 #include <linux/task_work.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -1036,6 +1037,7 @@  void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 
 	put_flush_tlb_info();
 	put_cpu();
+	mmu_notifier_invalidate_range(mm, start, end);
 }
 
 
@@ -1263,6 +1265,7 @@  void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 
 	put_flush_tlb_info();
 	put_cpu();
+	mmu_notifier_invalidate_range(current->mm, 0, -1UL);
 }
 
 /*
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index b466172..bc32a22 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -456,7 +456,6 @@  static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
 		return;
 
 	tlb_flush(tlb);
-	mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
 	__tlb_reset_range(tlb);
 }