Message ID | 1592892828-1934-8-git-send-email-iamjoonsoo.kim@lge.com
---|---
State | New, archived
Series | clean-up the migration target allocation functions
On Tue 23-06-20 15:13:47, Joonsoo Kim wrote: > From: Joonsoo Kim <iamjoonsoo.kim@lge.com> > > There is a well-defined migration target allocation callback. > Use it. > > Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com> Acked-by: Michal Hocko <mhocko@suse.com> > --- > mm/internal.h | 1 - > mm/mempolicy.c | 30 ++++++------------------------ > mm/migrate.c | 8 ++++++-- > 3 files changed, 12 insertions(+), 27 deletions(-) > > diff --git a/mm/internal.h b/mm/internal.h > index fb7f7fe..4f9f6b6 100644 > --- a/mm/internal.h > +++ b/mm/internal.h > @@ -613,7 +613,6 @@ static inline bool is_migrate_highatomic_page(struct page *page) > } > > void setup_zone_pageset(struct zone *zone); > -extern struct page *alloc_new_node_page(struct page *page, unsigned long node); > > struct migration_target_control { > int nid; /* preferred node id */ > diff --git a/mm/mempolicy.c b/mm/mempolicy.c > index a3abf64..85a3f21 100644 > --- a/mm/mempolicy.c > +++ b/mm/mempolicy.c > @@ -1065,28 +1065,6 @@ static int migrate_page_add(struct page *page, struct list_head *pagelist, > return 0; > } > > -/* page allocation callback for NUMA node migration */ > -struct page *alloc_new_node_page(struct page *page, unsigned long node) > -{ > - if (PageHuge(page)) { > - return alloc_huge_page_nodemask( > - page_hstate(compound_head(page)), node, > - NULL, __GFP_THISNODE, false); > - } else if (PageTransHuge(page)) { > - struct page *thp; > - > - thp = alloc_pages_node(node, > - (GFP_TRANSHUGE | __GFP_THISNODE), > - HPAGE_PMD_ORDER); > - if (!thp) > - return NULL; > - prep_transhuge_page(thp); > - return thp; > - } else > - return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE | > - __GFP_THISNODE, 0); > -} > - > /* > * Migrate pages from one node to a target node. > * Returns error or the number of pages not migrated. > @@ -1097,6 +1075,10 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, > nodemask_t nmask; > LIST_HEAD(pagelist); > int err = 0; > + struct migration_target_control mtc = { > + .nid = dest, > + .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, > + }; > > nodes_clear(nmask); > node_set(source, nmask); > @@ -1111,8 +1093,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, > flags | MPOL_MF_DISCONTIG_OK, &pagelist); > > if (!list_empty(&pagelist)) { > - err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest, > - MIGRATE_SYNC, MR_SYSCALL); > + err = migrate_pages(&pagelist, alloc_migration_target, NULL, > + (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL); > if (err) > putback_movable_pages(&pagelist); > } > diff --git a/mm/migrate.c b/mm/migrate.c > index 7c4cd74..1c943b0 100644 > --- a/mm/migrate.c > +++ b/mm/migrate.c > @@ -1590,9 +1590,13 @@ static int do_move_pages_to_node(struct mm_struct *mm, > struct list_head *pagelist, int node) > { > int err; > + struct migration_target_control mtc = { > + .nid = node, > + .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, > + }; > > - err = migrate_pages(pagelist, alloc_new_node_page, NULL, node, > - MIGRATE_SYNC, MR_SYSCALL); > + err = migrate_pages(pagelist, alloc_migration_target, NULL, > + (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL); > if (err) > putback_movable_pages(pagelist); > return err; > -- > 2.7.4
On 6/23/20 8:13 AM, js1304@gmail.com wrote: > From: Joonsoo Kim <iamjoonsoo.kim@lge.com> > > There is a well-defined migration target allocation callback. > Use it. > > Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com> Acked-by: Vlastimil Babka <vbabka@suse.cz> I like that this removes the wrapper completely.
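[Aside, not part of the thread: the "well-defined migration target allocation callback" referred to above is mm/migrate.c's alloc_migration_target(), which reads its policy out of the migration_target_control passed through migrate_pages()'s opaque private argument. The sketch below shows roughly what that callback looks like around the time of this series; it is a simplification, and details such as the nmask member (the struct is only partially visible in the quoted hunks) and the exact gfp/zone handling may differ from the code that was actually merged.]

	struct page *alloc_migration_target(struct page *page, unsigned long private)
	{
		struct migration_target_control *mtc;
		gfp_t gfp_mask;
		unsigned int order = 0;
		struct page *new_page = NULL;
		int nid;

		/* All per-caller policy arrives in the control structure. */
		mtc = (struct migration_target_control *)private;
		gfp_mask = mtc->gfp_mask;
		nid = mtc->nid;
		if (nid == NUMA_NO_NODE)
			nid = page_to_nid(page);

		if (PageHuge(page)) {
			/* hugetlb pages come from the hstate pools, not the buddy. */
			struct hstate *h = page_hstate(compound_head(page));

			return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
		}

		if (PageTransHuge(page)) {
			gfp_mask |= GFP_TRANSHUGE;
			order = HPAGE_PMD_ORDER;
		}

		new_page = __alloc_pages_nodemask(gfp_mask, order, nid, mtc->nmask);
		if (new_page && PageTransHuge(new_page))
			prep_transhuge_page(new_page);

		return new_page;
	}

The mempolicy and move_pages() callers converted by this patch only need to fill in .nid and .gfp_mask, which is why the alloc_new_node_page() wrapper can disappear entirely.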
On Tue 07-07-20 21:20:44, Qian Cai wrote: [...] > migrate_pages() starts failing like this apparently using the new > callback on NUMA systems, > > [ 6147.019063][T45242] LTP: starting move_pages12 > [ 6147.475680][T64921] BUG: unable to handle page fault for address: ffffffffffffffe0 Hmm, this looks like -EPIPE (-32) which is unexpected to say the least. Does the test pass without this patch applied? Also there has been v4 posted just yesterday. Does it suffer from the same problem?
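[Aside, not part of the thread: the errno resemblance is only that the faulting address equals -32 when interpreted as a signed 64-bit value, which is what suggests -EPIPE. A stand-alone userspace snippet (not kernel code) makes the arithmetic explicit:]

	#include <stdio.h>

	int main(void)
	{
		/*
		 * 0xffffffffffffffe0 viewed as a signed 64-bit value is -32, which is
		 * why it superficially resembles -EPIPE; as worked out further down
		 * the thread, it is really a load at offset -0x20 from a near-NULL
		 * pointer (a container_of()-style access), not an errno at all.
		 */
		printf("%ld\n", (long)0xffffffffffffffe0UL);	/* prints -32 */
		return 0;
	}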
On Tue, 7 Jul 2020, Qian Cai wrote: > On Tue, Jun 23, 2020 at 03:13:47PM +0900, js1304@gmail.com wrote: > > From: Joonsoo Kim <iamjoonsoo.kim@lge.com> > > > > There is a well-defined migration target allocation callback. > > Use it. > > > > Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com> > > --- ... > > migrate_pages() starts failing like this apparently using the new > callback on NUMA systems, > > [ 6147.019063][T45242] LTP: starting move_pages12 > [ 6147.475680][T64921] BUG: unable to handle page fault for address: ffffffffffffffe0 > [ 6147.483301][T64921] #PF: supervisor read access in kernel mode > [ 6147.489170][T64921] #PF: error_code(0x0000) - not-present page > [ 6147.495040][T64921] PGD 5df817067 P4D 5df817067 PUD 5df819067 PMD 0 > [ 6147.501438][T64921] Oops: 0000 [#1] SMP KASAN NOPTI > [ 6147.506348][T64921] CPU: 35 PID: 64921 Comm: move_pages12 Tainted: G O 5.8.0-rc4-next-20200707 #1 > [ 6147.516586][T64921] Hardware name: HPE ProLiant DL385 Gen10/ProLiant DL385 Gen10, BIOS A40 07/10/2019 > [ 6147.525866][T64921] RIP: 0010:anon_vma_interval_tree_iter_first+0xa2/0x170 > avc_start_pgoff at mm/interval_tree.c:63 > (inlined by) __anon_vma_interval_tree_iter_first at mm/interval_tree.c:71 > (inlined by) anon_vma_interval_tree_iter_first at mm/interval_tree.c:95 > [ 6147.532787][T64921] Code: 00 00 00 00 fc ff df 48 8b 6d 08 48 8d 7d e0 48 89 f9 48 c1 e9 03 80 3c 01 00 0f 85 b3 00 00 00 48 b8 00 00 00 00 00 fc ff df <48> 8b 6d e0 48 8d bd 98 00 00 00 48 89 f9 48 c1 e9 03 80 3c 01 00 > [ 6147.552370][T64921] RSP: 0018:ffffc9000bfdfa98 EFLAGS: 00010246 > [ 6147.558327][T64921] RAX: dffffc0000000000 RBX: ffff888524019b28 RCX: 1ffffffffffffffc > [ 6147.566205][T64921] RDX: 00000000000003ff RSI: 0000000000000200 RDI: ffffffffffffffe0 > [ 6147.574084][T64921] RBP: 0000000000000000 R08: fffff94002b1c001 R09: fffff94002b1c001 > [ 6147.581962][T64921] R10: ffffea00158e0007 R11: fffff94002b1c000 R12: 0000000000000009 > [ 6147.589839][T64921] R13: ffffea00158e0008 R14: ffffea00158e0000 R15: ffffea00158e0000 > [ 6147.597717][T64921] FS: 00007f329cf18b80(0000) GS:ffff88881f7c0000(0000) knlGS:0000000000000000 > [ 6147.606557][T64921] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > [ 6147.613037][T64921] CR2: ffffffffffffffe0 CR3: 000000081122a000 CR4: 00000000003506e0 > [ 6147.620914][T64921] Call Trace: > [ 6147.624078][T64921] rmap_walk_anon+0x141/0xa30 > rmap_walk_anon at mm/rmap.c:1864 > [ 6147.628639][T64921] try_to_unmap+0x209/0x2d0 > try_to_unmap at mm/rmap.c:1763 > [ 6147.633026][T64921] ? rmap_walk_locked+0x140/0x140 > [ 6147.637936][T64921] ? page_remove_rmap+0x1190/0x1190 > [ 6147.643020][T64921] ? page_not_mapped+0x10/0x10 > [ 6147.647668][T64921] ? page_get_anon_vma+0x290/0x290 > [ 6147.652664][T64921] ? page_mapcount_is_zero+0x10/0x10 > [ 6147.657838][T64921] ? hugetlb_page_mapping_lock_write+0x97/0x180 > [ 6147.663972][T64921] migrate_pages+0x1005/0x1fb0 > unmap_and_move_huge_page at mm/migrate.c:1383 > (inlined by) migrate_pages at mm/migrate.c:1468 > [ 6147.668617][T64921] ? remove_migration_pte+0xac0/0xac0 > [ 6147.673875][T64921] move_pages_and_store_status.isra.47+0xd7/0x1a0 > do_move_pages_to_node at mm/migrate.c:1595 > (inlined by) move_pages_and_store_status at mm/migrate.c:1683 > [ 6147.680181][T64921] ? migrate_pages+0x1fb0/0x1fb0 > [ 6147.685002][T64921] __x64_sys_move_pages+0xa5c/0x1100 > [ 6147.690176][T64921] ? trace_hardirqs_on+0x20/0x1b5 > [ 6147.695084][T64921] ? move_pages_and_store_status.isra.47+0x1a0/0x1a0 > [ 6147.701653][T64921] ? 
rcu_read_lock_sched_held+0xaa/0xd0 > [ 6147.707088][T64921] ? switch_fpu_return+0x196/0x400 > [ 6147.712083][T64921] ? lockdep_hardirqs_on_prepare+0x38c/0x550 > [ 6147.717954][T64921] ? do_syscall_64+0x24/0x310 > [ 6147.722513][T64921] do_syscall_64+0x5f/0x310 > [ 6147.726897][T64921] ? trace_hardirqs_off+0x12/0x1a0 > [ 6147.731894][T64921] ? asm_exc_page_fault+0x8/0x30 > [ 6147.736714][T64921] entry_SYSCALL_64_after_hwframe+0x44/0xa9 > [ 6147.742495][T64921] RIP: 0033:0x7f329c3fe6ed > [ 6147.746791][T64921] Code: Bad RIP value. > [ 6147.750738][T64921] RSP: 002b:00007fff5b6b5f88 EFLAGS: 00000246 ORIG_RAX: 0000000000000117 > [ 6147.759055][T64921] RAX: ffffffffffffffda RBX: 00007f329cf18af8 RCX: 00007f329c3fe6ed > [ 6147.766933][T64921] RDX: 00000000019b0ee0 RSI: 0000000000000400 RDI: 000000000000fd98 > [ 6147.774809][T64921] RBP: 0000000000000400 R08: 00000000019b3f00 R09: 0000000000000004 > [ 6147.782686][T64921] R10: 00000000019b2ef0 R11: 0000000000000246 R12: 0000000000000400 > [ 6147.790563][T64921] R13: 00000000019b0ee0 R14: 00000000019b2ef0 R15: 00000000019b3f00 > [ 6147.798440][T64921] Modules linked in: vfio_pci vfio_virqfd vfio_iommu_type1 vfio loop kvm_amd ses enclosure kvm irqbypass efivars acpi_cpufreq nls_ascii nls_cp437 vfat fat efivarfs ip_tables x_tables sd_mod smartpqi scsi_transport_sas tg3 mlx5_core libphy firmware_class dm_mirror dm_region_hash dm_log dm_mod [last unloaded: dummy_del_mod] > [ 6147.828701][T64921] CR2: ffffffffffffffe0 > [ 6147.832736][T64921] ---[ end trace 40323b256f1c74a8 ]--- > [ 6147.838083][T64921] RIP: 0010:anon_vma_interval_tree_iter_first+0xa2/0x170 > [ 6147.845001][T64921] Code: 00 00 00 00 fc ff df 48 8b 6d 08 48 8d 7d e0 48 89 f9 48 c1 e9 03 80 3c 01 00 0f 85 b3 00 00 00 48 b8 00 00 00 00 00 fc ff df <48> 8b 6d e0 48 8d bd 98 00 00 00 48 89 f9 48 c1 e9 03 80 3c 01 00 > [ 6147.864583][T64921] RSP: 0018:ffffc9000bfdfa98 EFLAGS: 00010246 > [ 6147.870539][T64921] RAX: dffffc0000000000 RBX: ffff888524019b28 RCX: 1ffffffffffffffc > [ 6147.878417][T64921] RDX: 00000000000003ff RSI: 0000000000000200 RDI: ffffffffffffffe0 > [ 6147.886294][T64921] RBP: 0000000000000000 R08: fffff94002b1c001 R09: fffff94002b1c001 > [ 6147.894172][T64921] R10: ffffea00158e0007 R11: fffff94002b1c000 R12: 0000000000000009 > [ 6147.902049][T64921] R13: ffffea00158e0008 R14: ffffea00158e0000 R15: ffffea00158e0000 > [ 6147.909932][T64921] FS: 00007f329cf18b80(0000) GS:ffff88881f7c0000(0000) knlGS:0000000000000000 > [ 6147.918771][T64921] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > [ 6147.925251][T64921] CR2: ffffffffffffffe0 CR3: 000000081122a000 CR4: 00000000003506e0 > [ 6147.933130][T64921] Kernel panic - not syncing: Fatal exception > [ 6147.939493][T64921] Kernel Offset: 0x28c00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) > [ 6147.951090][T64921] ---[ end Kernel panic - not syncing: Fatal exception ]--- I hit this too, trying LTP (yes, move_pages12) on 5.9-rc8. I could not then reproduce it, and suppose that Qian Cai was also unable to do so. But it's fairly easy to explain, and not at all related to Joonsoo's patch or patchset accused in this thread. The faulting address ffffffffffffffe0 is not an -ESPIPE here, it comes from a mov -0x20(%rcx),%rcx with NULL %rcx in my case: I've been too impatient to unravel the interval tree defining to work out exactly what that corresponds to, but I assume from an anon_vma_interval_tree rb_entry container_of. Mike, j'accuse... 
your 5.7 commit c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization"), in which unmap_and_move_huge_page() now passes the TTU_RMAP_LOCKED flag to try_to_unmap(), because it's already holding mapping->i_mmap_rwsem: but that is not the right lock to secure an anon_vma lookup.

I intended to send a patch, passing TTU_RMAP_LOCKED only in the !PageAnon case (and, see vma_adjust(), anon_vma lock conveniently nests inside i_mmap_rwsem); but then wondered if i_mmap_rwsem was needed in that case or not, so looked deeper into c0d0381ade79.

Hmm, not even you liked it! But the worst of it looks simply unnecessary to me, and I hope can be deleted - but better by you than by me (in particular, you were trying to kill 1) and 2) birds with one stone, and I've always given up on understanding hugetlb's reservations: I suspect that side of it is irrelevant here, but I wouldn't pretend to be sure).

How could you ever find a PageAnon page in a vma_shareable() area?

It is all rather confusing (vma_shareable() depending on VM_MAYSHARE, whereas is_cow_mapping() on VM_SHARED and VM_MAYWRITE: they have to be studied together with do_mmap()'s

	vm_flags |= VM_SHARED | VM_MAYSHARE;
	if (!(file->f_mode & FMODE_WRITE))
		vm_flags &= ~(VM_MAYWRITE | VM_SHARED);

(And let me add to the confusion by admitting that, prior to 3.15's cda540ace6a1 "mm: get_user_pages(write,force) refuse to COW in shared areas", maybe it was possible to find a PageAnon there.)

But my belief (best confirmed by you running your tests with a suitably placed BUG_ON or WARN_ON) is that you'll never find a PageAnon in a vma_shareable() area, so will never need try_to_unmap() to unshare a pagetable in the PageAnon case, so won't need i_mmap_rwsem for PageAnon there, and _get_hugetlb_page_mapping() (your function that deduces an address_space from an anon_vma) can just be deleted.

(And in passing, may I ask what hugetlb_page_mapping_lock_write()'s hpage->_mapcount inc and dec are for? You comment it as a hack, but don't explain what needs that hack, and I don't see it.)

It does seem sad to be unsharing the page table in any case of try_to_unmap(): the only reason for that I can see (if more care were taken with TLB flushing) is the mmu_notifier situation - I don't think we have any suitable mmu_notifier to alert all the mms to be affected.

Hugh
On 10/7/20 8:21 PM, Hugh Dickins wrote: > On Tue, 7 Jul 2020, Qian Cai wrote: >> On Tue, Jun 23, 2020 at 03:13:47PM +0900, js1304@gmail.com wrote: >>> From: Joonsoo Kim <iamjoonsoo.kim@lge.com> >>> >>> There is a well-defined migration target allocation callback. >>> Use it. >>> >>> Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com> >>> --- > ... >> >> migrate_pages() starts failing like this apparently using the new >> callback on NUMA systems, >> >> [ 6147.019063][T45242] LTP: starting move_pages12 >> [ 6147.475680][T64921] BUG: unable to handle page fault for address: ffffffffffffffe0 >> [ 6147.483301][T64921] #PF: supervisor read access in kernel mode >> [ 6147.489170][T64921] #PF: error_code(0x0000) - not-present page >> [ 6147.495040][T64921] PGD 5df817067 P4D 5df817067 PUD 5df819067 PMD 0 >> [ 6147.501438][T64921] Oops: 0000 [#1] SMP KASAN NOPTI >> [ 6147.506348][T64921] CPU: 35 PID: 64921 Comm: move_pages12 Tainted: G O 5.8.0-rc4-next-20200707 #1 >> [ 6147.516586][T64921] Hardware name: HPE ProLiant DL385 Gen10/ProLiant DL385 Gen10, BIOS A40 07/10/2019 >> [ 6147.525866][T64921] RIP: 0010:anon_vma_interval_tree_iter_first+0xa2/0x170 >> avc_start_pgoff at mm/interval_tree.c:63 >> (inlined by) __anon_vma_interval_tree_iter_first at mm/interval_tree.c:71 >> (inlined by) anon_vma_interval_tree_iter_first at mm/interval_tree.c:95 >> [ 6147.532787][T64921] Code: 00 00 00 00 fc ff df 48 8b 6d 08 48 8d 7d e0 48 89 f9 48 c1 e9 03 80 3c 01 00 0f 85 b3 00 00 00 48 b8 00 00 00 00 00 fc ff df <48> 8b 6d e0 48 8d bd 98 00 00 00 48 89 f9 48 c1 e9 03 80 3c 01 00 >> [ 6147.552370][T64921] RSP: 0018:ffffc9000bfdfa98 EFLAGS: 00010246 >> [ 6147.558327][T64921] RAX: dffffc0000000000 RBX: ffff888524019b28 RCX: 1ffffffffffffffc >> [ 6147.566205][T64921] RDX: 00000000000003ff RSI: 0000000000000200 RDI: ffffffffffffffe0 >> [ 6147.574084][T64921] RBP: 0000000000000000 R08: fffff94002b1c001 R09: fffff94002b1c001 >> [ 6147.581962][T64921] R10: ffffea00158e0007 R11: fffff94002b1c000 R12: 0000000000000009 >> [ 6147.589839][T64921] R13: ffffea00158e0008 R14: ffffea00158e0000 R15: ffffea00158e0000 >> [ 6147.597717][T64921] FS: 00007f329cf18b80(0000) GS:ffff88881f7c0000(0000) knlGS:0000000000000000 >> [ 6147.606557][T64921] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 >> [ 6147.613037][T64921] CR2: ffffffffffffffe0 CR3: 000000081122a000 CR4: 00000000003506e0 >> [ 6147.620914][T64921] Call Trace: >> [ 6147.624078][T64921] rmap_walk_anon+0x141/0xa30 >> rmap_walk_anon at mm/rmap.c:1864 >> [ 6147.628639][T64921] try_to_unmap+0x209/0x2d0 >> try_to_unmap at mm/rmap.c:1763 >> [ 6147.633026][T64921] ? rmap_walk_locked+0x140/0x140 >> [ 6147.637936][T64921] ? page_remove_rmap+0x1190/0x1190 >> [ 6147.643020][T64921] ? page_not_mapped+0x10/0x10 >> [ 6147.647668][T64921] ? page_get_anon_vma+0x290/0x290 >> [ 6147.652664][T64921] ? page_mapcount_is_zero+0x10/0x10 >> [ 6147.657838][T64921] ? hugetlb_page_mapping_lock_write+0x97/0x180 >> [ 6147.663972][T64921] migrate_pages+0x1005/0x1fb0 >> unmap_and_move_huge_page at mm/migrate.c:1383 >> (inlined by) migrate_pages at mm/migrate.c:1468 >> [ 6147.668617][T64921] ? remove_migration_pte+0xac0/0xac0 >> [ 6147.673875][T64921] move_pages_and_store_status.isra.47+0xd7/0x1a0 >> do_move_pages_to_node at mm/migrate.c:1595 >> (inlined by) move_pages_and_store_status at mm/migrate.c:1683 >> [ 6147.680181][T64921] ? migrate_pages+0x1fb0/0x1fb0 >> [ 6147.685002][T64921] __x64_sys_move_pages+0xa5c/0x1100 >> [ 6147.690176][T64921] ? trace_hardirqs_on+0x20/0x1b5 >> [ 6147.695084][T64921] ? 
move_pages_and_store_status.isra.47+0x1a0/0x1a0 >> [ 6147.701653][T64921] ? rcu_read_lock_sched_held+0xaa/0xd0 >> [ 6147.707088][T64921] ? switch_fpu_return+0x196/0x400 >> [ 6147.712083][T64921] ? lockdep_hardirqs_on_prepare+0x38c/0x550 >> [ 6147.717954][T64921] ? do_syscall_64+0x24/0x310 >> [ 6147.722513][T64921] do_syscall_64+0x5f/0x310 >> [ 6147.726897][T64921] ? trace_hardirqs_off+0x12/0x1a0 >> [ 6147.731894][T64921] ? asm_exc_page_fault+0x8/0x30 >> [ 6147.736714][T64921] entry_SYSCALL_64_after_hwframe+0x44/0xa9 >> [ 6147.742495][T64921] RIP: 0033:0x7f329c3fe6ed >> [ 6147.746791][T64921] Code: Bad RIP value. >> [ 6147.750738][T64921] RSP: 002b:00007fff5b6b5f88 EFLAGS: 00000246 ORIG_RAX: 0000000000000117 >> [ 6147.759055][T64921] RAX: ffffffffffffffda RBX: 00007f329cf18af8 RCX: 00007f329c3fe6ed >> [ 6147.766933][T64921] RDX: 00000000019b0ee0 RSI: 0000000000000400 RDI: 000000000000fd98 >> [ 6147.774809][T64921] RBP: 0000000000000400 R08: 00000000019b3f00 R09: 0000000000000004 >> [ 6147.782686][T64921] R10: 00000000019b2ef0 R11: 0000000000000246 R12: 0000000000000400 >> [ 6147.790563][T64921] R13: 00000000019b0ee0 R14: 00000000019b2ef0 R15: 00000000019b3f00 >> [ 6147.798440][T64921] Modules linked in: vfio_pci vfio_virqfd vfio_iommu_type1 vfio loop kvm_amd ses enclosure kvm irqbypass efivars acpi_cpufreq nls_ascii nls_cp437 vfat fat efivarfs ip_tables x_tables sd_mod smartpqi scsi_transport_sas tg3 mlx5_core libphy firmware_class dm_mirror dm_region_hash dm_log dm_mod [last unloaded: dummy_del_mod] >> [ 6147.828701][T64921] CR2: ffffffffffffffe0 >> [ 6147.832736][T64921] ---[ end trace 40323b256f1c74a8 ]--- >> [ 6147.838083][T64921] RIP: 0010:anon_vma_interval_tree_iter_first+0xa2/0x170 >> [ 6147.845001][T64921] Code: 00 00 00 00 fc ff df 48 8b 6d 08 48 8d 7d e0 48 89 f9 48 c1 e9 03 80 3c 01 00 0f 85 b3 00 00 00 48 b8 00 00 00 00 00 fc ff df <48> 8b 6d e0 48 8d bd 98 00 00 00 48 89 f9 48 c1 e9 03 80 3c 01 00 >> [ 6147.864583][T64921] RSP: 0018:ffffc9000bfdfa98 EFLAGS: 00010246 >> [ 6147.870539][T64921] RAX: dffffc0000000000 RBX: ffff888524019b28 RCX: 1ffffffffffffffc >> [ 6147.878417][T64921] RDX: 00000000000003ff RSI: 0000000000000200 RDI: ffffffffffffffe0 >> [ 6147.886294][T64921] RBP: 0000000000000000 R08: fffff94002b1c001 R09: fffff94002b1c001 >> [ 6147.894172][T64921] R10: ffffea00158e0007 R11: fffff94002b1c000 R12: 0000000000000009 >> [ 6147.902049][T64921] R13: ffffea00158e0008 R14: ffffea00158e0000 R15: ffffea00158e0000 >> [ 6147.909932][T64921] FS: 00007f329cf18b80(0000) GS:ffff88881f7c0000(0000) knlGS:0000000000000000 >> [ 6147.918771][T64921] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 >> [ 6147.925251][T64921] CR2: ffffffffffffffe0 CR3: 000000081122a000 CR4: 00000000003506e0 >> [ 6147.933130][T64921] Kernel panic - not syncing: Fatal exception >> [ 6147.939493][T64921] Kernel Offset: 0x28c00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) >> [ 6147.951090][T64921] ---[ end Kernel panic - not syncing: Fatal exception ]--- > > I hit this too, trying LTP (yes, move_pages12) on 5.9-rc8. I could not > then reproduce it, and suppose that Qian Cai was also unable to do so. > > But it's fairly easy to explain, and not at all related to Joonsoo's > patch or patchset accused in this thread. 
> > The faulting address ffffffffffffffe0 is not an -ESPIPE here, > it comes from a mov -0x20(%rcx),%rcx with NULL %rcx in my case: I've > been too impatient to unravel the interval tree defining to work out > exactly what that corresponds to, but I assume from an > anon_vma_interval_tree rb_entry container_of. > > Mike, j'accuse... your 5.7 commit c0d0381ade79 ("hugetlbfs: > use i_mmap_rwsem for more pmd sharing synchronization"), in which > unmap_and_move_huge_page() now passes the TTU_RMAP_LOCKED flag to > try_to_unmap(), because it's already holding mapping->i_mmap_rwsem: > but that is not the right lock to secure an anon_vma lookup. Thanks Hugh! Your analysis is correct and the code in that commit is not correct. I was so focused on the file mapping case, I overlooked (actually introduced) this issue for anon mappings. Let me verify that this indeed is the root cause. However, since move_pages12 migrated anon hugetlb pages it certainly does look to be the case. > I intended to send a patch, passing TTU_RMAP_LOCKED only in the > !PageAnon case (and, see vma_adjust(), anon_vma lock conveniently > nests inside i_mmap_rwsem); but then wondered if i_mmap_rwsem was > needed in that case or not, so looked deeper into c0d0381ade79. > > Hmm, not even you liked it! But the worst of it looks simply > unnecessary to me, and I hope can be deleted - but better by you > than by me (in particular, you were trying to kill 1) and 2) birds > with one stone, and I've always given up on understanding hugetlb's > reservations: I suspect that side of it is irrelevant here, > but I wouldn't pretend to be sure). > > How could you ever find a PageAnon page in a vma_shareable() area? > > It is all rather confusing (vma_shareable() depending on VM_MAYSHARE, > whereas is_cow_mapping() on VM_SHARED and VM_MAYWRITE: they have to > be studied together with do_mmap()'s > vm_flags |= VM_SHARED | VM_MAYSHARE; > if (!(file->f_mode & FMODE_WRITE)) > vm_flags &= ~(VM_MAYWRITE | VM_SHARED); > > (And let me add to the confusion by admitting that, prior to 3.15's > cda540ace6a1 "mm: get_user_pages(write,force) refuse to COW in > shared areas", maybe it was possible to find a PageAnon there.) > > But my belief (best confirmed by you running your tests with a > suitably placed BUG_ON or WARN_ON) is that you'll never find a > PageAnon in a vma_shareable() area, so will never need try_to_unmap() > to unshare a pagetable in the PageAnon case, so won't need i_mmap_rwsem > for PageAnon there, and _get_hugetlb_page_mapping() (your function that > deduces an address_space from an anon_vma) can just be deleted. Yes, it is confusing. Let me look into this. I would be really happy to delete that ugly function. > (And in passing, may I ask what hugetlb_page_mapping_lock_write()'s > hpage->_mapcount inc and dec are for? You comment it as a hack, > but don't explain what needs that hack, and I don't see it.) We are trying to lock the mapping (mapping->i_mmap_rwsem). We know mapping is valid, because we obtained it from page_mapping() and it will remain valid because we have the page locked. Page needs to be unlocked to unmap. However, we have to drop page lock in order to acquire i_mmap_rwsem. Once we drop page lock, mapping could become invalid. So, the code code artifically incs mapcount so that mapping will remain valid when upmapping page. As mentioned above, I hope all this can be removed.
On Thu, 8 Oct 2020, Mike Kravetz wrote: > On 10/7/20 8:21 PM, Hugh Dickins wrote: > > > > Mike, j'accuse... your 5.7 commit c0d0381ade79 ("hugetlbfs: > > use i_mmap_rwsem for more pmd sharing synchronization"), in which > > unmap_and_move_huge_page() now passes the TTU_RMAP_LOCKED flag to > > try_to_unmap(), because it's already holding mapping->i_mmap_rwsem: > > but that is not the right lock to secure an anon_vma lookup. > > Thanks Hugh! Your analysis is correct and the code in that commit is > not correct. I was so focused on the file mapping case, I overlooked > (actually introduced) this issue for anon mappings. > > Let me verify that this indeed is the root cause. However, since > move_pages12 migrated anon hugetlb pages it certainly does look to be > the case. > > > I intended to send a patch, passing TTU_RMAP_LOCKED only in the > > !PageAnon case (and, see vma_adjust(), anon_vma lock conveniently > > nests inside i_mmap_rwsem); but then wondered if i_mmap_rwsem was > > needed in that case or not, so looked deeper into c0d0381ade79. > > > > Hmm, not even you liked it! But the worst of it looks simply > > unnecessary to me, and I hope can be deleted - but better by you > > than by me (in particular, you were trying to kill 1) and 2) birds > > with one stone, and I've always given up on understanding hugetlb's > > reservations: I suspect that side of it is irrelevant here, > > but I wouldn't pretend to be sure). > > > > How could you ever find a PageAnon page in a vma_shareable() area? > > > > It is all rather confusing (vma_shareable() depending on VM_MAYSHARE, > > whereas is_cow_mapping() on VM_SHARED and VM_MAYWRITE: they have to > > be studied together with do_mmap()'s > > vm_flags |= VM_SHARED | VM_MAYSHARE; > > if (!(file->f_mode & FMODE_WRITE)) > > vm_flags &= ~(VM_MAYWRITE | VM_SHARED); > > > > (And let me add to the confusion by admitting that, prior to 3.15's > > cda540ace6a1 "mm: get_user_pages(write,force) refuse to COW in > > shared areas", maybe it was possible to find a PageAnon there.) > > > > But my belief (best confirmed by you running your tests with a > > suitably placed BUG_ON or WARN_ON) is that you'll never find a > > PageAnon in a vma_shareable() area, so will never need try_to_unmap() > > to unshare a pagetable in the PageAnon case, so won't need i_mmap_rwsem > > for PageAnon there, and _get_hugetlb_page_mapping() (your function that > > deduces an address_space from an anon_vma) can just be deleted. > > Yes, it is confusing. Let me look into this. I would be really happy > to delete that ugly function. > > > (And in passing, may I ask what hugetlb_page_mapping_lock_write()'s > > hpage->_mapcount inc and dec are for? You comment it as a hack, > > but don't explain what needs that hack, and I don't see it.) > > We are trying to lock the mapping (mapping->i_mmap_rwsem). We know > mapping is valid, because we obtained it from page_mapping() and it > will remain valid because we have the page locked. Page needs to be > unlocked to unmap. However, we have to drop page lock in order to > acquire i_mmap_rwsem. Once we drop page lock, mapping could become > invalid. So, the code code artifically incs mapcount so that mapping > will remain valid when upmapping page. No, unless you can point me to some other hugetlbfs-does-it-differently (and I didn't see it there in that commit), raising _mapcount does not provide any such protection; but does add the possiblility of a "BUG: Bad page cache" and leak from unaccount_page_cache_page(). 
Earlier in the day I was trying to work out what to recommend instead, but had to turn aside to something else: I'll try again tomorrow.

It's a problem I've faced before in tmpfs, keeping a hold on the mapping while page lock is dropped. Quite awkward: igrab() looks as if it's the right thing to use, but turns out to give no protection against umount. Last time around, I ended up with a stop_eviction count in the shmem inode, which shmem_evict_inode() waits on if necessary. Something like that could be done for hugetlbfs too, but I'd prefer to do it without adding extra, if there is a way.

>
> As mentioned above, I hope all this can be removed.

If you continue to nest page lock inside i_mmap_rwsem for hugetlbfs, then I think that part of hugetlb_page_mapping_lock_write() has to remain. I'd much prefer that hugetlbfs did not reverse the usual nesting, but accept that you had reasons for doing it that way.

Hugh
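[Aside, not part of the thread: the shmem mechanism referred to above is a stop_eviction count in the shmem inode that shmem_evict_inode() waits on. A hedged sketch of how the same pattern might look if carried over to hugetlbfs follows; the stop_eviction field in hugetlbfs_inode_info and the helper names are invented for illustration and do not come from any posted patch.]

	/* Assumes an atomic_t stop_eviction added to struct hugetlbfs_inode_info. */

	static void hugetlb_block_eviction(struct inode *inode)
	{
		/*
		 * Taken when the page lock is about to be dropped but the mapping
		 * must stay valid: inode eviction will wait for the count to drop.
		 */
		atomic_inc(&HUGETLBFS_I(inode)->stop_eviction);
	}

	static void hugetlb_unblock_eviction(struct inode *inode)
	{
		if (atomic_dec_and_test(&HUGETLBFS_I(inode)->stop_eviction))
			wake_up_var(&HUGETLBFS_I(inode)->stop_eviction);
	}

	static void hugetlbfs_evict_inode(struct inode *inode)
	{
		/* Wait until nobody is still relying on this mapping. */
		wait_var_event(&HUGETLBFS_I(inode)->stop_eviction,
			       !atomic_read(&HUGETLBFS_I(inode)->stop_eviction));
		/* ... existing truncation and reservation teardown ... */
	}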
On 10/8/20 10:50 PM, Hugh Dickins wrote: > On Thu, 8 Oct 2020, Mike Kravetz wrote: >> On 10/7/20 8:21 PM, Hugh Dickins wrote: >>> >>> Mike, j'accuse... your 5.7 commit c0d0381ade79 ("hugetlbfs: >>> use i_mmap_rwsem for more pmd sharing synchronization"), in which >>> unmap_and_move_huge_page() now passes the TTU_RMAP_LOCKED flag to >>> try_to_unmap(), because it's already holding mapping->i_mmap_rwsem: >>> but that is not the right lock to secure an anon_vma lookup. >> >> Thanks Hugh! Your analysis is correct and the code in that commit is >> not correct. I was so focused on the file mapping case, I overlooked >> (actually introduced) this issue for anon mappings. >> >> Let me verify that this indeed is the root cause. However, since >> move_pages12 migrated anon hugetlb pages it certainly does look to be >> the case. >> >>> I intended to send a patch, passing TTU_RMAP_LOCKED only in the >>> !PageAnon case (and, see vma_adjust(), anon_vma lock conveniently >>> nests inside i_mmap_rwsem); but then wondered if i_mmap_rwsem was >>> needed in that case or not, so looked deeper into c0d0381ade79. >>> >>> Hmm, not even you liked it! But the worst of it looks simply >>> unnecessary to me, and I hope can be deleted - but better by you >>> than by me (in particular, you were trying to kill 1) and 2) birds >>> with one stone, and I've always given up on understanding hugetlb's >>> reservations: I suspect that side of it is irrelevant here, >>> but I wouldn't pretend to be sure). >>> >>> How could you ever find a PageAnon page in a vma_shareable() area? >>> >>> It is all rather confusing (vma_shareable() depending on VM_MAYSHARE, >>> whereas is_cow_mapping() on VM_SHARED and VM_MAYWRITE: they have to >>> be studied together with do_mmap()'s >>> vm_flags |= VM_SHARED | VM_MAYSHARE; >>> if (!(file->f_mode & FMODE_WRITE)) >>> vm_flags &= ~(VM_MAYWRITE | VM_SHARED); >>> >>> (And let me add to the confusion by admitting that, prior to 3.15's >>> cda540ace6a1 "mm: get_user_pages(write,force) refuse to COW in >>> shared areas", maybe it was possible to find a PageAnon there.) >>> >>> But my belief (best confirmed by you running your tests with a >>> suitably placed BUG_ON or WARN_ON) is that you'll never find a >>> PageAnon in a vma_shareable() area, so will never need try_to_unmap() >>> to unshare a pagetable in the PageAnon case, so won't need i_mmap_rwsem >>> for PageAnon there, and _get_hugetlb_page_mapping() (your function that >>> deduces an address_space from an anon_vma) can just be deleted. >> >> Yes, it is confusing. Let me look into this. I would be really happy >> to delete that ugly function. >> >>> (And in passing, may I ask what hugetlb_page_mapping_lock_write()'s >>> hpage->_mapcount inc and dec are for? You comment it as a hack, >>> but don't explain what needs that hack, and I don't see it.) >> >> We are trying to lock the mapping (mapping->i_mmap_rwsem). We know >> mapping is valid, because we obtained it from page_mapping() and it >> will remain valid because we have the page locked. Page needs to be >> unlocked to unmap. However, we have to drop page lock in order to >> acquire i_mmap_rwsem. Once we drop page lock, mapping could become >> invalid. So, the code code artifically incs mapcount so that mapping >> will remain valid when upmapping page. 
>
> No, unless you can point me to some other hugetlbfs-does-it-differently (and I didn't see it there in that commit), raising _mapcount does not provide any such protection; but does add the possiblility of a "BUG: Bad page cache" and leak from unaccount_page_cache_page().
>
> Earlier in the day I was trying to work out what to recommend instead, but had to turn aside to something else: I'll try again tomorrow.
>
> It's a problem I've faced before in tmpfs, keeping a hold on the mapping while page lock is dropped. Quite awkward: igrab() looks as if it's the right thing to use, but turns out to give no protection against umount. Last time around, I ended up with a stop_eviction count in the shmem inode, which shmem_evict_inode() waits on if necessary. Something like that could be done for hugetlbfs too, but I'd prefer to do it without adding extra, if there is a way.

Thanks.

>> As mentioned above, I hope all this can be removed.
>
> If you continue to nest page lock inside i_mmap_rwsem for hugetlbfs, then I think that part of hugetlb_page_mapping_lock_write() has to remain. I'd much prefer that hugetlbfs did not reverse the usual nesting, but accept that you had reasons for doing it that way.

Yes, that is necessary with the change to lock order.

Yesterday I found another issue/case to handle in the hugetlb COW code caused by the difference in lock nesting. As a result, I am rethinking the decision to change the locking order.

The primary reason for changing the lock order was to make the hugetlb page fault code not have to worry about pte pointers changing. The issue is that the pte pointer you get (or create) while walking the table without the page table lock could go away due to unsharing the PMD. We can walk the table again after acquiring the lock and validate and possibly retry. However, the backout code in this area, which previously had to deal with truncation as well, was quite fragile and did not work in all corner cases. This was mostly due to lovely huge page reservations. I thought that adding more complexity to the backout code was going to cause more issues. Changing the locking order eliminated the pte pointer race as well as truncation. However, it created new locking issues. :(

In parallel to working the locking issue, I am also exploring enhanced backout code to handle all the needed cases.
On Fri, 9 Oct 2020, Mike Kravetz wrote: > On 10/8/20 10:50 PM, Hugh Dickins wrote: > > > > It's a problem I've faced before in tmpfs, keeping a hold on the > > mapping while page lock is dropped. Quite awkward: igrab() looks as > > if it's the right thing to use, but turns out to give no protection > > against umount. Last time around, I ended up with a stop_eviction > > count in the shmem inode, which shmem_evict_inode() waits on if > > necessary. Something like that could be done for hugetlbfs too, > > but I'd prefer to do it without adding extra, if there is a way. > > Thanks. I failed to come up with anything neater than a stop_eviction count in the hugetlbfs inode. But that's not unlike a very special purpose rwsem added into the hugetlbfs inode: and now that you're reconsidering how i_mmap_rwsem got repurposed, perhaps you will end up with an rwsem of your own in the hugetlbfs inode. So I won't distract you with a stop_eviction patch unless you ask: that's easy, what you're looking into is hard - good luck! Hugh > >> > >> As mentioned above, I hope all this can be removed. > > > > If you continue to nest page lock inside i_mmap_rwsem for hugetlbfs, > > then I think that part of hugetlb_page_mapping_lock_write() has to > > remain. I'd much prefer that hugetlbfs did not reverse the usual > > nesting, but accept that you had reasons for doing it that way. > > Yes, that is necessary with the change to lock order. > > Yesterday I found another issue/case to handle in the hugetlb COW code > caused by the difference in lock nesting. As a result, I am rethinking > the decision to change the locking order. > > The primary reason for changing the lock order was to make the hugetlb > page fault code not have to worry about pte pointers changing. The issue > is that the pte pointer you get (or create) while walking the table > without the page table lock could go away due to unsharing the PMD. We > can walk the table again after acquiring the lock and validate and possibly > retry. However, the backout code in this area which previously had to > deal with truncation as well, was quite fragile and did not work in all > corner cases. This was mostly due to lovely huge page reservations. > I thought that adding more complexity to the backout code was going to > cause more issues. Changing the locking order eliminated the pte pointer > race as well as truncation. However, it created new locking issues. :( > > In parallel to working the locking issue, I am also exploring enhanced > backout code to handle all the needed cases. > -- > Mike Kravetz
On 10/9/20 3:23 PM, Hugh Dickins wrote:
> On Fri, 9 Oct 2020, Mike Kravetz wrote:
>> On 10/8/20 10:50 PM, Hugh Dickins wrote:
>>>
>>> It's a problem I've faced before in tmpfs, keeping a hold on the mapping while page lock is dropped. Quite awkward: igrab() looks as if it's the right thing to use, but turns out to give no protection against umount. Last time around, I ended up with a stop_eviction count in the shmem inode, which shmem_evict_inode() waits on if necessary. Something like that could be done for hugetlbfs too, but I'd prefer to do it without adding extra, if there is a way.
>>
>> Thanks.
>
> I failed to come up with anything neater than a stop_eviction count in the hugetlbfs inode.
>
> But that's not unlike a very special purpose rwsem added into the hugetlbfs inode: and now that you're reconsidering how i_mmap_rwsem got repurposed, perhaps you will end up with an rwsem of your own in the hugetlbfs inode.

We have this in the Oracle UEK kernel: https://github.com/oracle/linux-uek/commit/89260f55df008bb01c841714fb2ad26c300c8c88

Usage has been expanded to cover more cases. When I proposed the same upstream, the suggestion was to try and use i_mmap_rwsem. That is what I have been trying to do. Certainly, a hugetlbfs-specific rwsem is easier to manage and less disruptive.

> So I won't distract you with a stop_eviction patch unless you ask: that's easy, what you're looking into is hard - good luck!

Thanks Hugh!

>> In parallel to working the locking issue, I am also exploring enhanced backout code to handle all the needed cases.

I'm now confident that we need this or some other locking in place. Why?

I went back to a code base before the locking changes. My idea was to check for races and back out changes as necessary. Page fault handling code will do something like this:

	ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
	... do some stuff, possibly allocate a page ...
	ptl = huge_pte_lock(h, mm, ptep);

Because of pmd sharing, we can not be sure the ptep is valid until after holding the ptl. So, I was going to add something like this after obtaining the ptl:

	while (ptep != huge_pte_offset(mm, haddr, huge_page_size(h))) {
		spin_unlock(ptl);
		ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
		if (!ptep) {
			ret = VM_FAULT_OOM;
			goto backout_unlocked;
		}
		ptl = huge_pte_lock(h, mm, ptep);
	}

Unfortunately, the problem is worse than I thought. I knew the PMD pointed to by the ptep could be unshared before attempting to acquire the ptl. However, it is possible that the page which was the PMD may be repurposed before we even try to acquire the ptl. Since the ptl is a spinlock within the struct page of what was the PMD, it may no longer be valid. I actually experienced addressing exceptions in the spinlock code within huge_pte_lock. :(

So, it seems that we need some way to prevent PMD unsharing between the huge_pte_alloc() and huge_pte_lock(). It is going to have to be i_mmap_rwsem or some other rwsem.
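[Aside, not part of the thread: a hedged sketch of the shape of fix described above - hold a lock that excludes PMD unsharing across the whole walk, so that both the ptep and the page-table page containing its ptl stay valid. Whether that lock is i_mmap_rwsem or a new hugetlbfs rwsem is exactly what is being debated; the function name below is invented and the snippet only illustrates the idea, it is not from any posted patch.]

	/*
	 * Illustrative only: a fault path where a lock that huge_pmd_unshare()
	 * must also take is held across both the table walk and the ptl
	 * acquisition, so no revalidation loop is needed.
	 */
	static vm_fault_t hugetlb_fault_sketch(struct mm_struct *mm,
					       struct vm_area_struct *vma,
					       unsigned long haddr)
	{
		struct hstate *h = hstate_vma(vma);
		struct address_space *mapping = vma->vm_file->f_mapping;
		vm_fault_t ret = 0;
		spinlock_t *ptl;
		pte_t *ptep;

		i_mmap_lock_read(mapping);	/* or a dedicated per-inode rwsem */

		ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
		if (!ptep) {
			ret = VM_FAULT_OOM;
			goto out_mapping;
		}

		/* PMD unsharing is excluded, so this ptl is safe to take. */
		ptl = huge_pte_lock(h, mm, ptep);

		/* ... actual fault handling against *ptep goes here ... */

		spin_unlock(ptl);
	out_mapping:
		i_mmap_unlock_read(mapping);
		return ret;
	}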
diff --git a/mm/internal.h b/mm/internal.h
index fb7f7fe..4f9f6b6 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -613,7 +613,6 @@ static inline bool is_migrate_highatomic_page(struct page *page)
 }
 
 void setup_zone_pageset(struct zone *zone);
-extern struct page *alloc_new_node_page(struct page *page, unsigned long node);
 
 struct migration_target_control {
 	int nid;		/* preferred node id */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a3abf64..85a3f21 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1065,28 +1065,6 @@ static int migrate_page_add(struct page *page, struct list_head *pagelist,
 	return 0;
 }
 
-/* page allocation callback for NUMA node migration */
-struct page *alloc_new_node_page(struct page *page, unsigned long node)
-{
-	if (PageHuge(page)) {
-		return alloc_huge_page_nodemask(
-			page_hstate(compound_head(page)), node,
-			NULL, __GFP_THISNODE, false);
-	} else if (PageTransHuge(page)) {
-		struct page *thp;
-
-		thp = alloc_pages_node(node,
-			(GFP_TRANSHUGE | __GFP_THISNODE),
-			HPAGE_PMD_ORDER);
-		if (!thp)
-			return NULL;
-		prep_transhuge_page(thp);
-		return thp;
-	} else
-		return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
-						__GFP_THISNODE, 0);
-}
-
 /*
  * Migrate pages from one node to a target node.
  * Returns error or the number of pages not migrated.
@@ -1097,6 +1075,10 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 	nodemask_t nmask;
 	LIST_HEAD(pagelist);
 	int err = 0;
+	struct migration_target_control mtc = {
+		.nid = dest,
+		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
+	};
 
 	nodes_clear(nmask);
 	node_set(source, nmask);
@@ -1111,8 +1093,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 
 	if (!list_empty(&pagelist)) {
-		err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
-					MIGRATE_SYNC, MR_SYSCALL);
+		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
+				(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
 		if (err)
 			putback_movable_pages(&pagelist);
 	}
diff --git a/mm/migrate.c b/mm/migrate.c
index 7c4cd74..1c943b0 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1590,9 +1590,13 @@ static int do_move_pages_to_node(struct mm_struct *mm,
 		struct list_head *pagelist, int node)
 {
 	int err;
+	struct migration_target_control mtc = {
+		.nid = node,
+		.gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
+	};
 
-	err = migrate_pages(pagelist, alloc_new_node_page, NULL, node,
-			MIGRATE_SYNC, MR_SYSCALL);
+	err = migrate_pages(pagelist, alloc_migration_target, NULL,
+			(unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
 	if (err)
 		putback_movable_pages(pagelist);
 	return err;