Message ID | 1525403506-6750-1-git-send-email-hejianet@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
[+Hugh Dickins] Cheers, Jia On 5/4/2018 11:11 AM, Jia He Wrote: > In our armv8a server(QDF2400), I noticed lots of WARN_ON caused by PAGE_SIZE > unaligned for rmap_item->address under memory pressure tests(start 20 guests > and run memhog in the host). > > --------------------------begin-------------------------------------- > [ 410.853828] WARNING: CPU: 4 PID: 4641 at > arch/arm64/kvm/../../../virt/kvm/arm/mmu.c:1826 > kvm_age_hva_handler+0xc0/0xc8 > [ 410.864518] Modules linked in: vhost_net vhost tap xt_CHECKSUM > ipt_MASQUERADE nf_nat_masquerade_ipv4 ip6t_rpfilter ipt_REJECT > nf_reject_ipv4 ip6t_REJECT nf_reject_ipv6 xt_conntrack ip_set nfnetlink > ebtable_nat ebtable_broute bridge stp llc ip6table_nat nf_conntrack_ipv6 > nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security > ip6table_raw iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 > nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw > ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter > rpcrdma ib_isert iscsi_target_mod ib_iser libiscsi scsi_transport_iscsi > ib_srpt target_core_mod ib_srp scsi_transport_srp ib_ipoib rdma_ucm > ib_ucm ib_umad rdma_cm ib_cm iw_cm mlx5_ib vfat fat ib_uverbs dm_mirror > dm_region_hash ib_core dm_log dm_mod crc32_ce ipmi_ssif sg nfsd > [ 410.935101] auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs > libcrc32c mlx5_core ixgbe mlxfw devlink mdio ahci_platform > libahci_platform qcom_emac libahci hdma hdma_mgmt i2c_qup > [ 410.951369] CPU: 4 PID: 4641 Comm: memhog Tainted: G W > 4.17.0-rc3+ #8 > [ 410.959104] Hardware name: <snip for confidential issues> > [ 410.969791] pstate: 80400005 (Nzcv daif +PAN -UAO) > [ 410.974575] pc : kvm_age_hva_handler+0xc0/0xc8 > [ 410.979012] lr : handle_hva_to_gpa+0xa8/0xe0 > [ 410.983274] sp : ffff801761553290 > [ 410.986581] x29: ffff801761553290 x28: 0000000000000000 > [ 410.991888] x27: 0000000000000002 x26: 0000000000000000 > [ 410.997195] x25: ffff801765430058 x24: ffff0000080b5608 > [ 
411.002501] x23: 0000000000000000 x22: ffff8017ccb84000 > [ 411.007807] x21: 0000000003ff0000 x20: ffff8017ccb84000 > [ 411.013113] x19: 000000000000fe00 x18: ffff000008fb3c08 > [ 411.018419] x17: 0000000000000000 x16: 0060001645820bd3 > [ 411.023725] x15: ffff80176aacbc08 x14: 0000000000000000 > [ 411.029031] x13: 0000000000000040 x12: 0000000000000228 > [ 411.034337] x11: 0000000000000000 x10: 0000000000000000 > [ 411.039643] x9 : 0000000000000010 x8 : 0000000000000004 > [ 411.044949] x7 : 0000000000000000 x6 : 00008017f0770000 > [ 411.050255] x5 : 0000fffda59f0200 x4 : 0000000000000000 > [ 411.055561] x3 : 0000000000000000 x2 : 000000000000fe00 > [ 411.060867] x1 : 0000000003ff0000 x0 : 0000000020000000 > [ 411.066173] Call trace: > [ 411.068614] kvm_age_hva_handler+0xc0/0xc8 > [ 411.072703] handle_hva_to_gpa+0xa8/0xe0 > [ 411.076619] kvm_age_hva+0x4c/0xe8 > [ 411.080014] kvm_mmu_notifier_clear_flush_young+0x54/0x98 > [ 411.085408] __mmu_notifier_clear_flush_young+0x6c/0xa0 > [ 411.090627] page_referenced_one+0x154/0x1d8 > [ 411.094890] rmap_walk_ksm+0x12c/0x1d0 > [ 411.098632] rmap_walk+0x94/0xa0 > [ 411.101854] page_referenced+0x194/0x1b0 > [ 411.105770] shrink_page_list+0x674/0xc28 > [ 411.109772] shrink_inactive_list+0x26c/0x5b8 > [ 411.114122] shrink_node_memcg+0x35c/0x620 > [ 411.118211] shrink_node+0x100/0x430 > [ 411.121778] do_try_to_free_pages+0xe0/0x3a8 > [ 411.126041] try_to_free_pages+0xe4/0x230 > [ 411.130045] __alloc_pages_nodemask+0x564/0xdc0 > [ 411.134569] alloc_pages_vma+0x90/0x228 > [ 411.138398] do_anonymous_page+0xc8/0x4d0 > [ 411.142400] __handle_mm_fault+0x4a0/0x508 > [ 411.146489] handle_mm_fault+0xf8/0x1b0 > [ 411.150321] do_page_fault+0x218/0x4b8 > [ 411.154064] do_translation_fault+0x90/0xa0 > [ 411.158239] do_mem_abort+0x68/0xf0 > [ 411.161721] el0_da+0x24/0x28 > ---------------------------end--------------------------------------- > > In rmap_walk_ksm, the rmap_item->address might still have the STABLE_FLAG, > then the start and end 
in handle_hva_to_gpa might not be PAGE_SIZE aligned. > Thus it will cause exceptions in handle_hva_to_gpa on arm64. > > This patch fixes it by ignoring(not removing) the low bits of address when > doing rmap_walk_ksm. > > Signed-off-by: jia.he@hxt-semitech.com > --- > v2: refine the codes as suggested by Claudio Imbrenda > > mm/ksm.c | 14 ++++++++++---- > 1 file changed, 10 insertions(+), 4 deletions(-) > > diff --git a/mm/ksm.c b/mm/ksm.c > index e3cbf9a..e6a9640 100644 > --- a/mm/ksm.c > +++ b/mm/ksm.c > @@ -199,6 +199,8 @@ struct rmap_item { > #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ > #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ > #define STABLE_FLAG 0x200 /* is listed from the stable tree */ > +#define KSM_FLAG_MASK (SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG) > + /* to mask all the flags */ > > /* The stable and unstable tree heads */ > static struct rb_root one_stable_tree[1] = { RB_ROOT }; > @@ -2570,10 +2572,15 @@ void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) > anon_vma_lock_read(anon_vma); > anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, > 0, ULONG_MAX) { > + unsigned long addr; > + > cond_resched(); > vma = vmac->vma; > - if (rmap_item->address < vma->vm_start || > - rmap_item->address >= vma->vm_end) > + > + /* Ignore the stable/unstable/sqnr flags */ > + addr = rmap_item->address & ~KSM_FLAG_MASK; > + > + if (addr < vma->vm_start || addr >= vma->vm_end) > continue; > /* > * Initially we examine only the vma which covers this > @@ -2587,8 +2594,7 @@ void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) > if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) > continue; > > - if (!rwc->rmap_one(page, vma, > - rmap_item->address, rwc->arg)) { > + if (!rwc->rmap_one(page, vma, addr, rwc->arg)) { > anon_vma_unlock_read(anon_vma); > return; > }
On Fri, 4 May 2018 11:11:46 +0800 Jia He <hejianet@gmail.com> wrote: > In our armv8a server(QDF2400), I noticed lots of WARN_ON caused by PAGE_SIZE > unaligned for rmap_item->address under memory pressure tests(start 20 guests > and run memhog in the host). > > ... > > In rmap_walk_ksm, the rmap_item->address might still have the STABLE_FLAG, > then the start and end in handle_hva_to_gpa might not be PAGE_SIZE aligned. > Thus it will cause exceptions in handle_hva_to_gpa on arm64. > > This patch fixes it by ignoring(not removing) the low bits of address when > doing rmap_walk_ksm. > > Signed-off-by: jia.he@hxt-semitech.com I assumed you wanted this patch to be committed as From:jia.he@hxt-semitech.com rather than From:hejianet@gmail.com, so I made that change. Please let me know if this was inappropriate. You can do this yourself by adding an explicit From: line to the very start of the patch's email text. Also, a storm of WARN_ONs is pretty poor behaviour. Is that the only misbehaviour which this bug causes? Do you think the fix should be backported into earlier kernels?
Hi Andrew On 5/10/2018 7:31 AM, Andrew Morton Wrote: > On Fri, 4 May 2018 11:11:46 +0800 Jia He <hejianet@gmail.com> wrote: > >> In our armv8a server(QDF2400), I noticed lots of WARN_ON caused by PAGE_SIZE >> unaligned for rmap_item->address under memory pressure tests(start 20 guests >> and run memhog in the host). >> >> ... >> >> In rmap_walk_ksm, the rmap_item->address might still have the STABLE_FLAG, >> then the start and end in handle_hva_to_gpa might not be PAGE_SIZE aligned. >> Thus it will cause exceptions in handle_hva_to_gpa on arm64. >> >> This patch fixes it by ignoring(not removing) the low bits of address when >> doing rmap_walk_ksm. >> >> Signed-off-by: jia.he@hxt-semitech.com > I assumed you wanted this patch to be committed as > From:jia.he@hxt-semitech.com rather than From:hejianet@gmail.com, so I > made that change. Please let me know if this was inappropriate. Thanks, because there is still some issues in our company's mail server. I have to use my gmail mailbox. > > You can do this yourself by adding an explicit From: line to the very > start of the patch's email text. > > Also, a storm of WARN_ONs is pretty poor behaviour. Is that the only > misbehaviour which this bug causes? Do you think the fix should be > backported into earlier kernels? IMO, it should be backported to stable tree, seems that I missed CC to stable tree ;-) the storm of WARN_ONs is very easy for me to reproduce. 
More than that, I watched a panic (not reproducible) as follows: [35380.805825] page:ffff7fe003742d80 count:-4871 mapcount:-2126053375 mapping: (null) index:0x0 [35380.815024] flags: 0x1fffc00000000000() [35380.818845] raw: 1fffc00000000000 0000000000000000 0000000000000000 ffffecf981470000 [35380.826569] raw: dead000000000100 dead000000000200 ffff8017c001c000 0000000000000000 [35380.834294] page dumped because: nonzero _refcount [35380.839069] Modules linked in: vhost_net vhost tap ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter fcoe libfcoe libfc 8021q garp mrp stp llc scsi_transport_fc openvswitch nf_conntrack_ipv6 nf_nat_ipv6 nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_defrag_ipv6 nf_nat nf_conntrack vfat fat rpcrdma ib_isert iscsi_target_mod ib_iser libiscsi scsi_transport_iscsi ib_srpt target_core_mod ib_srp scsi_transport_srp ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm mlx5_ib ib_core crc32_ce ipmi_ssif tpm_tis tpm_tis_core sg nfsd auth_rpcgss nfs_acl lockd grace sunrpc dm_multipath ip_tables xfs libcrc32c mlx5_core mlxfw devlink ahci_platform libahci_platform libahci qcom_emac sdhci_acpi sdhci hdma mmc_core hdma_mgmt i2c_qup dm_mirror dm_region_hash dm_log dm_mod [35380.908341] CPU: 29 PID: 18323 Comm: qemu-kvm Tainted: G W 4.14.15-5.hxt.aarch64 #1 [35380.917107] Hardware name: <snip for confidential issues> [35380.930909] Call trace: [35380.933345] [<ffff000008088f00>] dump_backtrace+0x0/0x22c [35380.938723] [<ffff000008089150>] show_stack+0x24/0x2c [35380.943759] [<ffff00000893c078>] dump_stack+0x8c/0xb0 [35380.948794] [<ffff00000820ab50>] bad_page+0xf4/0x154 [35380.953740] [<ffff000008211ce8>] free_pages_check_bad+0x90/0x9c [35380.959642] [<ffff00000820c430>] free_pcppages_bulk+0x464/0x518 [35380.965545] [<ffff00000820db98>] free_hot_cold_page+0x22c/0x300 [35380.971448] [<ffff0000082176fc>] __put_page+0x54/0x60 [35380.976484] [<ffff0000080b1164>] unmap_stage2_range+0x170/0x2b4 [35380.982385] [<ffff0000080b12d8>] 
kvm_unmap_hva_handler+0x30/0x40 [35380.988375] [<ffff0000080b0104>] handle_hva_to_gpa+0xb0/0xec [35380.994016] [<ffff0000080b2644>] kvm_unmap_hva_range+0x5c/0xd0 [35380.999833] [<ffff0000080a8054>] kvm_mmu_notifier_invalidate_range_start+0x60/0xb0 [35381.007387] [<ffff000008271f44>] __mmu_notifier_invalidate_range_start+0x64/0x8c [35381.014765] [<ffff0000082547c8>] try_to_unmap_one+0x78c/0x7a4 [35381.020493] [<ffff000008276d04>] rmap_walk_ksm+0x124/0x1a0 [35381.025961] [<ffff0000082551b4>] rmap_walk+0x94/0x98 [35381.030909] [<ffff0000082555e4>] try_to_unmap+0x100/0x124 [35381.036293] [<ffff00000828243c>] unmap_and_move+0x480/0x6fc [35381.041847] [<ffff000008282b6c>] migrate_pages+0x10c/0x288 [35381.047318] [<ffff00000823c164>] compact_zone+0x238/0x954 [35381.052697] [<ffff00000823c944>] compact_zone_order+0xc4/0xe8 [35381.058427] [<ffff00000823d25c>] try_to_compact_pages+0x160/0x294 [35381.064503] [<ffff00000820f074>] __alloc_pages_direct_compact+0x68/0x194 [35381.071187] [<ffff000008210138>] __alloc_pages_nodemask+0xc20/0xf7c [35381.077437] [<ffff0000082709e4>] alloc_pages_vma+0x1a4/0x1c0 [35381.083080] [<ffff000008285b68>] do_huge_pmd_anonymous_page+0x128/0x324 [35381.089677] [<ffff000008248a24>] __handle_mm_fault+0x71c/0x7e8 [35381.095492] [<ffff000008248be8>] handle_mm_fault+0xf8/0x194 [35381.101049] [<ffff000008240dcc>] __get_user_pages+0x124/0x34c [35381.106777] [<ffff000008241870>] populate_vma_page_range+0x90/0x9c [35381.112941] [<ffff000008241940>] __mm_populate+0xc4/0x15c [35381.118322] [<ffff00000824b294>] SyS_mlockall+0x100/0x164 [35381.123705] Exception stack(0xffff800dce5f3ec0 to 0xffff800dce5f4000) [35381.130128] 3ec0: 0000000000000003 d6e6024cc9b87e00 0000aaaabe94f000 0000000000000000 [35381.137940] 3ee0: 0000000000000002 0000000000000000 0000000000000000 0000aaaacf6fc3c0 [35381.145753] 3f00: 00000000000000e6 0000aaaacf6fc490 0000ffffeeeab0f0 d6e6024cc9b87e00 [35381.153565] 3f20: 0000000000000000 0000aaaabe81b3c0 0000000000000020 00009e53eff806b5 
[35381.161379] 3f40: 0000aaaabe94de48 0000ffffa7c269b0 0000000000000011 0000ffffeeeabf68 [35381.169190] 3f60: 0000aaaaceacfe60 0000aaaabe94f000 0000aaaabe9ba358 0000aaaabe7ffb80 [35381.177003] 3f80: 0000aaaabe9ba000 0000aaaabe959f64 0000000000000000 0000aaaabe94f000 [35381.184815] 3fa0: 0000000000000000 0000ffffeeeabdb0 0000aaaabe5f3bf8 0000ffffeeeabdb0 [35381.192628] 3fc0: 0000ffffa7c269b8 0000000060000000 0000000000000003 00000000000000e6 [35381.200440] 3fe0: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [35381.208254] [<ffff00000808339c>] __sys_trace_return+0x0/0x4 [35381.213809] Disabling lock debugging due to kernel taint I ever injected a fault on purpose in kvm_unmap_hva_range by set size=size-0x200, the call trace is similar as above. Thus, I thought the panic is similarly caused by the root cause of WARN_ON
On 10/05/18 02:26, Jia He wrote: > Hi Andrew > > > On 5/10/2018 7:31 AM, Andrew Morton Wrote: >> On Fri, 4 May 2018 11:11:46 +0800 Jia He <hejianet@gmail.com> wrote: >> >>> In our armv8a server(QDF2400), I noticed lots of WARN_ON caused by PAGE_SIZE >>> unaligned for rmap_item->address under memory pressure tests(start 20 guests >>> and run memhog in the host). >>> >>> ... >>> >>> In rmap_walk_ksm, the rmap_item->address might still have the STABLE_FLAG, >>> then the start and end in handle_hva_to_gpa might not be PAGE_SIZE aligned. >>> Thus it will cause exceptions in handle_hva_to_gpa on arm64. >>> >>> This patch fixes it by ignoring(not removing) the low bits of address when >>> doing rmap_walk_ksm. >>> >>> Signed-off-by: jia.he@hxt-semitech.com >> I assumed you wanted this patch to be committed as >> From:jia.he@hxt-semitech.com rather than From:hejianet@gmail.com, so I >> made that change. Please let me know if this was inappropriate. > Thanks, because there is still some issues in our company's mail server. > I have to use my gmail mailbox. >> >> You can do this yourself by adding an explicit From: line to the very >> start of the patch's email text. >> >> Also, a storm of WARN_ONs is pretty poor behaviour. Is that the only >> misbehaviour which this bug causes? Do you think the fix should be >> backported into earlier kernels? > IMO, it should be backported to stable tree, seems that I missed CC to stable tree ;-) > the storm of WARN_ONs is very easy for me to reproduce. 
> More than that, I watched a panic (not reproducible) as follows: > [35380.805825] page:ffff7fe003742d80 count:-4871 mapcount:-2126053375 mapping: (null) index:0x0 > [35380.815024] flags: 0x1fffc00000000000() > [35380.818845] raw: 1fffc00000000000 0000000000000000 0000000000000000 ffffecf981470000 > [35380.826569] raw: dead000000000100 dead000000000200 ffff8017c001c000 0000000000000000 > [35380.834294] page dumped because: nonzero _refcount > [35380.908341] CPU: 29 PID: 18323 Comm: qemu-kvm Tainted: G W 4.14.15-5.hxt.aarch64 #1 > [35380.917107] Hardware name: <snip for confidential issues> > [35380.930909] Call trace: > [35380.933345] [<ffff000008088f00>] dump_backtrace+0x0/0x22c > [35380.938723] [<ffff000008089150>] show_stack+0x24/0x2c > [35380.943759] [<ffff00000893c078>] dump_stack+0x8c/0xb0 > [35380.948794] [<ffff00000820ab50>] bad_page+0xf4/0x154 > [35380.953740] [<ffff000008211ce8>] free_pages_check_bad+0x90/0x9c > [35380.959642] [<ffff00000820c430>] free_pcppages_bulk+0x464/0x518 > [35380.965545] [<ffff00000820db98>] free_hot_cold_page+0x22c/0x300 > [35380.971448] [<ffff0000082176fc>] __put_page+0x54/0x60 > [35380.976484] [<ffff0000080b1164>] unmap_stage2_range+0x170/0x2b4 > [35380.982385] [<ffff0000080b12d8>] kvm_unmap_hva_handler+0x30/0x40 > [35380.988375] [<ffff0000080b0104>] handle_hva_to_gpa+0xb0/0xec > [35380.994016] [<ffff0000080b2644>] kvm_unmap_hva_range+0x5c/0xd0 > [35380.999833] [<ffff0000080a8054>] kvm_mmu_notifier_invalidate_range_start+0x60/0xb0 > [35381.007387] [<ffff000008271f44>] __mmu_notifier_invalidate_range_start+0x64/0x8c > [35381.014765] [<ffff0000082547c8>] try_to_unmap_one+0x78c/0x7a4 > [35381.020493] [<ffff000008276d04>] rmap_walk_ksm+0x124/0x1a0 > [35381.025961] [<ffff0000082551b4>] rmap_walk+0x94/0x98 > [35381.030909] [<ffff0000082555e4>] try_to_unmap+0x100/0x124 > [35381.036293] [<ffff00000828243c>] unmap_and_move+0x480/0x6fc > [35381.041847] [<ffff000008282b6c>] migrate_pages+0x10c/0x288 > [35381.047318] [<ffff00000823c164>] 
compact_zone+0x238/0x954 > [35381.052697] [<ffff00000823c944>] compact_zone_order+0xc4/0xe8 > [35381.058427] [<ffff00000823d25c>] try_to_compact_pages+0x160/0x294 > [35381.064503] [<ffff00000820f074>] __alloc_pages_direct_compact+0x68/0x194 > [35381.071187] [<ffff000008210138>] __alloc_pages_nodemask+0xc20/0xf7c > [35381.077437] [<ffff0000082709e4>] alloc_pages_vma+0x1a4/0x1c0 > [35381.083080] [<ffff000008285b68>] do_huge_pmd_anonymous_page+0x128/0x324 > [35381.089677] [<ffff000008248a24>] __handle_mm_fault+0x71c/0x7e8 > [35381.095492] [<ffff000008248be8>] handle_mm_fault+0xf8/0x194 > [35381.101049] [<ffff000008240dcc>] __get_user_pages+0x124/0x34c > [35381.106777] [<ffff000008241870>] populate_vma_page_range+0x90/0x9c > [35381.112941] [<ffff000008241940>] __mm_populate+0xc4/0x15c > [35381.118322] [<ffff00000824b294>] SyS_mlockall+0x100/0x164 > [35381.123705] Exception stack(0xffff800dce5f3ec0 to 0xffff800dce5f4000) > [35381.130128] 3ec0: 0000000000000003 d6e6024cc9b87e00 0000aaaabe94f000 0000000000000000 > [35381.137940] 3ee0: 0000000000000002 0000000000000000 0000000000000000 0000aaaacf6fc3c0 > [35381.145753] 3f00: 00000000000000e6 0000aaaacf6fc490 0000ffffeeeab0f0 d6e6024cc9b87e00 > [35381.153565] 3f20: 0000000000000000 0000aaaabe81b3c0 0000000000000020 00009e53eff806b5 > [35381.161379] 3f40: 0000aaaabe94de48 0000ffffa7c269b0 0000000000000011 0000ffffeeeabf68 > [35381.169190] 3f60: 0000aaaaceacfe60 0000aaaabe94f000 0000aaaabe9ba358 0000aaaabe7ffb80 > [35381.177003] 3f80: 0000aaaabe9ba000 0000aaaabe959f64 0000000000000000 0000aaaabe94f000 > [35381.184815] 3fa0: 0000000000000000 0000ffffeeeabdb0 0000aaaabe5f3bf8 0000ffffeeeabdb0 > [35381.192628] 3fc0: 0000ffffa7c269b8 0000000060000000 0000000000000003 00000000000000e6 > [35381.200440] 3fe0: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 > [35381.208254] [<ffff00000808339c>] __sys_trace_return+0x0/0x4 > [35381.213809] Disabling lock debugging due to kernel taint > > I ever injected a fault on 
purpose in kvm_unmap_hva_range by set size=size-0x200, the call trace is similar > as above. Thus, I thought the panic is similarly caused by the root cause of WARN_ON Please could you share your "changes" (that injected the fault) that triggered this Panic and the steps that triggered this ? The only reason we should get there is by trying to put a page that is not owned by the KVM Stage 2 page table either: 1) It was free'd already ? - We had some race conditions there which were fixed. 2) The code tries to access something that doesn't belong there. - If this happens that doesn't look good for a simple change you mentioned. So we would like to know better about the situation to see if there is something we need to address. Suzuki
On 10/05/18 00:31, Andrew Morton wrote: > On Fri, 4 May 2018 11:11:46 +0800 Jia He <hejianet@gmail.com> wrote: > >> In our armv8a server(QDF2400), I noticed lots of WARN_ON caused by PAGE_SIZE >> unaligned for rmap_item->address under memory pressure tests(start 20 guests >> and run memhog in the host). >> >> ... >> >> In rmap_walk_ksm, the rmap_item->address might still have the STABLE_FLAG, >> then the start and end in handle_hva_to_gpa might not be PAGE_SIZE aligned. >> Thus it will cause exceptions in handle_hva_to_gpa on arm64. >> >> This patch fixes it by ignoring(not removing) the low bits of address when >> doing rmap_walk_ksm. >> >> Signed-off-by: jia.he@hxt-semitech.com > > I assumed you wanted this patch to be committed as > From:jia.he@hxt-semitech.com rather than From:hejianet@gmail.com, so I > made that change. Please let me know if this was inappropriate. > > You can do this yourself by adding an explicit From: line to the very > start of the patch's email text. > > Also, a storm of WARN_ONs is pretty poor behaviour. Is that the only > misbehaviour which this bug causes? Do you think the fix should be > backported into earlier kernels? > I think its just not the WARN_ON(). We do more than what is probably intended with an unaligned address. i.e, We could be modifying the flags for other pages that were not affected. 
e.g : In the original report [0], the trace looked like : [ 800.511498] [<ffff0000080b4f2c>] kvm_age_hva_handler+0xcc/0xd4 [ 800.517324] [<ffff0000080b4838>] handle_hva_to_gpa+0xec/0x15c [ 800.523063] [<ffff0000080b6c5c>] kvm_age_hva+0x5c/0xcc [ 800.528194] [<ffff0000080a7c3c>] kvm_mmu_notifier_clear_flush_young+0x54/0x90 [ 800.535324] [<ffff00000827a0e8>] __mmu_notifier_clear_flush_young+0x6c/0xa8 [ 800.542279] [<ffff00000825a644>] page_referenced_one+0x1e0/0x1fc [ 800.548279] [<ffff00000827e8f8>] rmap_walk_ksm+0x124/0x1a0 [ 800.553759] [<ffff00000825c974>] rmap_walk+0x94/0x98 [ 800.558717] [<ffff00000825ca98>] page_referenced+0x120/0x180 [ 800.564369] [<ffff000008228c58>] shrink_active_list+0x218/0x4a4 [ 800.570281] [<ffff000008229470>] shrink_node_memcg+0x58c/0x6fc [ 800.576107] [<ffff0000082296c4>] shrink_node+0xe4/0x328 [ 800.581325] [<ffff000008229c9c>] do_try_to_free_pages+0xe4/0x3b8 [ 800.587324] [<ffff00000822a094>] try_to_free_pages+0x124/0x234 [ 800.593150] [<ffff000008216aa0>] __alloc_pages_nodemask+0x564/0xf7c [ 800.599412] [<ffff000008292814>] khugepaged_alloc_page+0x38/0xb8 [ 800.605411] [<ffff0000082933bc>] collapse_huge_page+0x74/0xd70 [ 800.611238] [<ffff00000829470c>] khugepaged_scan_mm_slot+0x654/0xa98 [ 800.617585] [<ffff000008294e0c>] khugepaged+0x2bc/0x49c [ 800.622803] [<ffff0000080ffb70>] kthread+0x124/0x150 [ 800.627762] [<ffff0000080849f0>] ret_from_fork+0x10/0x1c [ 800.633066] ---[ end trace 944c130b5252fb01 ]--- Now, the ksm wants to mark *a page* as referenced via page_referenced_one(), passing it an unaligned address. This could eventually turn out to be one of : ptep_clear_flush_young_notify(address, address + PAGE_SIZE) or pmdp_clear_flush_young_notify(address, address + PMD_SIZE) which now spans two pages/pmds and the notifier consumer might take an action on the second page as well, which is not something intended. So, I do think that old behavior is wrong and has other side effects as mentioned above. 
[0] https://lkml.kernel.org/r/1525244911-5519-1-git-send-email-hejianet@gmail.com Suzuki
On 14/05/18 10:45, Suzuki K Poulose wrote: > On 10/05/18 00:31, Andrew Morton wrote: >> On Fri, 4 May 2018 11:11:46 +0800 Jia He <hejianet@gmail.com> wrote: >> >>> In our armv8a server(QDF2400), I noticed lots of WARN_ON caused by PAGE_SIZE >>> unaligned for rmap_item->address under memory pressure tests(start 20 guests >>> and run memhog in the host). >>> >>> ... >>> >>> In rmap_walk_ksm, the rmap_item->address might still have the STABLE_FLAG, >>> then the start and end in handle_hva_to_gpa might not be PAGE_SIZE aligned. >>> Thus it will cause exceptions in handle_hva_to_gpa on arm64. >>> >>> This patch fixes it by ignoring(not removing) the low bits of address when >>> doing rmap_walk_ksm. >>> >>> Signed-off-by: jia.he@hxt-semitech.com >> >> I assumed you wanted this patch to be committed as >> From:jia.he@hxt-semitech.com rather than From:hejianet@gmail.com, so I >> made that change. Please let me know if this was inappropriate. >> >> You can do this yourself by adding an explicit From: line to the very >> start of the patch's email text. >> >> Also, a storm of WARN_ONs is pretty poor behaviour. Is that the only >> misbehaviour which this bug causes? Do you think the fix should be >> backported into earlier kernels? >> Jia, Andrew, What is the status of this patch ? Suzuki > > I think its just not the WARN_ON(). We do more than what is probably > intended with an unaligned address. i.e, We could be modifying the > flags for other pages that were not affected. 
> > e.g : > > In the original report [0], the trace looked like : > > > [ 800.511498] [<ffff0000080b4f2c>] kvm_age_hva_handler+0xcc/0xd4 > [ 800.517324] [<ffff0000080b4838>] handle_hva_to_gpa+0xec/0x15c > [ 800.523063] [<ffff0000080b6c5c>] kvm_age_hva+0x5c/0xcc > [ 800.528194] [<ffff0000080a7c3c>] kvm_mmu_notifier_clear_flush_young+0x54/0x90 > [ 800.535324] [<ffff00000827a0e8>] __mmu_notifier_clear_flush_young+0x6c/0xa8 > [ 800.542279] [<ffff00000825a644>] page_referenced_one+0x1e0/0x1fc > [ 800.548279] [<ffff00000827e8f8>] rmap_walk_ksm+0x124/0x1a0 > [ 800.553759] [<ffff00000825c974>] rmap_walk+0x94/0x98 > [ 800.558717] [<ffff00000825ca98>] page_referenced+0x120/0x180 > [ 800.564369] [<ffff000008228c58>] shrink_active_list+0x218/0x4a4 > [ 800.570281] [<ffff000008229470>] shrink_node_memcg+0x58c/0x6fc > [ 800.576107] [<ffff0000082296c4>] shrink_node+0xe4/0x328 > [ 800.581325] [<ffff000008229c9c>] do_try_to_free_pages+0xe4/0x3b8 > [ 800.587324] [<ffff00000822a094>] try_to_free_pages+0x124/0x234 > [ 800.593150] [<ffff000008216aa0>] __alloc_pages_nodemask+0x564/0xf7c > [ 800.599412] [<ffff000008292814>] khugepaged_alloc_page+0x38/0xb8 > [ 800.605411] [<ffff0000082933bc>] collapse_huge_page+0x74/0xd70 > [ 800.611238] [<ffff00000829470c>] khugepaged_scan_mm_slot+0x654/0xa98 > [ 800.617585] [<ffff000008294e0c>] khugepaged+0x2bc/0x49c > [ 800.622803] [<ffff0000080ffb70>] kthread+0x124/0x150 > [ 800.627762] [<ffff0000080849f0>] ret_from_fork+0x10/0x1c > [ 800.633066] ---[ end trace 944c130b5252fb01 ]--- > > Now, the ksm wants to mark *a page* as referenced via page_referenced_one(), > passing it an unaligned address. This could eventually turn out to be > one of : > > ptep_clear_flush_young_notify(address, address + PAGE_SIZE) > > or > > pmdp_clear_flush_young_notify(address, address + PMD_SIZE) > > which now spans two pages/pmds and the notifier consumer might > take an action on the second page as well, which is not something > intended. 
So, I do think that old behavior is wrong and has other > side effects as mentioned above. > > [0] https://lkml.kernel.org/r/1525244911-5519-1-git-send-email-hejianet@gmail.com > > Suzuki
Hi Suzuki On 5/24/2018 4:44 PM, Suzuki K Poulose Wrote: > On 14/05/18 10:45, Suzuki K Poulose wrote: >> On 10/05/18 00:31, Andrew Morton wrote: >>> On Fri, 4 May 2018 11:11:46 +0800 Jia He <hejianet@gmail.com> wrote: >>> >>>> In our armv8a server(QDF2400), I noticed lots of WARN_ON caused by PAGE_SIZE >>>> unaligned for rmap_item->address under memory pressure tests(start 20 guests >>>> and run memhog in the host). >>>> >>>> ... >>>> >>>> In rmap_walk_ksm, the rmap_item->address might still have the STABLE_FLAG, >>>> then the start and end in handle_hva_to_gpa might not be PAGE_SIZE aligned. >>>> Thus it will cause exceptions in handle_hva_to_gpa on arm64. >>>> >>>> This patch fixes it by ignoring(not removing) the low bits of address when >>>> doing rmap_walk_ksm. >>>> >>>> Signed-off-by: jia.he@hxt-semitech.com >>> >>> I assumed you wanted this patch to be committed as >>> From:jia.he@hxt-semitech.com rather than From:hejianet@gmail.com, so I >>> made that change. Please let me know if this was inappropriate. >>> >>> You can do this yourself by adding an explicit From: line to the very >>> start of the patch's email text. >>> >>> Also, a storm of WARN_ONs is pretty poor behaviour. Is that the only >>> misbehaviour which this bug causes? Do you think the fix should be >>> backported into earlier kernels? >>> > > > Jia, Andrew, > > What is the status of this patch ? > > Suzuki I thought the patch is merged into mmotm tree. http://www.ozlabs.org/~akpm/mmotm/series But I don't know what is the next step. Cheers, Jia
On 24/05/18 09:50, Jia He wrote: > Hi Suzuki > > On 5/24/2018 4:44 PM, Suzuki K Poulose Wrote: >> On 14/05/18 10:45, Suzuki K Poulose wrote: >>> On 10/05/18 00:31, Andrew Morton wrote: >>>> On Fri, 4 May 2018 11:11:46 +0800 Jia He <hejianet@gmail.com> wrote: >>>> >>>>> In our armv8a server(QDF2400), I noticed lots of WARN_ON caused by PAGE_SIZE >>>>> unaligned for rmap_item->address under memory pressure tests(start 20 guests >>>>> and run memhog in the host). >>>>> >>>>> ... >>>>> >>>>> In rmap_walk_ksm, the rmap_item->address might still have the STABLE_FLAG, >>>>> then the start and end in handle_hva_to_gpa might not be PAGE_SIZE aligned. >>>>> Thus it will cause exceptions in handle_hva_to_gpa on arm64. >>>>> >>>>> This patch fixes it by ignoring(not removing) the low bits of address when >>>>> doing rmap_walk_ksm. >>>>> >>>>> Signed-off-by: jia.he@hxt-semitech.com >>>> >>>> I assumed you wanted this patch to be committed as >>>> From:jia.he@hxt-semitech.com rather than From:hejianet@gmail.com, so I >>>> made that change. Please let me know if this was inappropriate. >>>> >>>> You can do this yourself by adding an explicit From: line to the very >>>> start of the patch's email text. >>>> >>>> Also, a storm of WARN_ONs is pretty poor behaviour. Is that the only >>>> misbehaviour which this bug causes? Do you think the fix should be >>>> backported into earlier kernels? >>>> >> >> >> Jia, Andrew, >> >> What is the status of this patch ? >> >> Suzuki > I thought the patch is merged into mmotm tree. > http://www.ozlabs.org/~akpm/mmotm/series > But I don't know what is the next step. Hi Jia, Thanks for the update. I think that should eventually hit mainline. When it does, please could you send the patch to stable kernel versions too ? Usually having a "Cc: stable@vger.kernel.org" in the original patch (for critical fixes) would have done the trick. But since we don't have it, please send it following the stable kernel rules. Cheers Suzuki
On 5/24/2018 5:01 PM, Suzuki K Poulose Wrote: > On 24/05/18 09:50, Jia He wrote: >> Hi Suzuki >> >> On 5/24/2018 4:44 PM, Suzuki K Poulose Wrote: >>> On 14/05/18 10:45, Suzuki K Poulose wrote: >>>> On 10/05/18 00:31, Andrew Morton wrote: >>>>> On Fri, 4 May 2018 11:11:46 +0800 Jia He <hejianet@gmail.com> wrote: >>>>> >>>>>> In our armv8a server(QDF2400), I noticed lots of WARN_ON caused by PAGE_SIZE >>>>>> unaligned for rmap_item->address under memory pressure tests(start 20 guests >>>>>> and run memhog in the host). >>>>>> >>>>>> ... >>>>>> >>>>>> In rmap_walk_ksm, the rmap_item->address might still have the STABLE_FLAG, >>>>>> then the start and end in handle_hva_to_gpa might not be PAGE_SIZE aligned. >>>>>> Thus it will cause exceptions in handle_hva_to_gpa on arm64. >>>>>> >>>>>> This patch fixes it by ignoring(not removing) the low bits of address when >>>>>> doing rmap_walk_ksm. >>>>>> >>>>>> Signed-off-by: jia.he@hxt-semitech.com >>>>> >>>>> I assumed you wanted this patch to be committed as >>>>> From:jia.he@hxt-semitech.com rather than From:hejianet@gmail.com, so I >>>>> made that change. Please let me know if this was inappropriate. >>>>> >>>>> You can do this yourself by adding an explicit From: line to the very >>>>> start of the patch's email text. >>>>> >>>>> Also, a storm of WARN_ONs is pretty poor behaviour. Is that the only >>>>> misbehaviour which this bug causes? Do you think the fix should be >>>>> backported into earlier kernels? >>>>> >>> >>> >>> Jia, Andrew, >>> >>> What is the status of this patch ? >>> >>> Suzuki >> I thought the patch is merged into mmotm tree. >> http://www.ozlabs.org/~akpm/mmotm/series >> But I don't know what is the next step. > > Hi Jia, > > Thanks for the update. I think that should eventually hit mainline. When it does, > please could you send the patch to stable kernel versions too ? > > Usually having a "Cc: stable@kernel.vger.org" in the original patch (for > critical fixes) would have done the trick. 
But since we don't have it, > please send it following the stable kernel rules. > > Cheers > Suzuki > Ok, thanks for pointing it out.
On Thu, 24 May 2018 09:44:16 +0100 Suzuki K Poulose <Suzuki.Poulose@arm.com> wrote: > On 14/05/18 10:45, Suzuki K Poulose wrote: > > On 10/05/18 00:31, Andrew Morton wrote: > >> On Fri, 4 May 2018 11:11:46 +0800 Jia He <hejianet@gmail.com> wrote: > >> > >>> In our armv8a server(QDF2400), I noticed lots of WARN_ON caused by PAGE_SIZE > >>> unaligned for rmap_item->address under memory pressure tests(start 20 guests > >>> and run memhog in the host). > >>> > >>> ... > >>> > >>> In rmap_walk_ksm, the rmap_item->address might still have the STABLE_FLAG, > >>> then the start and end in handle_hva_to_gpa might not be PAGE_SIZE aligned. > >>> Thus it will cause exceptions in handle_hva_to_gpa on arm64. > >>> > >>> This patch fixes it by ignoring(not removing) the low bits of address when > >>> doing rmap_walk_ksm. > >>> > >>> Signed-off-by: jia.he@hxt-semitech.com > >> > >> I assumed you wanted this patch to be committed as > >> From:jia.he@hxt-semitech.com rather than From:hejianet@gmail.com, so I > >> made that change. Please let me know if this was inappropriate. > >> > >> You can do this yourself by adding an explicit From: line to the very > >> start of the patch's email text. > >> > >> Also, a storm of WARN_ONs is pretty poor behaviour. Is that the only > >> misbehaviour which this bug causes? Do you think the fix should be > >> backported into earlier kernels? > >> > > > Jia, Andrew, > > What is the status of this patch ? > I have it scheduled for 4.18-rc1, with a cc:stable for backporting. I'd normally put such a fix into 4.17-rcX but I'd like to give Hugh time to review it and to generally give it a bit more time for review and test. Have you tested it yourself?
diff --git a/mm/ksm.c b/mm/ksm.c index e3cbf9a..e6a9640 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -199,6 +199,8 @@ struct rmap_item { #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ #define STABLE_FLAG 0x200 /* is listed from the stable tree */ +#define KSM_FLAG_MASK (SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG) + /* to mask all the flags */ /* The stable and unstable tree heads */ static struct rb_root one_stable_tree[1] = { RB_ROOT }; @@ -2570,10 +2572,15 @@ void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) anon_vma_lock_read(anon_vma); anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, 0, ULONG_MAX) { + unsigned long addr; + cond_resched(); vma = vmac->vma; - if (rmap_item->address < vma->vm_start || - rmap_item->address >= vma->vm_end) + + /* Ignore the stable/unstable/sqnr flags */ + addr = rmap_item->address & ~KSM_FLAG_MASK; + + if (addr < vma->vm_start || addr >= vma->vm_end) continue; /* * Initially we examine only the vma which covers this @@ -2587,8 +2594,7 @@ void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - if (!rwc->rmap_one(page, vma, - rmap_item->address, rwc->arg)) { + if (!rwc->rmap_one(page, vma, addr, rwc->arg)) { anon_vma_unlock_read(anon_vma); return; }
In our armv8a server(QDF2400), I noticed lots of WARN_ON caused by PAGE_SIZE unaligned for rmap_item->address under memory pressure tests(start 20 guests and run memhog in the host). --------------------------begin-------------------------------------- [ 410.853828] WARNING: CPU: 4 PID: 4641 at arch/arm64/kvm/../../../virt/kvm/arm/mmu.c:1826 kvm_age_hva_handler+0xc0/0xc8 [ 410.864518] Modules linked in: vhost_net vhost tap xt_CHECKSUM ipt_MASQUERADE nf_nat_masquerade_ipv4 ip6t_rpfilter ipt_REJECT nf_reject_ipv4 ip6t_REJECT nf_reject_ipv6 xt_conntrack ip_set nfnetlink ebtable_nat ebtable_broute bridge stp llc ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security ip6table_raw iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter rpcrdma ib_isert iscsi_target_mod ib_iser libiscsi scsi_transport_iscsi ib_srpt target_core_mod ib_srp scsi_transport_srp ib_ipoib rdma_ucm ib_ucm ib_umad rdma_cm ib_cm iw_cm mlx5_ib vfat fat ib_uverbs dm_mirror dm_region_hash ib_core dm_log dm_mod crc32_ce ipmi_ssif sg nfsd [ 410.935101] auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c mlx5_core ixgbe mlxfw devlink mdio ahci_platform libahci_platform qcom_emac libahci hdma hdma_mgmt i2c_qup [ 410.951369] CPU: 4 PID: 4641 Comm: memhog Tainted: G W 4.17.0-rc3+ #8 [ 410.959104] Hardware name: <snip for confidential issues> [ 410.969791] pstate: 80400005 (Nzcv daif +PAN -UAO) [ 410.974575] pc : kvm_age_hva_handler+0xc0/0xc8 [ 410.979012] lr : handle_hva_to_gpa+0xa8/0xe0 [ 410.983274] sp : ffff801761553290 [ 410.986581] x29: ffff801761553290 x28: 0000000000000000 [ 410.991888] x27: 0000000000000002 x26: 0000000000000000 [ 410.997195] x25: ffff801765430058 x24: ffff0000080b5608 [ 411.002501] x23: 0000000000000000 x22: ffff8017ccb84000 [ 411.007807] x21: 0000000003ff0000 x20: ffff8017ccb84000 [ 411.013113] x19: 
000000000000fe00 x18: ffff000008fb3c08 [ 411.018419] x17: 0000000000000000 x16: 0060001645820bd3 [ 411.023725] x15: ffff80176aacbc08 x14: 0000000000000000 [ 411.029031] x13: 0000000000000040 x12: 0000000000000228 [ 411.034337] x11: 0000000000000000 x10: 0000000000000000 [ 411.039643] x9 : 0000000000000010 x8 : 0000000000000004 [ 411.044949] x7 : 0000000000000000 x6 : 00008017f0770000 [ 411.050255] x5 : 0000fffda59f0200 x4 : 0000000000000000 [ 411.055561] x3 : 0000000000000000 x2 : 000000000000fe00 [ 411.060867] x1 : 0000000003ff0000 x0 : 0000000020000000 [ 411.066173] Call trace: [ 411.068614] kvm_age_hva_handler+0xc0/0xc8 [ 411.072703] handle_hva_to_gpa+0xa8/0xe0 [ 411.076619] kvm_age_hva+0x4c/0xe8 [ 411.080014] kvm_mmu_notifier_clear_flush_young+0x54/0x98 [ 411.085408] __mmu_notifier_clear_flush_young+0x6c/0xa0 [ 411.090627] page_referenced_one+0x154/0x1d8 [ 411.094890] rmap_walk_ksm+0x12c/0x1d0 [ 411.098632] rmap_walk+0x94/0xa0 [ 411.101854] page_referenced+0x194/0x1b0 [ 411.105770] shrink_page_list+0x674/0xc28 [ 411.109772] shrink_inactive_list+0x26c/0x5b8 [ 411.114122] shrink_node_memcg+0x35c/0x620 [ 411.118211] shrink_node+0x100/0x430 [ 411.121778] do_try_to_free_pages+0xe0/0x3a8 [ 411.126041] try_to_free_pages+0xe4/0x230 [ 411.130045] __alloc_pages_nodemask+0x564/0xdc0 [ 411.134569] alloc_pages_vma+0x90/0x228 [ 411.138398] do_anonymous_page+0xc8/0x4d0 [ 411.142400] __handle_mm_fault+0x4a0/0x508 [ 411.146489] handle_mm_fault+0xf8/0x1b0 [ 411.150321] do_page_fault+0x218/0x4b8 [ 411.154064] do_translation_fault+0x90/0xa0 [ 411.158239] do_mem_abort+0x68/0xf0 [ 411.161721] el0_da+0x24/0x28 ---------------------------end--------------------------------------- In rmap_walk_ksm, the rmap_item->address might still have the STABLE_FLAG, then the start and end in handle_hva_to_gpa might not be PAGE_SIZE aligned. Thus it will cause exceptions in handle_hva_to_gpa on arm64. This patch fixes it by ignoring(not removing) the low bits of address when doing rmap_walk_ksm. 
Signed-off-by: jia.he@hxt-semitech.com --- v2: refine the codes as suggested by Claudio Imbrenda mm/ksm.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-)