diff mbox

[v2] mm/ksm: ignore STABLE_FLAG of rmap_item->address in rmap_walk_ksm

Message ID 1525403506-6750-1-git-send-email-hejianet@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jia He May 4, 2018, 3:11 a.m. UTC
In our armv8a server (QDF2400), I noticed lots of WARN_ONs caused by the
rmap_item->address not being PAGE_SIZE aligned under memory pressure tests
(start 20 guests and run memhog in the host).

--------------------------begin--------------------------------------
[  410.853828] WARNING: CPU: 4 PID: 4641 at
arch/arm64/kvm/../../../virt/kvm/arm/mmu.c:1826
kvm_age_hva_handler+0xc0/0xc8
[  410.864518] Modules linked in: vhost_net vhost tap xt_CHECKSUM
ipt_MASQUERADE nf_nat_masquerade_ipv4 ip6t_rpfilter ipt_REJECT
nf_reject_ipv4 ip6t_REJECT nf_reject_ipv6 xt_conntrack ip_set nfnetlink
ebtable_nat ebtable_broute bridge stp llc ip6table_nat nf_conntrack_ipv6
nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security
ip6table_raw iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4
nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw
ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter
rpcrdma ib_isert iscsi_target_mod ib_iser libiscsi scsi_transport_iscsi
ib_srpt target_core_mod ib_srp scsi_transport_srp ib_ipoib rdma_ucm
ib_ucm ib_umad rdma_cm ib_cm iw_cm mlx5_ib vfat fat ib_uverbs dm_mirror
dm_region_hash ib_core dm_log dm_mod crc32_ce ipmi_ssif sg nfsd
[  410.935101]  auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs
libcrc32c mlx5_core ixgbe mlxfw devlink mdio ahci_platform
libahci_platform qcom_emac libahci hdma hdma_mgmt i2c_qup
[  410.951369] CPU: 4 PID: 4641 Comm: memhog Tainted: G        W
4.17.0-rc3+ #8
[  410.959104] Hardware name: <snip for confidential issues>
[  410.969791] pstate: 80400005 (Nzcv daif +PAN -UAO)
[  410.974575] pc : kvm_age_hva_handler+0xc0/0xc8
[  410.979012] lr : handle_hva_to_gpa+0xa8/0xe0
[  410.983274] sp : ffff801761553290
[  410.986581] x29: ffff801761553290 x28: 0000000000000000
[  410.991888] x27: 0000000000000002 x26: 0000000000000000
[  410.997195] x25: ffff801765430058 x24: ffff0000080b5608
[  411.002501] x23: 0000000000000000 x22: ffff8017ccb84000
[  411.007807] x21: 0000000003ff0000 x20: ffff8017ccb84000
[  411.013113] x19: 000000000000fe00 x18: ffff000008fb3c08
[  411.018419] x17: 0000000000000000 x16: 0060001645820bd3
[  411.023725] x15: ffff80176aacbc08 x14: 0000000000000000
[  411.029031] x13: 0000000000000040 x12: 0000000000000228
[  411.034337] x11: 0000000000000000 x10: 0000000000000000
[  411.039643] x9 : 0000000000000010 x8 : 0000000000000004
[  411.044949] x7 : 0000000000000000 x6 : 00008017f0770000
[  411.050255] x5 : 0000fffda59f0200 x4 : 0000000000000000
[  411.055561] x3 : 0000000000000000 x2 : 000000000000fe00
[  411.060867] x1 : 0000000003ff0000 x0 : 0000000020000000
[  411.066173] Call trace:
[  411.068614]  kvm_age_hva_handler+0xc0/0xc8
[  411.072703]  handle_hva_to_gpa+0xa8/0xe0
[  411.076619]  kvm_age_hva+0x4c/0xe8
[  411.080014]  kvm_mmu_notifier_clear_flush_young+0x54/0x98
[  411.085408]  __mmu_notifier_clear_flush_young+0x6c/0xa0
[  411.090627]  page_referenced_one+0x154/0x1d8
[  411.094890]  rmap_walk_ksm+0x12c/0x1d0
[  411.098632]  rmap_walk+0x94/0xa0
[  411.101854]  page_referenced+0x194/0x1b0
[  411.105770]  shrink_page_list+0x674/0xc28
[  411.109772]  shrink_inactive_list+0x26c/0x5b8
[  411.114122]  shrink_node_memcg+0x35c/0x620
[  411.118211]  shrink_node+0x100/0x430
[  411.121778]  do_try_to_free_pages+0xe0/0x3a8
[  411.126041]  try_to_free_pages+0xe4/0x230
[  411.130045]  __alloc_pages_nodemask+0x564/0xdc0
[  411.134569]  alloc_pages_vma+0x90/0x228
[  411.138398]  do_anonymous_page+0xc8/0x4d0
[  411.142400]  __handle_mm_fault+0x4a0/0x508
[  411.146489]  handle_mm_fault+0xf8/0x1b0
[  411.150321]  do_page_fault+0x218/0x4b8
[  411.154064]  do_translation_fault+0x90/0xa0
[  411.158239]  do_mem_abort+0x68/0xf0
[  411.161721]  el0_da+0x24/0x28
---------------------------end---------------------------------------

In rmap_walk_ksm, the rmap_item->address might still have the STABLE_FLAG
set, so the start and end passed to handle_hva_to_gpa might not be PAGE_SIZE
aligned. This causes exceptions in handle_hva_to_gpa on arm64.

This patch fixes it by ignoring (not removing) the low bits of the address
when doing rmap_walk_ksm.

Signed-off-by: jia.he@hxt-semitech.com
---
v2: refine the codes as suggested by Claudio Imbrenda

 mm/ksm.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

Comments

Jia He May 4, 2018, 5:56 a.m. UTC | #1
[+Hugh Dickins]

Cheers,
Jia
On 5/4/2018 11:11 AM, Jia He Wrote:
> In our armv8a server(QDF2400), I noticed lots of WARN_ON caused by PAGE_SIZE
> unaligned for rmap_item->address under memory pressure tests(start 20 guests
> and run memhog in the host).
>
> --------------------------begin--------------------------------------
> [  410.853828] WARNING: CPU: 4 PID: 4641 at
> arch/arm64/kvm/../../../virt/kvm/arm/mmu.c:1826
> kvm_age_hva_handler+0xc0/0xc8
> [  410.864518] Modules linked in: vhost_net vhost tap xt_CHECKSUM
> ipt_MASQUERADE nf_nat_masquerade_ipv4 ip6t_rpfilter ipt_REJECT
> nf_reject_ipv4 ip6t_REJECT nf_reject_ipv6 xt_conntrack ip_set nfnetlink
> ebtable_nat ebtable_broute bridge stp llc ip6table_nat nf_conntrack_ipv6
> nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security
> ip6table_raw iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4
> nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw
> ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter
> rpcrdma ib_isert iscsi_target_mod ib_iser libiscsi scsi_transport_iscsi
> ib_srpt target_core_mod ib_srp scsi_transport_srp ib_ipoib rdma_ucm
> ib_ucm ib_umad rdma_cm ib_cm iw_cm mlx5_ib vfat fat ib_uverbs dm_mirror
> dm_region_hash ib_core dm_log dm_mod crc32_ce ipmi_ssif sg nfsd
> [  410.935101]  auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs
> libcrc32c mlx5_core ixgbe mlxfw devlink mdio ahci_platform
> libahci_platform qcom_emac libahci hdma hdma_mgmt i2c_qup
> [  410.951369] CPU: 4 PID: 4641 Comm: memhog Tainted: G        W
> 4.17.0-rc3+ #8
> [  410.959104] Hardware name: <snip for confidential issues>
> [  410.969791] pstate: 80400005 (Nzcv daif +PAN -UAO)
> [  410.974575] pc : kvm_age_hva_handler+0xc0/0xc8
> [  410.979012] lr : handle_hva_to_gpa+0xa8/0xe0
> [  410.983274] sp : ffff801761553290
> [  410.986581] x29: ffff801761553290 x28: 0000000000000000
> [  410.991888] x27: 0000000000000002 x26: 0000000000000000
> [  410.997195] x25: ffff801765430058 x24: ffff0000080b5608
> [  411.002501] x23: 0000000000000000 x22: ffff8017ccb84000
> [  411.007807] x21: 0000000003ff0000 x20: ffff8017ccb84000
> [  411.013113] x19: 000000000000fe00 x18: ffff000008fb3c08
> [  411.018419] x17: 0000000000000000 x16: 0060001645820bd3
> [  411.023725] x15: ffff80176aacbc08 x14: 0000000000000000
> [  411.029031] x13: 0000000000000040 x12: 0000000000000228
> [  411.034337] x11: 0000000000000000 x10: 0000000000000000
> [  411.039643] x9 : 0000000000000010 x8 : 0000000000000004
> [  411.044949] x7 : 0000000000000000 x6 : 00008017f0770000
> [  411.050255] x5 : 0000fffda59f0200 x4 : 0000000000000000
> [  411.055561] x3 : 0000000000000000 x2 : 000000000000fe00
> [  411.060867] x1 : 0000000003ff0000 x0 : 0000000020000000
> [  411.066173] Call trace:
> [  411.068614]  kvm_age_hva_handler+0xc0/0xc8
> [  411.072703]  handle_hva_to_gpa+0xa8/0xe0
> [  411.076619]  kvm_age_hva+0x4c/0xe8
> [  411.080014]  kvm_mmu_notifier_clear_flush_young+0x54/0x98
> [  411.085408]  __mmu_notifier_clear_flush_young+0x6c/0xa0
> [  411.090627]  page_referenced_one+0x154/0x1d8
> [  411.094890]  rmap_walk_ksm+0x12c/0x1d0
> [  411.098632]  rmap_walk+0x94/0xa0
> [  411.101854]  page_referenced+0x194/0x1b0
> [  411.105770]  shrink_page_list+0x674/0xc28
> [  411.109772]  shrink_inactive_list+0x26c/0x5b8
> [  411.114122]  shrink_node_memcg+0x35c/0x620
> [  411.118211]  shrink_node+0x100/0x430
> [  411.121778]  do_try_to_free_pages+0xe0/0x3a8
> [  411.126041]  try_to_free_pages+0xe4/0x230
> [  411.130045]  __alloc_pages_nodemask+0x564/0xdc0
> [  411.134569]  alloc_pages_vma+0x90/0x228
> [  411.138398]  do_anonymous_page+0xc8/0x4d0
> [  411.142400]  __handle_mm_fault+0x4a0/0x508
> [  411.146489]  handle_mm_fault+0xf8/0x1b0
> [  411.150321]  do_page_fault+0x218/0x4b8
> [  411.154064]  do_translation_fault+0x90/0xa0
> [  411.158239]  do_mem_abort+0x68/0xf0
> [  411.161721]  el0_da+0x24/0x28
> ---------------------------end---------------------------------------
>
> In rmap_walk_ksm, the rmap_item->address might still have the STABLE_FLAG,
> then the start and end in handle_hva_to_gpa might not be PAGE_SIZE aligned.
> Thus it will cause exceptions in handle_hva_to_gpa on arm64.
>
> This patch fixes it by ignoring(not removing) the low bits of address when
> doing rmap_walk_ksm.
>
> Signed-off-by: jia.he@hxt-semitech.com
> ---
> v2: refine the codes as suggested by Claudio Imbrenda
>
>   mm/ksm.c | 14 ++++++++++----
>   1 file changed, 10 insertions(+), 4 deletions(-)
>
> diff --git a/mm/ksm.c b/mm/ksm.c
> index e3cbf9a..e6a9640 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -199,6 +199,8 @@ struct rmap_item {
>   #define SEQNR_MASK	0x0ff	/* low bits of unstable tree seqnr */
>   #define UNSTABLE_FLAG	0x100	/* is a node of the unstable tree */
>   #define STABLE_FLAG	0x200	/* is listed from the stable tree */
> +#define KSM_FLAG_MASK	(SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG)
> +				/* to mask all the flags */
>   
>   /* The stable and unstable tree heads */
>   static struct rb_root one_stable_tree[1] = { RB_ROOT };
> @@ -2570,10 +2572,15 @@ void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
>   		anon_vma_lock_read(anon_vma);
>   		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
>   					       0, ULONG_MAX) {
> +			unsigned long addr;
> +
>   			cond_resched();
>   			vma = vmac->vma;
> -			if (rmap_item->address < vma->vm_start ||
> -			    rmap_item->address >= vma->vm_end)
> +
> +			/* Ignore the stable/unstable/sqnr flags */
> +			addr = rmap_item->address & ~KSM_FLAG_MASK;
> +
> +			if (addr < vma->vm_start || addr >= vma->vm_end)
>   				continue;
>   			/*
>   			 * Initially we examine only the vma which covers this
> @@ -2587,8 +2594,7 @@ void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
>   			if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
>   				continue;
>   
> -			if (!rwc->rmap_one(page, vma,
> -					rmap_item->address, rwc->arg)) {
> +			if (!rwc->rmap_one(page, vma, addr, rwc->arg)) {
>   				anon_vma_unlock_read(anon_vma);
>   				return;
>   			}
Andrew Morton May 9, 2018, 11:31 p.m. UTC | #2
On Fri,  4 May 2018 11:11:46 +0800 Jia He <hejianet@gmail.com> wrote:

> In our armv8a server(QDF2400), I noticed lots of WARN_ON caused by PAGE_SIZE
> unaligned for rmap_item->address under memory pressure tests(start 20 guests
> and run memhog in the host).
> 
> ...
> 
> In rmap_walk_ksm, the rmap_item->address might still have the STABLE_FLAG,
> then the start and end in handle_hva_to_gpa might not be PAGE_SIZE aligned.
> Thus it will cause exceptions in handle_hva_to_gpa on arm64.
> 
> This patch fixes it by ignoring(not removing) the low bits of address when
> doing rmap_walk_ksm.
> 
> Signed-off-by: jia.he@hxt-semitech.com

I assumed you wanted this patch to be committed as
From:jia.he@hxt-semitech.com rather than From:hejianet@gmail.com, so I
made that change.  Please let me know if this was inappropriate.

You can do this yourself by adding an explicit From: line to the very
start of the patch's email text.

Also, a storm of WARN_ONs is pretty poor behaviour.  Is that the only
misbehaviour which this bug causes?  Do you think the fix should be
backported into earlier kernels?
Jia He May 10, 2018, 1:26 a.m. UTC | #3
Hi Andrew


On 5/10/2018 7:31 AM, Andrew Morton Wrote:
> On Fri,  4 May 2018 11:11:46 +0800 Jia He <hejianet@gmail.com> wrote:
>
>> In our armv8a server(QDF2400), I noticed lots of WARN_ON caused by PAGE_SIZE
>> unaligned for rmap_item->address under memory pressure tests(start 20 guests
>> and run memhog in the host).
>>
>> ...
>>
>> In rmap_walk_ksm, the rmap_item->address might still have the STABLE_FLAG,
>> then the start and end in handle_hva_to_gpa might not be PAGE_SIZE aligned.
>> Thus it will cause exceptions in handle_hva_to_gpa on arm64.
>>
>> This patch fixes it by ignoring(not removing) the low bits of address when
>> doing rmap_walk_ksm.
>>
>> Signed-off-by: jia.he@hxt-semitech.com
> I assumed you wanted this patch to be committed as
> From:jia.he@hxt-semitech.com rather than From:hejianet@gmail.com, so I
> made that change.  Please let me know if this was inappropriate.
Thanks. Because there are still some issues with our company's mail server,
I have to use my gmail mailbox.
>
> You can do this yourself by adding an explicit From: line to the very
> start of the patch's email text.
>
> Also, a storm of WARN_ONs is pretty poor behaviour.  Is that the only
> misbehaviour which this bug causes?  Do you think the fix should be
> backported into earlier kernels?
IMO, it should be backported to the stable tree; it seems that I missed CCing
the stable tree ;-)
The storm of WARN_ONs is very easy for me to reproduce.
More than that, I observed a panic (not reproducible) as follows:
[35380.805825] page:ffff7fe003742d80 count:-4871 mapcount:-2126053375 
mapping:          (null) index:0x0
[35380.815024] flags: 0x1fffc00000000000()
[35380.818845] raw: 1fffc00000000000 0000000000000000 0000000000000000 
ffffecf981470000
[35380.826569] raw: dead000000000100 dead000000000200 ffff8017c001c000 
0000000000000000
[35380.834294] page dumped because: nonzero _refcount
[35380.839069] Modules linked in: vhost_net vhost tap ebtable_filter ebtables 
ip6table_filter ip6_tables iptable_filter fcoe libfcoe libfc 8021q garp mrp stp 
llc scsi_transport_fc openvswitch nf_conntrack_ipv6 nf_nat_ipv6 
nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_defrag_ipv6 nf_nat nf_conntrack 
vfat fat rpcrdma ib_isert iscsi_target_mod ib_iser libiscsi scsi_transport_iscsi 
ib_srpt target_core_mod ib_srp scsi_transport_srp ib_ipoib rdma_ucm ib_ucm 
ib_uverbs ib_umad rdma_cm ib_cm iw_cm mlx5_ib ib_core crc32_ce ipmi_ssif tpm_tis 
tpm_tis_core sg nfsd auth_rpcgss nfs_acl lockd grace sunrpc dm_multipath 
ip_tables xfs libcrc32c mlx5_core mlxfw devlink ahci_platform libahci_platform 
libahci qcom_emac sdhci_acpi sdhci hdma mmc_core hdma_mgmt i2c_qup dm_mirror 
dm_region_hash dm_log dm_mod
[35380.908341] CPU: 29 PID: 18323 Comm: qemu-kvm Tainted: G W       
4.14.15-5.hxt.aarch64 #1
[35380.917107] Hardware name: <snip for confidential issues>
[35380.930909] Call trace:
[35380.933345] [<ffff000008088f00>] dump_backtrace+0x0/0x22c
[35380.938723] [<ffff000008089150>] show_stack+0x24/0x2c
[35380.943759] [<ffff00000893c078>] dump_stack+0x8c/0xb0
[35380.948794] [<ffff00000820ab50>] bad_page+0xf4/0x154
[35380.953740] [<ffff000008211ce8>] free_pages_check_bad+0x90/0x9c
[35380.959642] [<ffff00000820c430>] free_pcppages_bulk+0x464/0x518
[35380.965545] [<ffff00000820db98>] free_hot_cold_page+0x22c/0x300
[35380.971448] [<ffff0000082176fc>] __put_page+0x54/0x60
[35380.976484] [<ffff0000080b1164>] unmap_stage2_range+0x170/0x2b4
[35380.982385] [<ffff0000080b12d8>] kvm_unmap_hva_handler+0x30/0x40
[35380.988375] [<ffff0000080b0104>] handle_hva_to_gpa+0xb0/0xec
[35380.994016] [<ffff0000080b2644>] kvm_unmap_hva_range+0x5c/0xd0
[35380.999833] [<ffff0000080a8054>] 
kvm_mmu_notifier_invalidate_range_start+0x60/0xb0
[35381.007387] [<ffff000008271f44>] __mmu_notifier_invalidate_range_start+0x64/0x8c
[35381.014765] [<ffff0000082547c8>] try_to_unmap_one+0x78c/0x7a4
[35381.020493] [<ffff000008276d04>] rmap_walk_ksm+0x124/0x1a0
[35381.025961] [<ffff0000082551b4>] rmap_walk+0x94/0x98
[35381.030909] [<ffff0000082555e4>] try_to_unmap+0x100/0x124
[35381.036293] [<ffff00000828243c>] unmap_and_move+0x480/0x6fc
[35381.041847] [<ffff000008282b6c>] migrate_pages+0x10c/0x288
[35381.047318] [<ffff00000823c164>] compact_zone+0x238/0x954
[35381.052697] [<ffff00000823c944>] compact_zone_order+0xc4/0xe8
[35381.058427] [<ffff00000823d25c>] try_to_compact_pages+0x160/0x294
[35381.064503] [<ffff00000820f074>] __alloc_pages_direct_compact+0x68/0x194
[35381.071187] [<ffff000008210138>] __alloc_pages_nodemask+0xc20/0xf7c
[35381.077437] [<ffff0000082709e4>] alloc_pages_vma+0x1a4/0x1c0
[35381.083080] [<ffff000008285b68>] do_huge_pmd_anonymous_page+0x128/0x324
[35381.089677] [<ffff000008248a24>] __handle_mm_fault+0x71c/0x7e8
[35381.095492] [<ffff000008248be8>] handle_mm_fault+0xf8/0x194
[35381.101049] [<ffff000008240dcc>] __get_user_pages+0x124/0x34c
[35381.106777] [<ffff000008241870>] populate_vma_page_range+0x90/0x9c
[35381.112941] [<ffff000008241940>] __mm_populate+0xc4/0x15c
[35381.118322] [<ffff00000824b294>] SyS_mlockall+0x100/0x164
[35381.123705] Exception stack(0xffff800dce5f3ec0 to 0xffff800dce5f4000)
[35381.130128] 3ec0: 0000000000000003 d6e6024cc9b87e00 0000aaaabe94f000 
0000000000000000
[35381.137940] 3ee0: 0000000000000002 0000000000000000 0000000000000000 
0000aaaacf6fc3c0
[35381.145753] 3f00: 00000000000000e6 0000aaaacf6fc490 0000ffffeeeab0f0 
d6e6024cc9b87e00
[35381.153565] 3f20: 0000000000000000 0000aaaabe81b3c0 0000000000000020 
00009e53eff806b5
[35381.161379] 3f40: 0000aaaabe94de48 0000ffffa7c269b0 0000000000000011 
0000ffffeeeabf68
[35381.169190] 3f60: 0000aaaaceacfe60 0000aaaabe94f000 0000aaaabe9ba358 
0000aaaabe7ffb80
[35381.177003] 3f80: 0000aaaabe9ba000 0000aaaabe959f64 0000000000000000 
0000aaaabe94f000
[35381.184815] 3fa0: 0000000000000000 0000ffffeeeabdb0 0000aaaabe5f3bf8 
0000ffffeeeabdb0
[35381.192628] 3fc0: 0000ffffa7c269b8 0000000060000000 0000000000000003 
00000000000000e6
[35381.200440] 3fe0: 0000000000000000 0000000000000000 0000000000000000 
0000000000000000
[35381.208254] [<ffff00000808339c>] __sys_trace_return+0x0/0x4
[35381.213809] Disabling lock debugging due to kernel taint

I once injected a fault on purpose in kvm_unmap_hva_range by setting
size=size-0x200, and the call trace was similar to the one above. Thus, I
think the panic is caused by the same root cause as the WARN_ON.
Suzuki K Poulose May 14, 2018, 9:09 a.m. UTC | #4
On 10/05/18 02:26, Jia He wrote:
> Hi Andrew
> 
> 
> On 5/10/2018 7:31 AM, Andrew Morton Wrote:
>> On Fri,  4 May 2018 11:11:46 +0800 Jia He <hejianet@gmail.com> wrote:
>>
>>> In our armv8a server(QDF2400), I noticed lots of WARN_ON caused by PAGE_SIZE
>>> unaligned for rmap_item->address under memory pressure tests(start 20 guests
>>> and run memhog in the host).
>>>
>>> ...
>>>
>>> In rmap_walk_ksm, the rmap_item->address might still have the STABLE_FLAG,
>>> then the start and end in handle_hva_to_gpa might not be PAGE_SIZE aligned.
>>> Thus it will cause exceptions in handle_hva_to_gpa on arm64.
>>>
>>> This patch fixes it by ignoring(not removing) the low bits of address when
>>> doing rmap_walk_ksm.
>>>
>>> Signed-off-by: jia.he@hxt-semitech.com
>> I assumed you wanted this patch to be committed as
>> From:jia.he@hxt-semitech.com rather than From:hejianet@gmail.com, so I
>> made that change.  Please let me know if this was inappropriate.
> Thanks, because there is still some issues in our company's mail server.
> I have to use my gmail mailbox.
>>
>> You can do this yourself by adding an explicit From: line to the very
>> start of the patch's email text.
>>
>> Also, a storm of WARN_ONs is pretty poor behaviour.  Is that the only
>> misbehaviour which this bug causes?  Do you think the fix should be
>> backported into earlier kernels?
> IMO, it should be backported to stable tree, seems that I missed CC to stable tree ;-)
> the stom of WARN_ONs is very easy for me to reproduce.
> More than that, I watched a panic (not reproducible) as follows:


> [35380.805825] page:ffff7fe003742d80 count:-4871 mapcount:-2126053375 mapping:          (null) index:0x0
> [35380.815024] flags: 0x1fffc00000000000()
> [35380.818845] raw: 1fffc00000000000 0000000000000000 0000000000000000 ffffecf981470000
> [35380.826569] raw: dead000000000100 dead000000000200 ffff8017c001c000 0000000000000000
> [35380.834294] page dumped because: nonzero _refcount

> [35380.908341] CPU: 29 PID: 18323 Comm: qemu-kvm Tainted: G W 4.14.15-5.hxt.aarch64 #1
> [35380.917107] Hardware name: <snip for confidential issues>
> [35380.930909] Call trace:
> [35380.933345] [<ffff000008088f00>] dump_backtrace+0x0/0x22c
> [35380.938723] [<ffff000008089150>] show_stack+0x24/0x2c
> [35380.943759] [<ffff00000893c078>] dump_stack+0x8c/0xb0
> [35380.948794] [<ffff00000820ab50>] bad_page+0xf4/0x154
> [35380.953740] [<ffff000008211ce8>] free_pages_check_bad+0x90/0x9c
> [35380.959642] [<ffff00000820c430>] free_pcppages_bulk+0x464/0x518
> [35380.965545] [<ffff00000820db98>] free_hot_cold_page+0x22c/0x300
> [35380.971448] [<ffff0000082176fc>] __put_page+0x54/0x60
> [35380.976484] [<ffff0000080b1164>] unmap_stage2_range+0x170/0x2b4
> [35380.982385] [<ffff0000080b12d8>] kvm_unmap_hva_handler+0x30/0x40
> [35380.988375] [<ffff0000080b0104>] handle_hva_to_gpa+0xb0/0xec
> [35380.994016] [<ffff0000080b2644>] kvm_unmap_hva_range+0x5c/0xd0
> [35380.999833] [<ffff0000080a8054>] kvm_mmu_notifier_invalidate_range_start+0x60/0xb0
> [35381.007387] [<ffff000008271f44>] __mmu_notifier_invalidate_range_start+0x64/0x8c
> [35381.014765] [<ffff0000082547c8>] try_to_unmap_one+0x78c/0x7a4
> [35381.020493] [<ffff000008276d04>] rmap_walk_ksm+0x124/0x1a0
> [35381.025961] [<ffff0000082551b4>] rmap_walk+0x94/0x98
> [35381.030909] [<ffff0000082555e4>] try_to_unmap+0x100/0x124
> [35381.036293] [<ffff00000828243c>] unmap_and_move+0x480/0x6fc
> [35381.041847] [<ffff000008282b6c>] migrate_pages+0x10c/0x288
> [35381.047318] [<ffff00000823c164>] compact_zone+0x238/0x954
> [35381.052697] [<ffff00000823c944>] compact_zone_order+0xc4/0xe8
> [35381.058427] [<ffff00000823d25c>] try_to_compact_pages+0x160/0x294
> [35381.064503] [<ffff00000820f074>] __alloc_pages_direct_compact+0x68/0x194
> [35381.071187] [<ffff000008210138>] __alloc_pages_nodemask+0xc20/0xf7c
> [35381.077437] [<ffff0000082709e4>] alloc_pages_vma+0x1a4/0x1c0
> [35381.083080] [<ffff000008285b68>] do_huge_pmd_anonymous_page+0x128/0x324
> [35381.089677] [<ffff000008248a24>] __handle_mm_fault+0x71c/0x7e8
> [35381.095492] [<ffff000008248be8>] handle_mm_fault+0xf8/0x194
> [35381.101049] [<ffff000008240dcc>] __get_user_pages+0x124/0x34c
> [35381.106777] [<ffff000008241870>] populate_vma_page_range+0x90/0x9c
> [35381.112941] [<ffff000008241940>] __mm_populate+0xc4/0x15c
> [35381.118322] [<ffff00000824b294>] SyS_mlockall+0x100/0x164
> [35381.123705] Exception stack(0xffff800dce5f3ec0 to 0xffff800dce5f4000)
> [35381.130128] 3ec0: 0000000000000003 d6e6024cc9b87e00 0000aaaabe94f000 0000000000000000
> [35381.137940] 3ee0: 0000000000000002 0000000000000000 0000000000000000 0000aaaacf6fc3c0
> [35381.145753] 3f00: 00000000000000e6 0000aaaacf6fc490 0000ffffeeeab0f0 d6e6024cc9b87e00
> [35381.153565] 3f20: 0000000000000000 0000aaaabe81b3c0 0000000000000020 00009e53eff806b5
> [35381.161379] 3f40: 0000aaaabe94de48 0000ffffa7c269b0 0000000000000011 0000ffffeeeabf68
> [35381.169190] 3f60: 0000aaaaceacfe60 0000aaaabe94f000 0000aaaabe9ba358 0000aaaabe7ffb80
> [35381.177003] 3f80: 0000aaaabe9ba000 0000aaaabe959f64 0000000000000000 0000aaaabe94f000
> [35381.184815] 3fa0: 0000000000000000 0000ffffeeeabdb0 0000aaaabe5f3bf8 0000ffffeeeabdb0
> [35381.192628] 3fc0: 0000ffffa7c269b8 0000000060000000 0000000000000003 00000000000000e6
> [35381.200440] 3fe0: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
> [35381.208254] [<ffff00000808339c>] __sys_trace_return+0x0/0x4
> [35381.213809] Disabling lock debugging due to kernel taint
> 
> I ever injected a fault on purpose in kvm_unmap_hva_range by set size=size-0x200, the call trace is similar
> as above. Thus, I thought the panic is similarly caused by the root cause of WARN_ON


Please could you share your "changes" (that injected the fault) that triggered this Panic
and the steps that triggered this ?

The only reason we should get there is by trying to put a page that is not
owned by the KVM Stage 2 page table, because either:

1) It was free'd already ? - We had some race conditions there which were
fixed.
2) The code tries to access something that doesn't belong there. - If this
happens, that doesn't look good for the simple change you mentioned. So we
would like to know more about the situation to see if there is something we
need to address.

Suzuki
Suzuki K Poulose May 14, 2018, 9:45 a.m. UTC | #5
On 10/05/18 00:31, Andrew Morton wrote:
> On Fri,  4 May 2018 11:11:46 +0800 Jia He <hejianet@gmail.com> wrote:
> 
>> In our armv8a server(QDF2400), I noticed lots of WARN_ON caused by PAGE_SIZE
>> unaligned for rmap_item->address under memory pressure tests(start 20 guests
>> and run memhog in the host).
>>
>> ...
>>
>> In rmap_walk_ksm, the rmap_item->address might still have the STABLE_FLAG,
>> then the start and end in handle_hva_to_gpa might not be PAGE_SIZE aligned.
>> Thus it will cause exceptions in handle_hva_to_gpa on arm64.
>>
>> This patch fixes it by ignoring(not removing) the low bits of address when
>> doing rmap_walk_ksm.
>>
>> Signed-off-by: jia.he@hxt-semitech.com
> 
> I assumed you wanted this patch to be committed as
> From:jia.he@hxt-semitech.com rather than From:hejianet@gmail.com, so I
> made that change.  Please let me know if this was inappropriate.
> 
> You can do this yourself by adding an explicit From: line to the very
> start of the patch's email text.
> 
> Also, a storm of WARN_ONs is pretty poor behaviour.  Is that the only
> misbehaviour which this bug causes?  Do you think the fix should be
> backported into earlier kernels?
> 

I think it's not just the WARN_ON(). We do more than what is probably
intended with an unaligned address, i.e. we could be modifying the
flags for other pages that were not affected.

e.g :

In the original report [0], the trace looked like :


[  800.511498] [<ffff0000080b4f2c>] kvm_age_hva_handler+0xcc/0xd4
[  800.517324] [<ffff0000080b4838>] handle_hva_to_gpa+0xec/0x15c
[  800.523063] [<ffff0000080b6c5c>] kvm_age_hva+0x5c/0xcc
[  800.528194] [<ffff0000080a7c3c>] kvm_mmu_notifier_clear_flush_young+0x54/0x90
[  800.535324] [<ffff00000827a0e8>] __mmu_notifier_clear_flush_young+0x6c/0xa8
[  800.542279] [<ffff00000825a644>] page_referenced_one+0x1e0/0x1fc
[  800.548279] [<ffff00000827e8f8>] rmap_walk_ksm+0x124/0x1a0
[  800.553759] [<ffff00000825c974>] rmap_walk+0x94/0x98
[  800.558717] [<ffff00000825ca98>] page_referenced+0x120/0x180
[  800.564369] [<ffff000008228c58>] shrink_active_list+0x218/0x4a4
[  800.570281] [<ffff000008229470>] shrink_node_memcg+0x58c/0x6fc
[  800.576107] [<ffff0000082296c4>] shrink_node+0xe4/0x328
[  800.581325] [<ffff000008229c9c>] do_try_to_free_pages+0xe4/0x3b8
[  800.587324] [<ffff00000822a094>] try_to_free_pages+0x124/0x234
[  800.593150] [<ffff000008216aa0>] __alloc_pages_nodemask+0x564/0xf7c
[  800.599412] [<ffff000008292814>] khugepaged_alloc_page+0x38/0xb8
[  800.605411] [<ffff0000082933bc>] collapse_huge_page+0x74/0xd70
[  800.611238] [<ffff00000829470c>] khugepaged_scan_mm_slot+0x654/0xa98
[  800.617585] [<ffff000008294e0c>] khugepaged+0x2bc/0x49c
[  800.622803] [<ffff0000080ffb70>] kthread+0x124/0x150
[  800.627762] [<ffff0000080849f0>] ret_from_fork+0x10/0x1c
[  800.633066] ---[ end trace 944c130b5252fb01 ]---

Now, the ksm wants to mark *a page* as referenced via page_referenced_one(),
passing it an unaligned address. This could eventually turn out to be
one of :

ptep_clear_flush_young_notify(address, address + PAGE_SIZE)

or

pmdp_clear_flush_young_notify(address, address + PMD_SIZE)

which now spans two pages/pmds and the notifier consumer might
take an action on the second page as well, which is not something
intended. So, I do think that old behavior is wrong and has other
side effects as mentioned above.

[0] https://lkml.kernel.org/r/1525244911-5519-1-git-send-email-hejianet@gmail.com

Suzuki
Suzuki K Poulose May 24, 2018, 8:44 a.m. UTC | #6
On 14/05/18 10:45, Suzuki K Poulose wrote:
> On 10/05/18 00:31, Andrew Morton wrote:
>> On Fri,  4 May 2018 11:11:46 +0800 Jia He <hejianet@gmail.com> wrote:
>>
>>> In our armv8a server(QDF2400), I noticed lots of WARN_ON caused by PAGE_SIZE
>>> unaligned for rmap_item->address under memory pressure tests(start 20 guests
>>> and run memhog in the host).
>>>
>>> ...
>>>
>>> In rmap_walk_ksm, the rmap_item->address might still have the STABLE_FLAG,
>>> then the start and end in handle_hva_to_gpa might not be PAGE_SIZE aligned.
>>> Thus it will cause exceptions in handle_hva_to_gpa on arm64.
>>>
>>> This patch fixes it by ignoring(not removing) the low bits of address when
>>> doing rmap_walk_ksm.
>>>
>>> Signed-off-by: jia.he@hxt-semitech.com
>>
>> I assumed you wanted this patch to be committed as
>> From:jia.he@hxt-semitech.com rather than From:hejianet@gmail.com, so I
>> made that change.  Please let me know if this was inappropriate.
>>
>> You can do this yourself by adding an explicit From: line to the very
>> start of the patch's email text.
>>
>> Also, a storm of WARN_ONs is pretty poor behaviour.  Is that the only
>> misbehaviour which this bug causes?  Do you think the fix should be
>> backported into earlier kernels?
>>


Jia, Andrew,

What is the status of this patch ?

Suzuki

> 
> I think its just not the WARN_ON(). We do more than what is probably
> intended with an unaligned address. i.e, We could be modifying the
> flags for other pages that were not affected.
> 
> e.g :
> 
> In the original report [0], the trace looked like :
> 
> 
> [  800.511498] [<ffff0000080b4f2c>] kvm_age_hva_handler+0xcc/0xd4
> [  800.517324] [<ffff0000080b4838>] handle_hva_to_gpa+0xec/0x15c
> [  800.523063] [<ffff0000080b6c5c>] kvm_age_hva+0x5c/0xcc
> [  800.528194] [<ffff0000080a7c3c>] kvm_mmu_notifier_clear_flush_young+0x54/0x90
> [  800.535324] [<ffff00000827a0e8>] __mmu_notifier_clear_flush_young+0x6c/0xa8
> [  800.542279] [<ffff00000825a644>] page_referenced_one+0x1e0/0x1fc
> [  800.548279] [<ffff00000827e8f8>] rmap_walk_ksm+0x124/0x1a0
> [  800.553759] [<ffff00000825c974>] rmap_walk+0x94/0x98
> [  800.558717] [<ffff00000825ca98>] page_referenced+0x120/0x180
> [  800.564369] [<ffff000008228c58>] shrink_active_list+0x218/0x4a4
> [  800.570281] [<ffff000008229470>] shrink_node_memcg+0x58c/0x6fc
> [  800.576107] [<ffff0000082296c4>] shrink_node+0xe4/0x328
> [  800.581325] [<ffff000008229c9c>] do_try_to_free_pages+0xe4/0x3b8
> [  800.587324] [<ffff00000822a094>] try_to_free_pages+0x124/0x234
> [  800.593150] [<ffff000008216aa0>] __alloc_pages_nodemask+0x564/0xf7c
> [  800.599412] [<ffff000008292814>] khugepaged_alloc_page+0x38/0xb8
> [  800.605411] [<ffff0000082933bc>] collapse_huge_page+0x74/0xd70
> [  800.611238] [<ffff00000829470c>] khugepaged_scan_mm_slot+0x654/0xa98
> [  800.617585] [<ffff000008294e0c>] khugepaged+0x2bc/0x49c
> [  800.622803] [<ffff0000080ffb70>] kthread+0x124/0x150
> [  800.627762] [<ffff0000080849f0>] ret_from_fork+0x10/0x1c
> [  800.633066] ---[ end trace 944c130b5252fb01 ]---
> 
> Now, the ksm wants to mark *a page* as referenced via page_referenced_one(),
> passing it an unaligned address. This could eventually turn out to be
> one of :
> 
> ptep_clear_flush_young_notify(address, address + PAGE_SIZE)
> 
> or
> 
> pmdp_clear_flush_young_notify(address, address + PMD_SIZE)
> 
> which now spans two pages/pmds and the notifier consumer might
> take an action on the second page as well, which is not something
> intended. So, I do think that old behavior is wrong and has other
> side effects as mentioned above.
> 
> [0] https://lkml.kernel.org/r/1525244911-5519-1-git-send-email-hejianet@gmail.com
> 
> Suzuki
Jia He May 24, 2018, 8:50 a.m. UTC | #7
Hi Suzuki

On 5/24/2018 4:44 PM, Suzuki K Poulose Wrote:
> On 14/05/18 10:45, Suzuki K Poulose wrote:
>> On 10/05/18 00:31, Andrew Morton wrote:
>>> On Fri,  4 May 2018 11:11:46 +0800 Jia He <hejianet@gmail.com> wrote:
>>>
>>>> In our armv8a server(QDF2400), I noticed lots of WARN_ON caused by PAGE_SIZE
>>>> unaligned for rmap_item->address under memory pressure tests(start 20 guests
>>>> and run memhog in the host).
>>>>
>>>> ...
>>>>
>>>> In rmap_walk_ksm, the rmap_item->address might still have the STABLE_FLAG,
>>>> then the start and end in handle_hva_to_gpa might not be PAGE_SIZE aligned.
>>>> Thus it will cause exceptions in handle_hva_to_gpa on arm64.
>>>>
>>>> This patch fixes it by ignoring(not removing) the low bits of address when
>>>> doing rmap_walk_ksm.
>>>>
>>>> Signed-off-by: jia.he@hxt-semitech.com
>>>
>>> I assumed you wanted this patch to be committed as
>>> From:jia.he@hxt-semitech.com rather than From:hejianet@gmail.com, so I
>>> made that change.  Please let me know if this was inappropriate.
>>>
>>> You can do this yourself by adding an explicit From: line to the very
>>> start of the patch's email text.
>>>
>>> Also, a storm of WARN_ONs is pretty poor behaviour.  Is that the only
>>> misbehaviour which this bug causes?  Do you think the fix should be
>>> backported into earlier kernels?
>>>
> 
> 
> Jia, Andrew,
> 
> What is the status of this patch ?
> 
> Suzuki
I thought the patch is merged into mmotm tree.
http://www.ozlabs.org/~akpm/mmotm/series
But I don't know what is the next step.

Cheers,
Jia
Suzuki K Poulose May 24, 2018, 9:01 a.m. UTC | #8
On 24/05/18 09:50, Jia He wrote:
> Hi Suzuki
> 
> On 5/24/2018 4:44 PM, Suzuki K Poulose Wrote:
>> On 14/05/18 10:45, Suzuki K Poulose wrote:
>>> On 10/05/18 00:31, Andrew Morton wrote:
>>>> On Fri,  4 May 2018 11:11:46 +0800 Jia He <hejianet@gmail.com> wrote:
>>>>
>>>>> In our armv8a server(QDF2400), I noticed lots of WARN_ON caused by PAGE_SIZE
>>>>> unaligned for rmap_item->address under memory pressure tests(start 20 guests
>>>>> and run memhog in the host).
>>>>>
>>>>> ...
>>>>>
>>>>> In rmap_walk_ksm, the rmap_item->address might still have the STABLE_FLAG,
>>>>> then the start and end in handle_hva_to_gpa might not be PAGE_SIZE aligned.
>>>>> Thus it will cause exceptions in handle_hva_to_gpa on arm64.
>>>>>
>>>>> This patch fixes it by ignoring(not removing) the low bits of address when
>>>>> doing rmap_walk_ksm.
>>>>>
>>>>> Signed-off-by: jia.he@hxt-semitech.com
>>>>
>>>> I assumed you wanted this patch to be committed as
>>>> From:jia.he@hxt-semitech.com rather than From:hejianet@gmail.com, so I
>>>> made that change.  Please let me know if this was inappropriate.
>>>>
>>>> You can do this yourself by adding an explicit From: line to the very
>>>> start of the patch's email text.
>>>>
>>>> Also, a storm of WARN_ONs is pretty poor behaviour.  Is that the only
>>>> misbehaviour which this bug causes?  Do you think the fix should be
>>>> backported into earlier kernels?
>>>>
>>
>>
>> Jia, Andrew,
>>
>> What is the status of this patch ?
>>
>> Suzuki
> I thought the patch is merged into mmotm tree.
> http://www.ozlabs.org/~akpm/mmotm/series
> But I don't know what is the next step.

Hi Jia,

Thanks for the update. I think that should eventually hit mainline. When it does,
please could you send the patch to stable kernel versions too ?

Usually having a "Cc: stable@vger.kernel.org" in the original patch (for
critical fixes) would have done the trick. But since we don't have it,
please send it following the stable kernel rules.

Cheers
Suzuki
Jia He May 24, 2018, 9:36 a.m. UTC | #9
On 5/24/2018 5:01 PM, Suzuki K Poulose Wrote:
> On 24/05/18 09:50, Jia He wrote:
>> Hi Suzuki
>>
>> On 5/24/2018 4:44 PM, Suzuki K Poulose Wrote:
>>> On 14/05/18 10:45, Suzuki K Poulose wrote:
>>>> On 10/05/18 00:31, Andrew Morton wrote:
>>>>> On Fri,  4 May 2018 11:11:46 +0800 Jia He <hejianet@gmail.com> wrote:
>>>>>
>>>>>> In our armv8a server(QDF2400), I noticed lots of WARN_ON caused by PAGE_SIZE
>>>>>> unaligned for rmap_item->address under memory pressure tests(start 20 guests
>>>>>> and run memhog in the host).
>>>>>>
>>>>>> ...
>>>>>>
>>>>>> In rmap_walk_ksm, the rmap_item->address might still have the STABLE_FLAG,
>>>>>> then the start and end in handle_hva_to_gpa might not be PAGE_SIZE aligned.
>>>>>> Thus it will cause exceptions in handle_hva_to_gpa on arm64.
>>>>>>
>>>>>> This patch fixes it by ignoring(not removing) the low bits of address when
>>>>>> doing rmap_walk_ksm.
>>>>>>
>>>>>> Signed-off-by: jia.he@hxt-semitech.com
>>>>>
>>>>> I assumed you wanted this patch to be committed as
>>>>> From:jia.he@hxt-semitech.com rather than From:hejianet@gmail.com, so I
>>>>> made that change.  Please let me know if this was inappropriate.
>>>>>
>>>>> You can do this yourself by adding an explicit From: line to the very
>>>>> start of the patch's email text.
>>>>>
>>>>> Also, a storm of WARN_ONs is pretty poor behaviour.  Is that the only
>>>>> misbehaviour which this bug causes?  Do you think the fix should be
>>>>> backported into earlier kernels?
>>>>>
>>>
>>>
>>> Jia, Andrew,
>>>
>>> What is the status of this patch ?
>>>
>>> Suzuki
>> I thought the patch is merged into mmotm tree.
>> http://www.ozlabs.org/~akpm/mmotm/series
>> But I don't know what is the next step.
> 
> Hi Jia,
> 
> Thanks for the update. I think that should eventually hit mainline. When it does,
> please could you send the patch to stable kernel versions too ?
> 
> Usually having a "Cc: stable@vger.kernel.org" in the original patch (for
> critical fixes) would have done the trick. But since we don't have it,
> please send it following the stable kernel rules.
> 
> Cheers
> Suzuki
> 
Ok, thanks for pointing that out.
Andrew Morton May 24, 2018, 8:38 p.m. UTC | #10
On Thu, 24 May 2018 09:44:16 +0100 Suzuki K Poulose <Suzuki.Poulose@arm.com> wrote:

> On 14/05/18 10:45, Suzuki K Poulose wrote:
> > On 10/05/18 00:31, Andrew Morton wrote:
> >> On Fri,  4 May 2018 11:11:46 +0800 Jia He <hejianet@gmail.com> wrote:
> >>
> >>> In our armv8a server(QDF2400), I noticed lots of WARN_ON caused by PAGE_SIZE
> >>> unaligned for rmap_item->address under memory pressure tests(start 20 guests
> >>> and run memhog in the host).
> >>>
> >>> ...
> >>>
> >>> In rmap_walk_ksm, the rmap_item->address might still have the STABLE_FLAG,
> >>> then the start and end in handle_hva_to_gpa might not be PAGE_SIZE aligned.
> >>> Thus it will cause exceptions in handle_hva_to_gpa on arm64.
> >>>
> >>> This patch fixes it by ignoring(not removing) the low bits of address when
> >>> doing rmap_walk_ksm.
> >>>
> >>> Signed-off-by: jia.he@hxt-semitech.com
> >>
> >> I assumed you wanted this patch to be committed as
> >> From:jia.he@hxt-semitech.com rather than From:hejianet@gmail.com, so I
> >> made that change.  Please let me know if this was inappropriate.
> >>
> >> You can do this yourself by adding an explicit From: line to the very
> >> start of the patch's email text.
> >>
> >> Also, a storm of WARN_ONs is pretty poor behaviour.  Is that the only
> >> misbehaviour which this bug causes?  Do you think the fix should be
> >> backported into earlier kernels?
> >>
> 
> 
> Jia, Andrew,
> 
> What is the status of this patch ?
> 

I have it scheduled for 4.18-rc1, with a cc:stable for backporting.

I'd normally put such a fix into 4.17-rcX but I'd like to give Hugh
time to review it and to generally give it a bit more time for review
and test.

Have you tested it yourself?
diff mbox

Patch

diff --git a/mm/ksm.c b/mm/ksm.c
index e3cbf9a..e6a9640 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -199,6 +199,8 @@  struct rmap_item {
 #define SEQNR_MASK	0x0ff	/* low bits of unstable tree seqnr */
 #define UNSTABLE_FLAG	0x100	/* is a node of the unstable tree */
 #define STABLE_FLAG	0x200	/* is listed from the stable tree */
+#define KSM_FLAG_MASK	(SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG)
+				/* to mask all the flags */
 
 /* The stable and unstable tree heads */
 static struct rb_root one_stable_tree[1] = { RB_ROOT };
@@ -2570,10 +2572,15 @@  void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
 		anon_vma_lock_read(anon_vma);
 		anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
 					       0, ULONG_MAX) {
+			unsigned long addr;
+
 			cond_resched();
 			vma = vmac->vma;
-			if (rmap_item->address < vma->vm_start ||
-			    rmap_item->address >= vma->vm_end)
+
+			/* Ignore the stable/unstable/seqnr flags */
+			addr = rmap_item->address & ~KSM_FLAG_MASK;
+
+			if (addr < vma->vm_start || addr >= vma->vm_end)
 				continue;
 			/*
 			 * Initially we examine only the vma which covers this
@@ -2587,8 +2594,7 @@  void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
 			if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
 				continue;
 
-			if (!rwc->rmap_one(page, vma,
-					rmap_item->address, rwc->arg)) {
+			if (!rwc->rmap_one(page, vma, addr, rwc->arg)) {
 				anon_vma_unlock_read(anon_vma);
 				return;
 			}