Message ID | 20220509130805.20335-6-mgorman@techsingularity.net
---|---
State | New
Series | Drain remote per-cpu directly v2
On Mon, 9 May 2022, Mel Gorman wrote:

> Currently the PCP lists are protected by using local_lock_irqsave to
> prevent migration and IRQ reentrancy but this is inconvenient. Remote
> draining of the lists is impossible and a workqueue is required and
> every task allocation/free must disable then enable interrupts which is
> expensive.
>
> As preparation for dealing with both of those problems, protect the
> lists with a spinlock. The IRQ-unsafe version of the lock is used
> because IRQs are already disabled by local_lock_irqsave. spin_trylock
> is used in preparation for a time when local_lock could be used instead
> of lock_lock_irqsave.

8c580f60a145 ("mm/page_alloc: protect PCP lists with a spinlock")
in next-20220520: I haven't looked up whether that comes from a
stable or unstable suburb of akpm's tree.

Mel, the VM_BUG_ON(in_hardirq()) which this adds to free_unref_page_list()
is not valid. I have no appreciation of how important it is to the whole
scheme, but as it stands, it crashes; and when I change it to a warning

--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3475,7 +3475,7 @@ void free_unref_page_list(struct list_he
 	if (list_empty(list))
 		return;
 
-	VM_BUG_ON(in_hardirq());
+	WARN_ON_ONCE(in_hardirq());
 
 	local_lock_irqsave(&pagesets.lock, flags);

then everything *appears* to go on working correctly after the splat below
(from which you will infer that I'm swapping to nvme):

[ 256.167040] WARNING: CPU: 0 PID: 9842 at mm/page_alloc.c:3478 free_unref_page_list+0x92/0x343
[ 256.170031] CPU: 0 PID: 9842 Comm: cc1 Not tainted 5.18.0-rc7-n20 #3
[ 256.171285] Hardware name: LENOVO 20HQS0EG02/20HQS0EG02, BIOS N1MET54W (1.39 ) 04/16/2019
[ 256.172555] RIP: 0010:free_unref_page_list+0x92/0x343
[ 256.173820] Code: ff ff 49 8b 44 24 08 4d 89 e0 4c 8d 60 f8 eb b6 48 8b 03 48 39 c3 0f 84 af 02 00 00 65 8b 05 72 7f df 7e a9 00 00 0f 00 74 02 <0f> 0b 9c 41 5d fa 41 0f ba e5 09 73 05 e8 1f 0a f9 ff e8 46 90 7b
[ 256.175289] RSP: 0018:ffff88803ec07c80 EFLAGS: 00010006
[ 256.176683] RAX: 0000000080010000 RBX: ffff88803ec07cf8 RCX: 000000000000002c
[ 256.178122] RDX: 0000000000000000 RSI: ffff88803ec29d28 RDI: 0000000000000040
[ 256.179580] RBP: ffff88803ec07cc0 R08: ffff88803ec07cf0 R09: 00000000000a401d
[ 256.181031] R10: 0000000000000000 R11: ffff8880101891b8 R12: ffff88803f6dd600
[ 256.182501] R13: ffff88803ec07cf8 R14: 000000000000000f R15: 0000000000000000
[ 256.183957] FS: 00007ffff7fcfac0(0000) GS:ffff88803ec00000(0000) knlGS:0000000000000000
[ 256.185419] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 256.186911] CR2: 0000555555710cdc CR3: 00000000240b4004 CR4: 00000000003706f0
[ 256.188395] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 256.189888] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 256.191390] Call Trace:
[ 256.192844]  <IRQ>
[ 256.194253]  ? __mem_cgroup_uncharge_list+0x4e/0x57
[ 256.195715]  release_pages+0x26f/0x27e
[ 256.197150]  ? list_add_tail+0x39/0x39
[ 256.198603]  pagevec_lru_move_fn+0x95/0xa4
[ 256.200065]  folio_rotate_reclaimable+0xa0/0xd1
[ 256.201545]  folio_end_writeback+0x1c/0x78
[ 256.203064]  end_page_writeback+0x11/0x13
[ 256.204495]  end_swap_bio_write+0x87/0x95
[ 256.205967]  bio_endio+0x15e/0x162
[ 256.207393]  blk_mq_end_request_batch+0xd2/0x18d
[ 256.208932]  ? __this_cpu_preempt_check+0x13/0x15
[ 256.210378]  ? lock_is_held_type+0xcf/0x10f
[ 256.211714]  ? lock_is_held+0xc/0xe
[ 256.213065]  ? rcu_read_lock_sched_held+0x24/0x4f
[ 256.214450]  nvme_pci_complete_batch+0x4c/0x51
[ 256.215811]  nvme_irq+0x43/0x4e
[ 256.217173]  ? nvme_unmap_data+0xb5/0xb5
[ 256.218633]  __handle_irq_event_percpu+0xff/0x235
[ 256.220062]  handle_irq_event_percpu+0x10/0x39
[ 256.221584]  handle_irq_event+0x34/0x53
[ 256.223061]  handle_edge_irq+0xb1/0xd5
[ 256.224486]  __common_interrupt+0x7a/0xe6
[ 256.225918]  common_interrupt+0x9c/0xca
[ 256.227330]  </IRQ>
[ 256.228763]  <TASK>
[ 256.230226]  asm_common_interrupt+0x2c/0x40
[ 256.231724] RIP: 0010:lock_acquire.part.0+0x1a9/0x1b4
[ 256.233190] Code: df ec 7e ff c8 74 19 0f 0b 48 c7 c7 77 2c 4e 82 e8 d7 e7 88 00 65 c7 05 01 df ec 7e 00 00 00 00 48 85 db 74 01 fb 48 8d 65 d8 <5b> 41 5c 41 5d 41 5e 41 5f 5d c3 55 48 89 e5 41 57 4d 89 cf 41 56
[ 256.234804] RSP: 0018:ffff888010e939f0 EFLAGS: 00000206
[ 256.236339] RAX: 0000000000000000 RBX: 0000000000000200 RCX: 3e4406066e4f4abc
[ 256.237889] RDX: 0000000000000000 RSI: ffffffff824021e7 RDI: ffffffff8244b3d1
[ 256.239401] RBP: ffff888010e93a18 R08: 0000000000000028 R09: 000000000002001d
[ 256.240994] R10: 0000000000000000 R11: ffff8880101891b8 R12: 0000000000000002
[ 256.242545] R13: ffffffff82757a80 R14: ffffffff81253954 R15: 0000000000000000
[ 256.244085]  ? __folio_memcg_unlock+0x48/0x48
[ 256.245597]  lock_acquire+0xfa/0x10a
[ 256.247179]  ? __folio_memcg_unlock+0x48/0x48
[ 256.248727]  rcu_lock_acquire.constprop.0+0x24/0x27
[ 256.250293]  ? __folio_memcg_unlock+0x48/0x48
[ 256.251736]  mem_cgroup_iter+0x3d/0x178
[ 256.253245]  shrink_node_memcgs+0x169/0x182
[ 256.254822]  shrink_node+0x220/0x3d9
[ 256.256372]  shrink_zones+0x10f/0x1ca
[ 256.257923]  ? __this_cpu_preempt_check+0x13/0x15
[ 256.259437]  do_try_to_free_pages+0x7a/0x192
[ 256.260947]  try_to_free_mem_cgroup_pages+0x14b/0x213
[ 256.262405]  try_charge_memcg+0x230/0x433
[ 256.263865]  try_charge+0x12/0x17
[ 256.265236]  charge_memcg+0x25/0x7c
[ 256.266615]  __mem_cgroup_charge+0x28/0x3d
[ 256.267962]  mem_cgroup_charge.constprop.0+0x1d/0x1f
[ 256.269290]  do_anonymous_page+0x118/0x20c
[ 256.270712]  handle_pte_fault+0x151/0x15f
[ 256.272015]  __handle_mm_fault+0x39d/0x3ac
[ 256.273198]  handle_mm_fault+0xc2/0x188
[ 256.274371]  do_user_addr_fault+0x240/0x39d
[ 256.275595]  exc_page_fault+0x1e1/0x204
[ 256.276787]  asm_exc_page_fault+0x2c/0x40
[ 256.277906] RIP: 0033:0xd67250
[ 256.279017] Code: 02 f6 ff 49 89 c0 48 83 fb 08 73 1e f6 c3 04 75 39 48 85 db 74 2d c6 00 00 f6 c3 02 74 25 31 c0 66 41 89 44 18 fe eb 1b 66 90 <48> c7 44 18 f8 00 00 00 00 48 8d 4b ff 31 c0 4c 89 c7 48 c1 e9 03
[ 256.280355] RSP: 002b:00007fffffffc360 EFLAGS: 00010206
[ 256.281678] RAX: 00007ffff5972000 RBX: 00000000000000a8 RCX: 000000000000000c
[ 256.282955] RDX: 0000000000000006 RSI: 0000000000000017 RDI: 0000000000000986
[ 256.284264] RBP: 0000000000000002 R08: 00007ffff5972000 R09: 0000000000000987
[ 256.285554] R10: 0000000000000001 R11: 0000000001000001 R12: 00007ffff5969c80
[ 256.286930] R13: 0000000000000015 R14: 0000000000000000 R15: 0000000001eb4760
[ 256.288300]  </TASK>
[ 256.289533] irq event stamp: 95044
[ 256.290776] hardirqs last enabled at (95043): [<ffffffff819ddf43>] irqentry_exit+0x67/0x75
[ 256.292147] hardirqs last disabled at (95044): [<ffffffff819db09c>] common_interrupt+0x1a/0xca
[ 256.293479] softirqs last enabled at (94982): [<ffffffff81c0036f>] __do_softirq+0x36f/0x3aa
[ 256.294754] softirqs last disabled at (94977): [<ffffffff81105c01>] __irq_exit_rcu+0x85/0xc1
On Sat, May 21, 2022 at 07:49:10PM -0700, Hugh Dickins wrote:
> On Mon, 9 May 2022, Mel Gorman wrote:
>
> > Currently the PCP lists are protected by using local_lock_irqsave to
> > prevent migration and IRQ reentrancy but this is inconvenient. Remote
> > draining of the lists is impossible and a workqueue is required and
> > every task allocation/free must disable then enable interrupts which is
> > expensive.
> >
> > As preparation for dealing with both of those problems, protect the
> > lists with a spinlock. The IRQ-unsafe version of the lock is used
> > because IRQs are already disabled by local_lock_irqsave. spin_trylock
> > is used in preparation for a time when local_lock could be used instead
> > of lock_lock_irqsave.
>
> 8c580f60a145 ("mm/page_alloc: protect PCP lists with a spinlock")
> in next-20220520: I haven't looked up whether that comes from a
> stable or unstable suburb of akpm's tree.
>
> Mel, the VM_BUG_ON(in_hardirq()) which this adds to free_unref_page_list()
> is not valid. I have no appreciation of how important it is to the whole
> scheme, but as it stands, it crashes; and when I change it to a warning
>

Thanks Hugh. Sorry for the delay in responding, I was offline for a few
days. The context where free_unref_page_list is called from IRQ context
is safe and the VM_BUG_ON can be removed.

--8<--
mm/page_alloc: Protect PCP lists with a spinlock -fix

Hugh Dickins reported the following problem:

[ 256.167040] WARNING: CPU: 0 PID: 9842 at mm/page_alloc.c:3478 free_unref_page_list+0x92/0x343
[ 256.170031] CPU: 0 PID: 9842 Comm: cc1 Not tainted 5.18.0-rc7-n20 #3
[ 256.171285] Hardware name: LENOVO 20HQS0EG02/20HQS0EG02, BIOS N1MET54W (1.39 ) 04/16/2019
[ 256.172555] RIP: 0010:free_unref_page_list+0x92/0x343
[ 256.173820] Code: ff ff 49 8b 44 24 08 4d 89 e0 4c 8d 60 f8 eb b6 48 8b 03 48 39 c3 0f 84 af 02 00 00 65 8b 05 72 7f df 7e a9 00 00 0f 00 74 02 <0f> 0b 9c 41 5d fa 41 0f ba e5 09 73 05 e8 1f 0a f9 ff e8 46 90 7b
[ 256.175289] RSP: 0018:ffff88803ec07c80 EFLAGS: 00010006
[ 256.176683] RAX: 0000000080010000 RBX: ffff88803ec07cf8 RCX: 000000000000002c
[ 256.178122] RDX: 0000000000000000 RSI: ffff88803ec29d28 RDI: 0000000000000040
[ 256.179580] RBP: ffff88803ec07cc0 R08: ffff88803ec07cf0 R09: 00000000000a401d
[ 256.181031] R10: 0000000000000000 R11: ffff8880101891b8 R12: ffff88803f6dd600
[ 256.182501] R13: ffff88803ec07cf8 R14: 000000000000000f R15: 0000000000000000
[ 256.183957] FS: 00007ffff7fcfac0(0000) GS:ffff88803ec00000(0000) knlGS:0000000000000000
[ 256.185419] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 256.186911] CR2: 0000555555710cdc CR3: 00000000240b4004 CR4: 00000000003706f0
[ 256.188395] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 256.189888] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 256.191390] Call Trace:
[ 256.192844]  <IRQ>
[ 256.194253]  ? __mem_cgroup_uncharge_list+0x4e/0x57
[ 256.195715]  release_pages+0x26f/0x27e
[ 256.197150]  ? list_add_tail+0x39/0x39
[ 256.198603]  pagevec_lru_move_fn+0x95/0xa4

The VM_BUG_ON was added in preparation for a time when the PCP was an
IRQ-unsafe lock. The fundamental limitation is that free_unref_page_list()
cannot be called with the PCP lock held as a normal spinlock when an IRQ
is delivered. At the moment this is impossible, and even if the PCP were
an IRQ-unsafe lock, free_unref_page_list is not called from page
allocator context in an unsafe manner. Remove the VM_BUG_ON.

This is a fix for the mmotm patch
mm-page_alloc-protect-pcp-lists-with-a-spinlock.patch

Reported-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
 mm/page_alloc.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0d169aeeac6f..4c1e2a773e47 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3522,8 +3522,6 @@ void free_unref_page_list(struct list_head *list)
 	if (list_empty(list))
 		return;
 
-	VM_BUG_ON(in_hardirq());
-
 	page = lru_to_page(list);
 	locked_zone = page_zone(page);
 	pcp = pcp_spin_lock(locked_zone->per_cpu_pageset);
On Tue, May 24, 2022 at 01:12:24PM +0100, Mel Gorman wrote:
> On Sat, May 21, 2022 at 07:49:10PM -0700, Hugh Dickins wrote:
> > On Mon, 9 May 2022, Mel Gorman wrote:
> >
> > > Currently the PCP lists are protected by using local_lock_irqsave to
> > > prevent migration and IRQ reentrancy but this is inconvenient. Remote
> > > draining of the lists is impossible and a workqueue is required and
> > > every task allocation/free must disable then enable interrupts which is
> > > expensive.
> > >
> > > As preparation for dealing with both of those problems, protect the
> > > lists with a spinlock. The IRQ-unsafe version of the lock is used
> > > because IRQs are already disabled by local_lock_irqsave. spin_trylock
> > > is used in preparation for a time when local_lock could be used instead
> > > of lock_lock_irqsave.
> >
> > 8c580f60a145 ("mm/page_alloc: protect PCP lists with a spinlock")
> > in next-20220520: I haven't looked up whether that comes from a
> > stable or unstable suburb of akpm's tree.
> >
> > Mel, the VM_BUG_ON(in_hardirq()) which this adds to free_unref_page_list()
> > is not valid. I have no appreciation of how important it is to the whole
> > scheme, but as it stands, it crashes; and when I change it to a warning
> >
>
> Thanks Hugh. Sorry for the delay in responding, I was offline for a few
> days. The context where free_unref_page_list is called from IRQ context
> is safe and the VM_BUG_ON can be removed.
>

This version has a more appropriate baseline; it will cause a collision
later in the series but it's trivially resolved.

--8<--
mm/page_alloc: Protect PCP lists with a spinlock -fix

Hugh Dickins reported the following problem:

[ 256.167040] WARNING: CPU: 0 PID: 9842 at mm/page_alloc.c:3478 free_unref_page_list+0x92/0x343
[ 256.170031] CPU: 0 PID: 9842 Comm: cc1 Not tainted 5.18.0-rc7-n20 #3
[ 256.171285] Hardware name: LENOVO 20HQS0EG02/20HQS0EG02, BIOS N1MET54W (1.39 ) 04/16/2019
[ 256.172555] RIP: 0010:free_unref_page_list+0x92/0x343
[ 256.173820] Code: ff ff 49 8b 44 24 08 4d 89 e0 4c 8d 60 f8 eb b6 48 8b 03 48 39 c3 0f 84 af 02 00 00 65 8b 05 72 7f df 7e a9 00 00 0f 00 74 02 <0f> 0b 9c 41 5d fa 41 0f ba e5 09 73 05 e8 1f 0a f9 ff e8 46 90 7b
[ 256.175289] RSP: 0018:ffff88803ec07c80 EFLAGS: 00010006
[ 256.176683] RAX: 0000000080010000 RBX: ffff88803ec07cf8 RCX: 000000000000002c
[ 256.178122] RDX: 0000000000000000 RSI: ffff88803ec29d28 RDI: 0000000000000040
[ 256.179580] RBP: ffff88803ec07cc0 R08: ffff88803ec07cf0 R09: 00000000000a401d
[ 256.181031] R10: 0000000000000000 R11: ffff8880101891b8 R12: ffff88803f6dd600
[ 256.182501] R13: ffff88803ec07cf8 R14: 000000000000000f R15: 0000000000000000
[ 256.183957] FS: 00007ffff7fcfac0(0000) GS:ffff88803ec00000(0000) knlGS:0000000000000000
[ 256.185419] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 256.186911] CR2: 0000555555710cdc CR3: 00000000240b4004 CR4: 00000000003706f0
[ 256.188395] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 256.189888] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 256.191390] Call Trace:
[ 256.192844]  <IRQ>
[ 256.194253]  ? __mem_cgroup_uncharge_list+0x4e/0x57
[ 256.195715]  release_pages+0x26f/0x27e
[ 256.197150]  ? list_add_tail+0x39/0x39
[ 256.198603]  pagevec_lru_move_fn+0x95/0xa4

The VM_BUG_ON was added in preparation for a time when the PCP was an
IRQ-unsafe lock. The fundamental limitation is that free_unref_page_list()
cannot be called with the PCP lock held when an IRQ is delivered. At the
moment this is impossible, and even if the PCP were an IRQ-unsafe lock,
free_unref_page_list is not called from page allocator context in an
unsafe manner. Remove the VM_BUG_ON.

This is a fix to the mmotm patch
mm-page_alloc-protect-pcp-lists-with-a-spinlock.patch

Reported-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e3a6aa97ad7a..52e7fe681483 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3535,8 +3535,6 @@ void free_unref_page_list(struct list_head *list)
 	if (list_empty(list))
 		return;
 
-	VM_BUG_ON(in_hardirq());
-
 	local_lock_irqsave(&pagesets.lock, flags);
 
 	page = lru_to_page(list);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index abe530748de6..8b5757735428 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -385,6 +385,7 @@ enum zone_watermarks {
 
 /* Fields and list protected by pagesets local_lock in page_alloc.c */
 struct per_cpu_pages {
+	spinlock_t lock;	/* Protects lists field */
 	int count;		/* number of pages in the list */
 	int high;		/* high watermark, emptying needed */
 	int batch;		/* chunk size for buddy add/remove */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dc0fdeb3795c..7be21254087c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -132,6 +132,20 @@ static DEFINE_PER_CPU(struct pagesets, pagesets) __maybe_unused = {
 	.lock = INIT_LOCAL_LOCK(lock),
 };
 
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
+/*
+ * On SMP, spin_trylock is sufficient protection.
+ * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
+ */
+#define pcp_trylock_prepare(flags)	do { } while (0)
+#define pcp_trylock_finish(flag)	do { } while (0)
+#else
+
+/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
+#define pcp_trylock_prepare(flags)	local_irq_save(flags)
+#define pcp_trylock_finish(flags)	local_irq_restore(flags)
+#endif
+
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DEFINE_PER_CPU(int, numa_node);
 EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -3082,15 +3096,22 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
  */
 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 {
-	unsigned long flags;
 	int to_drain, batch;
 
-	local_lock_irqsave(&pagesets.lock, flags);
 	batch = READ_ONCE(pcp->batch);
 	to_drain = min(pcp->count, batch);
-	if (to_drain > 0)
+	if (to_drain > 0) {
+		unsigned long flags;
+
+		/*
+		 * free_pcppages_bulk expects IRQs disabled for zone->lock
+		 * so even though pcp->lock is not intended to be IRQ-safe,
+		 * it's needed in this context.
+		 */
+		spin_lock_irqsave(&pcp->lock, flags);
 		free_pcppages_bulk(zone, to_drain, pcp, 0);
-	local_unlock_irqrestore(&pagesets.lock, flags);
+		spin_unlock_irqrestore(&pcp->lock, flags);
+	}
 }
 #endif
 
@@ -3103,16 +3124,17 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
  */
 static void drain_pages_zone(unsigned int cpu, struct zone *zone)
 {
-	unsigned long flags;
 	struct per_cpu_pages *pcp;
 
-	local_lock_irqsave(&pagesets.lock, flags);
-
 	pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
-	if (pcp->count)
-		free_pcppages_bulk(zone, pcp->count, pcp, 0);
+	if (pcp->count) {
+		unsigned long flags;
 
-	local_unlock_irqrestore(&pagesets.lock, flags);
+		/* See drain_zone_pages on why this is disabling IRQs */
+		spin_lock_irqsave(&pcp->lock, flags);
+		free_pcppages_bulk(zone, pcp->count, pcp, 0);
+		spin_unlock_irqrestore(&pcp->lock, flags);
+	}
 }
 
 /*
@@ -3380,18 +3402,30 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
 	return min(READ_ONCE(pcp->batch) << 2, high);
 }
 
-static void free_unref_page_commit(struct page *page, int migratetype,
-				   unsigned int order)
+/* Returns true if the page was committed to the per-cpu list. */
+static bool free_unref_page_commit(struct page *page, int migratetype,
+				   unsigned int order, bool locked)
 {
 	struct zone *zone = page_zone(page);
 	struct per_cpu_pages *pcp;
 	int high;
 	int pindex;
 	bool free_high;
+	unsigned long __maybe_unused UP_flags;
 
 	__count_vm_event(PGFREE);
 	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	pindex = order_to_pindex(migratetype, order);
+
+	if (!locked) {
+		/* Protect against a parallel drain. */
+		pcp_trylock_prepare(UP_flags);
+		if (!spin_trylock(&pcp->lock)) {
+			pcp_trylock_finish(UP_flags);
+			return false;
+		}
+	}
+
 	list_add(&page->pcp_list, &pcp->lists[pindex]);
 	pcp->count += 1 << order;
@@ -3409,6 +3443,13 @@ static void free_unref_page_commit(struct page *page, int migratetype,
 		free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high),
 				   pcp, pindex);
 	}
+
+	if (!locked) {
+		spin_unlock(&pcp->lock);
+		pcp_trylock_finish(UP_flags);
+	}
+
+	return true;
 }
 
 /*
@@ -3419,6 +3460,7 @@ void free_unref_page(struct page *page, unsigned int order)
 	unsigned long flags;
 	unsigned long pfn = page_to_pfn(page);
 	int migratetype;
+	bool freed_pcp = false;
 
 	if (!free_unref_page_prepare(page, pfn, order))
 		return;
@@ -3440,8 +3482,11 @@ void free_unref_page(struct page *page, unsigned int order)
 	}
 
 	local_lock_irqsave(&pagesets.lock, flags);
-	free_unref_page_commit(page, migratetype, order);
+	freed_pcp = free_unref_page_commit(page, migratetype, order, false);
 	local_unlock_irqrestore(&pagesets.lock, flags);
+
+	if (unlikely(!freed_pcp))
+		free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE);
 }
 
 /*
@@ -3450,10 +3495,19 @@ void free_unref_page(struct page *page, unsigned int order)
 void free_unref_page_list(struct list_head *list)
 {
 	struct page *page, *next;
+	struct per_cpu_pages *pcp;
+	struct zone *locked_zone;
 	unsigned long flags;
 	int batch_count = 0;
 	int migratetype;
 
+	/*
+	 * An empty list is possible. Check early so that the later
+	 * lru_to_page() does not potentially read garbage.
+	 */
+	if (list_empty(list))
+		return;
+
 	/* Prepare pages for freeing */
 	list_for_each_entry_safe(page, next, list, lru) {
 		unsigned long pfn = page_to_pfn(page);
@@ -3474,8 +3528,33 @@ void free_unref_page_list(struct list_head *list)
 		}
 	}
 
+	/*
+	 * Preparation could have drained the list due to failing to prepare
+	 * or all pages are being isolated.
+	 */
+	if (list_empty(list))
+		return;
+
+	VM_BUG_ON(in_hardirq());
+
 	local_lock_irqsave(&pagesets.lock, flags);
+
+	page = lru_to_page(list);
+	locked_zone = page_zone(page);
+	pcp = this_cpu_ptr(locked_zone->per_cpu_pageset);
+	spin_lock(&pcp->lock);
+
 	list_for_each_entry_safe(page, next, list, lru) {
+		struct zone *zone = page_zone(page);
+
+		/* Different zone, different pcp lock. */
+		if (zone != locked_zone) {
+			spin_unlock(&pcp->lock);
+			locked_zone = zone;
+			pcp = this_cpu_ptr(zone->per_cpu_pageset);
+			spin_lock(&pcp->lock);
+		}
+
 		/*
 		 * Non-isolated types over MIGRATE_PCPTYPES get added
 		 * to the MIGRATE_MOVABLE pcp list.
@@ -3485,18 +3564,33 @@ void free_unref_page_list(struct list_head *list)
 			migratetype = MIGRATE_MOVABLE;
 
 		trace_mm_page_free_batched(page);
-		free_unref_page_commit(page, migratetype, 0);
+
+		/*
+		 * If there is a parallel drain in progress, free to the buddy
+		 * allocator directly. This is expensive as the zone lock will
+		 * be acquired multiple times but if a drain is in progress
+		 * then an expensive operation is already taking place.
+		 *
+		 * TODO: Always false at the moment due to local_lock_irqsave
+		 *       and is preparation for converting to local_lock.
+		 */
+		if (unlikely(!free_unref_page_commit(page, migratetype, 0, true)))
+			free_one_page(page_zone(page), page, page_to_pfn(page), 0, migratetype, FPI_NONE);
 
 		/*
 		 * Guard against excessive IRQ disabled times when we get
 		 * a large list of pages to free.
 		 */
		if (++batch_count == SWAP_CLUSTER_MAX) {
+			spin_unlock(&pcp->lock);
 			local_unlock_irqrestore(&pagesets.lock, flags);
 			batch_count = 0;
 			local_lock_irqsave(&pagesets.lock, flags);
+			pcp = this_cpu_ptr(locked_zone->per_cpu_pageset);
+			spin_lock(&pcp->lock);
 		}
 	}
+
+	spin_unlock(&pcp->lock);
 	local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
@@ -3668,9 +3762,28 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
 			int migratetype,
 			unsigned int alloc_flags,
 			struct per_cpu_pages *pcp,
-			struct list_head *list)
+			struct list_head *list,
+			bool locked)
 {
 	struct page *page;
+	unsigned long __maybe_unused UP_flags;
+
+	/*
+	 * spin_trylock is not necessary right now due to
+	 * local_lock_irqsave and is a preparation step for
+	 * a conversion to local_lock using the trylock to prevent
+	 * IRQ re-entrancy. If pcp->lock cannot be acquired, the caller
+	 * uses rmqueue_buddy.
+	 *
+	 * TODO: Convert local_lock_irqsave to local_lock.
+	 */
+	if (unlikely(!locked)) {
+		pcp_trylock_prepare(UP_flags);
+		if (!spin_trylock(&pcp->lock)) {
+			pcp_trylock_finish(UP_flags);
+			return NULL;
+		}
+	}
 
 	do {
 		if (list_empty(list)) {
@@ -3691,8 +3804,10 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
 					migratetype, alloc_flags);
 
 			pcp->count += alloced << order;
-			if (unlikely(list_empty(list)))
-				return NULL;
+			if (unlikely(list_empty(list))) {
+				page = NULL;
+				goto out;
+			}
 		}
 
 		page = list_first_entry(list, struct page, lru);
@@ -3700,6 +3815,12 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
 		pcp->count -= 1 << order;
 	} while (check_new_pcp(page, order));
 
+out:
+	if (!locked) {
+		spin_unlock(&pcp->lock);
+		pcp_trylock_finish(UP_flags);
+	}
+
 	return page;
 }
 
@@ -3724,7 +3845,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	pcp->free_factor >>= 1;
 	list = &pcp->lists[order_to_pindex(migratetype, order)];
-	page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
+	page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list, false);
 	local_unlock_irqrestore(&pagesets.lock, flags);
 	if (page) {
 		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
@@ -3759,7 +3880,8 @@ struct page *rmqueue(struct zone *preferred_zone,
 				migratetype != MIGRATE_MOVABLE) {
 			page = rmqueue_pcplist(preferred_zone, zone, order,
 					gfp_flags, migratetype, alloc_flags);
-			goto out;
+			if (likely(page))
+				goto out;
 		}
 	}
 
@@ -5326,6 +5448,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 	local_lock_irqsave(&pagesets.lock, flags);
 	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
+	spin_lock(&pcp->lock);
 
 	while (nr_populated < nr_pages) {
 
@@ -5336,11 +5459,13 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 		}
 
 		page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags,
-								pcp, pcp_list);
+								pcp, pcp_list, true);
 		if (unlikely(!page)) {
 			/* Try and get at least one page */
-			if (!nr_populated)
+			if (!nr_populated) {
+				spin_unlock(&pcp->lock);
 				goto failed_irq;
+			}
 			break;
 		}
 		nr_account++;
@@ -5353,6 +5478,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 		nr_populated++;
 	}
 
+	spin_unlock(&pcp->lock);
 	local_unlock_irqrestore(&pagesets.lock, flags);
 
 	__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
@@ -6992,6 +7118,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
 	memset(pcp, 0, sizeof(*pcp));
 	memset(pzstats, 0, sizeof(*pzstats));
 
+	spin_lock_init(&pcp->lock);
 	for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
 		INIT_LIST_HEAD(&pcp->lists[pindex]);
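
The core pattern the patch prepares for is: try to take the per-cpu lock, and
if it is held (for example by a remote drain), fall back to the slower path
rather than spinning. Below is a minimal userspace sketch of that idea, not
kernel code: pthread spinlocks stand in for pcp->lock, a plain counter stands
in for the PCP list, and an atomic counter stands in for free_one_page(); the
names drain_thread and free_page_sketch are invented for illustration. Build
with "gcc -O2 -pthread".

/*
 * Userspace analogue of trylock-or-fallback. Assumes POSIX spinlocks;
 * this is a sketch of the pattern, not the page allocator implementation.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_spinlock_t pcp_lock;
static long pcp_frees;		/* "pages" committed to the per-cpu list */
static long buddy_frees;	/* "pages" that fell back to the slow path */

/* Simulated remote drain: hold the lock for a while. */
static void *drain_thread(void *arg)
{
	(void)arg;
	pthread_spin_lock(&pcp_lock);
	usleep(1000);		/* pretend to free a batch of pages */
	pthread_spin_unlock(&pcp_lock);
	return NULL;
}

/* Try the "PCP" lock; if a parallel drain holds it, fall back. */
static void free_page_sketch(void)
{
	if (pthread_spin_trylock(&pcp_lock) == 0) {
		pcp_frees++;	/* fast path: commit to the per-cpu list */
		pthread_spin_unlock(&pcp_lock);
	} else {
		/* slow path: stand-in for freeing directly to the buddy */
		__atomic_add_fetch(&buddy_frees, 1, __ATOMIC_RELAXED);
	}
}

int main(void)
{
	pthread_t drain;

	pthread_spin_init(&pcp_lock, PTHREAD_PROCESS_PRIVATE);
	pthread_create(&drain, NULL, drain_thread, NULL);
	for (int i = 0; i < 1000000; i++)
		free_page_sketch();
	pthread_join(drain, NULL);
	printf("pcp frees: %ld, buddy fallbacks: %ld\n", pcp_frees, buddy_frees);
	pthread_spin_destroy(&pcp_lock);
	return 0;
}

The design point is that a contended free never waits on the lock: it pays
the more expensive direct free instead, which is what makes it safe for a
remote CPU to hold pcp->lock while draining the list later in the series.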
Currently the PCP lists are protected by using local_lock_irqsave to
prevent migration and IRQ reentrancy, but this is inconvenient: remote
draining of the lists is impossible, a workqueue is required, and every
task allocation/free must disable then enable interrupts, which is
expensive.

As preparation for dealing with both of those problems, protect the
lists with a spinlock. The IRQ-unsafe version of the lock is used
because IRQs are already disabled by local_lock_irqsave. spin_trylock
is used in preparation for a time when local_lock could be used instead
of local_lock_irqsave.

struct per_cpu_pages still fits within the same number of cache lines
after this patch relative to before the series:

struct per_cpu_pages {
	spinlock_t                 lock;                 /*     0     4 */
	int                        count;                /*     4     4 */
	int                        high;                 /*     8     4 */
	int                        batch;                /*    12     4 */
	short int                  free_factor;          /*    16     2 */
	short int                  expire;               /*    18     2 */

	/* XXX 4 bytes hole, try to pack */

	struct list_head           lists[13];            /*    24   208 */

	/* size: 256, cachelines: 4, members: 7 */
	/* sum members: 228, holes: 1, sum holes: 4 */
	/* padding: 24 */
} __attribute__((__aligned__(64)));

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
 include/linux/mmzone.h |   1 +
 mm/page_alloc.c        | 169 ++++++++++++++++++++++++++++++++++++-----
 2 files changed, 149 insertions(+), 21 deletions(-)
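
The cache-line claim can be checked against the pahole output above. The new
spinlock_t is absorbed by space that was previously padding, so the structure
stays at 256 bytes, i.e. four 64-byte cache lines. A trivial standalone
program (assuming the x86-64 sizes quoted above; not part of the patch)
reproduces the arithmetic:

#include <stdio.h>

int main(void)
{
	/* Member sizes taken from the pahole output above (x86-64). */
	int lock = 4, count = 4, high = 4, batch = 4;
	int free_factor = 2, expire = 2;
	int lists = 13 * 16;	/* 13 struct list_head, 16 bytes each on 64-bit */
	int hole = 4;		/* alignment hole before lists[] */
	int size = 256;		/* struct is __attribute__((__aligned__(64))) */

	int sum = lock + count + high + batch + free_factor + expire + lists;

	/* Prints: sum members: 228, hole: 4, padding: 24, cachelines: 4 */
	printf("sum members: %d, hole: %d, padding: %d, cachelines: %d\n",
	       sum, hole, size - (sum + hole), size / 64);
	return 0;
}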