
[5/6] mm/page_alloc: Protect PCP lists with a spinlock

Message ID 20220509130805.20335-6-mgorman@techsingularity.net (mailing list archive)
State New
Series Drain remote per-cpu directly v2

Commit Message

Mel Gorman May 9, 2022, 1:08 p.m. UTC
Currently the PCP lists are protected by using local_lock_irqsave to
prevent migration and IRQ reentrancy, but this is inconvenient. Remote
draining of the lists is impossible, a workqueue is required instead, and
every task allocation/free must disable then enable interrupts, which is
expensive.

As preparation for dealing with both of those problems, protect the
lists with a spinlock. The IRQ-unsafe version of the lock is used
because IRQs are already disabled by local_lock_irqsave. spin_trylock
is used in preparation for a time when local_lock could be used instead
of local_lock_irqsave.

struct per_cpu_pages still fits within the same number of cache lines
after this patch as it did before the series.

struct per_cpu_pages {
        spinlock_t                 lock;                 /*     0     4 */
        int                        count;                /*     4     4 */
        int                        high;                 /*     8     4 */
        int                        batch;                /*    12     4 */
        short int                  free_factor;          /*    16     2 */
        short int                  expire;               /*    18     2 */

        /* XXX 4 bytes hole, try to pack */

        struct list_head           lists[13];            /*    24   208 */

        /* size: 256, cachelines: 4, members: 7 */
        /* sum members: 228, holes: 1, sum holes: 4 */
        /* padding: 24 */
} __attribute__((__aligned__(64)));

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
 include/linux/mmzone.h |   1 +
 mm/page_alloc.c        | 169 ++++++++++++++++++++++++++++++++++++-----
 2 files changed, 149 insertions(+), 21 deletions(-)
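
For reference, this is a condensed sketch of the free-path pattern the
patch introduces, taken from the diff below with the high watermark
handling and statistics trimmed; all names are from the patch itself. If
the trylock fails, the caller frees the page directly to the buddy
allocator instead:

/* Sketch only -- see the full diff below for the real implementation. */
static bool free_unref_page_commit(struct page *page, int migratetype,
				   unsigned int order, bool locked)
{
	struct per_cpu_pages *pcp;
	unsigned long __maybe_unused UP_flags;

	pcp = this_cpu_ptr(page_zone(page)->per_cpu_pageset);

	if (!locked) {
		/*
		 * IRQs are already disabled by local_lock_irqsave so the
		 * IRQ-unsafe trylock is sufficient. On UP, pcp_trylock_prepare
		 * disables IRQs because spin_trylock always succeeds there.
		 */
		pcp_trylock_prepare(UP_flags);
		if (!spin_trylock(&pcp->lock)) {
			pcp_trylock_finish(UP_flags);
			return false;	/* caller falls back to free_one_page */
		}
	}

	list_add(&page->pcp_list,
		 &pcp->lists[order_to_pindex(migratetype, order)]);
	pcp->count += 1 << order;

	if (!locked) {
		spin_unlock(&pcp->lock);
		pcp_trylock_finish(UP_flags);
	}

	return true;
}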

Comments

Hugh Dickins May 22, 2022, 2:49 a.m. UTC | #1
On Mon, 9 May 2022, Mel Gorman wrote:

> Currently the PCP lists are protected by using local_lock_irqsave to
> prevent migration and IRQ reentrancy, but this is inconvenient. Remote
> draining of the lists is impossible, a workqueue is required instead, and
> every task allocation/free must disable then enable interrupts, which is
> expensive.
> 
> As preparation for dealing with both of those problems, protect the
> lists with a spinlock. The IRQ-unsafe version of the lock is used
> because IRQs are already disabled by local_lock_irqsave. spin_trylock
> is used in preparation for a time when local_lock could be used instead
> of local_lock_irqsave.

8c580f60a145 ("mm/page_alloc: protect PCP lists with a spinlock")
in next-20220520: I haven't looked up whether that comes from a
stable or unstable suburb of akpm's tree.

Mel, the VM_BUG_ON(in_hardirq()) which this adds to free_unref_page_list() 
is not valid.  I have no appreciation of how important it is to the whole
scheme, but as it stands, it crashes; and when I change it to a warning

--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3475,7 +3475,7 @@ void free_unref_page_list(struct list_he
 	if (list_empty(list))
 		return;
 
-	VM_BUG_ON(in_hardirq());
+	WARN_ON_ONCE(in_hardirq());
 
 	local_lock_irqsave(&pagesets.lock, flags);
 
then everything *appears* to go on working correctly after the splat
below (from which you will infer that I'm swapping to nvme):

[  256.167040] WARNING: CPU: 0 PID: 9842 at mm/page_alloc.c:3478 free_unref_page_list+0x92/0x343
[  256.170031] CPU: 0 PID: 9842 Comm: cc1 Not tainted 5.18.0-rc7-n20 #3
[  256.171285] Hardware name: LENOVO 20HQS0EG02/20HQS0EG02, BIOS N1MET54W (1.39 ) 04/16/2019
[  256.172555] RIP: 0010:free_unref_page_list+0x92/0x343
[  256.173820] Code: ff ff 49 8b 44 24 08 4d 89 e0 4c 8d 60 f8 eb b6 48 8b 03 48 39 c3 0f 84 af 02 00 00 65 8b 05 72 7f df 7e a9 00 00 0f 00 74 02 <0f> 0b 9c 41 5d fa 41 0f ba e5 09 73 05 e8 1f 0a f9 ff e8 46 90 7b
[  256.175289] RSP: 0018:ffff88803ec07c80 EFLAGS: 00010006
[  256.176683] RAX: 0000000080010000 RBX: ffff88803ec07cf8 RCX: 000000000000002c
[  256.178122] RDX: 0000000000000000 RSI: ffff88803ec29d28 RDI: 0000000000000040
[  256.179580] RBP: ffff88803ec07cc0 R08: ffff88803ec07cf0 R09: 00000000000a401d
[  256.181031] R10: 0000000000000000 R11: ffff8880101891b8 R12: ffff88803f6dd600
[  256.182501] R13: ffff88803ec07cf8 R14: 000000000000000f R15: 0000000000000000
[  256.183957] FS:  00007ffff7fcfac0(0000) GS:ffff88803ec00000(0000) knlGS:0000000000000000
[  256.185419] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  256.186911] CR2: 0000555555710cdc CR3: 00000000240b4004 CR4: 00000000003706f0
[  256.188395] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  256.189888] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  256.191390] Call Trace:
[  256.192844]  <IRQ>
[  256.194253]  ? __mem_cgroup_uncharge_list+0x4e/0x57
[  256.195715]  release_pages+0x26f/0x27e
[  256.197150]  ? list_add_tail+0x39/0x39
[  256.198603]  pagevec_lru_move_fn+0x95/0xa4
[  256.200065]  folio_rotate_reclaimable+0xa0/0xd1
[  256.201545]  folio_end_writeback+0x1c/0x78
[  256.203064]  end_page_writeback+0x11/0x13
[  256.204495]  end_swap_bio_write+0x87/0x95
[  256.205967]  bio_endio+0x15e/0x162
[  256.207393]  blk_mq_end_request_batch+0xd2/0x18d
[  256.208932]  ? __this_cpu_preempt_check+0x13/0x15
[  256.210378]  ? lock_is_held_type+0xcf/0x10f
[  256.211714]  ? lock_is_held+0xc/0xe
[  256.213065]  ? rcu_read_lock_sched_held+0x24/0x4f
[  256.214450]  nvme_pci_complete_batch+0x4c/0x51
[  256.215811]  nvme_irq+0x43/0x4e
[  256.217173]  ? nvme_unmap_data+0xb5/0xb5
[  256.218633]  __handle_irq_event_percpu+0xff/0x235
[  256.220062]  handle_irq_event_percpu+0x10/0x39
[  256.221584]  handle_irq_event+0x34/0x53
[  256.223061]  handle_edge_irq+0xb1/0xd5
[  256.224486]  __common_interrupt+0x7a/0xe6
[  256.225918]  common_interrupt+0x9c/0xca
[  256.227330]  </IRQ>
[  256.228763]  <TASK>
[  256.230226]  asm_common_interrupt+0x2c/0x40
[  256.231724] RIP: 0010:lock_acquire.part.0+0x1a9/0x1b4
[  256.233190] Code: df ec 7e ff c8 74 19 0f 0b 48 c7 c7 77 2c 4e 82 e8 d7 e7 88 00 65 c7 05 01 df ec 7e 00 00 00 00 48 85 db 74 01 fb 48 8d 65 d8 <5b> 41 5c 41 5d 41 5e 41 5f 5d c3 55 48 89 e5 41 57 4d 89 cf 41 56
[  256.234804] RSP: 0018:ffff888010e939f0 EFLAGS: 00000206
[  256.236339] RAX: 0000000000000000 RBX: 0000000000000200 RCX: 3e4406066e4f4abc
[  256.237889] RDX: 0000000000000000 RSI: ffffffff824021e7 RDI: ffffffff8244b3d1
[  256.239401] RBP: ffff888010e93a18 R08: 0000000000000028 R09: 000000000002001d
[  256.240994] R10: 0000000000000000 R11: ffff8880101891b8 R12: 0000000000000002
[  256.242545] R13: ffffffff82757a80 R14: ffffffff81253954 R15: 0000000000000000
[  256.244085]  ? __folio_memcg_unlock+0x48/0x48
[  256.245597]  lock_acquire+0xfa/0x10a
[  256.247179]  ? __folio_memcg_unlock+0x48/0x48
[  256.248727]  rcu_lock_acquire.constprop.0+0x24/0x27
[  256.250293]  ? __folio_memcg_unlock+0x48/0x48
[  256.251736]  mem_cgroup_iter+0x3d/0x178
[  256.253245]  shrink_node_memcgs+0x169/0x182
[  256.254822]  shrink_node+0x220/0x3d9
[  256.256372]  shrink_zones+0x10f/0x1ca
[  256.257923]  ? __this_cpu_preempt_check+0x13/0x15
[  256.259437]  do_try_to_free_pages+0x7a/0x192
[  256.260947]  try_to_free_mem_cgroup_pages+0x14b/0x213
[  256.262405]  try_charge_memcg+0x230/0x433
[  256.263865]  try_charge+0x12/0x17
[  256.265236]  charge_memcg+0x25/0x7c
[  256.266615]  __mem_cgroup_charge+0x28/0x3d
[  256.267962]  mem_cgroup_charge.constprop.0+0x1d/0x1f
[  256.269290]  do_anonymous_page+0x118/0x20c
[  256.270712]  handle_pte_fault+0x151/0x15f
[  256.272015]  __handle_mm_fault+0x39d/0x3ac
[  256.273198]  handle_mm_fault+0xc2/0x188
[  256.274371]  do_user_addr_fault+0x240/0x39d
[  256.275595]  exc_page_fault+0x1e1/0x204
[  256.276787]  asm_exc_page_fault+0x2c/0x40
[  256.277906] RIP: 0033:0xd67250
[  256.279017] Code: 02 f6 ff 49 89 c0 48 83 fb 08 73 1e f6 c3 04 75 39 48 85 db 74 2d c6 00 00 f6 c3 02 74 25 31 c0 66 41 89 44 18 fe eb 1b 66 90 <48> c7 44 18 f8 00 00 00 00 48 8d 4b ff 31 c0 4c 89 c7 48 c1 e9 03
[  256.280355] RSP: 002b:00007fffffffc360 EFLAGS: 00010206
[  256.281678] RAX: 00007ffff5972000 RBX: 00000000000000a8 RCX: 000000000000000c
[  256.282955] RDX: 0000000000000006 RSI: 0000000000000017 RDI: 0000000000000986
[  256.284264] RBP: 0000000000000002 R08: 00007ffff5972000 R09: 0000000000000987
[  256.285554] R10: 0000000000000001 R11: 0000000001000001 R12: 00007ffff5969c80
[  256.286930] R13: 0000000000000015 R14: 0000000000000000 R15: 0000000001eb4760
[  256.288300]  </TASK>
[  256.289533] irq event stamp: 95044
[  256.290776] hardirqs last  enabled at (95043): [<ffffffff819ddf43>] irqentry_exit+0x67/0x75
[  256.292147] hardirqs last disabled at (95044): [<ffffffff819db09c>] common_interrupt+0x1a/0xca
[  256.293479] softirqs last  enabled at (94982): [<ffffffff81c0036f>] __do_softirq+0x36f/0x3aa
[  256.294754] softirqs last disabled at (94977): [<ffffffff81105c01>] __irq_exit_rcu+0x85/0xc1
Mel Gorman May 24, 2022, 12:12 p.m. UTC | #2
On Sat, May 21, 2022 at 07:49:10PM -0700, Hugh Dickins wrote:
> On Mon, 9 May 2022, Mel Gorman wrote:
> 
> > Currently the PCP lists are protected by using local_lock_irqsave to
> > prevent migration and IRQ reentrancy, but this is inconvenient. Remote
> > draining of the lists is impossible, a workqueue is required instead, and
> > every task allocation/free must disable then enable interrupts, which is
> > expensive.
> > 
> > As preparation for dealing with both of those problems, protect the
> > lists with a spinlock. The IRQ-unsafe version of the lock is used
> > because IRQs are already disabled by local_lock_irqsave. spin_trylock
> > is used in preparation for a time when local_lock could be used instead
> > of local_lock_irqsave.
> 
> 8c580f60a145 ("mm/page_alloc: protect PCP lists with a spinlock")
> in next-20220520: I haven't looked up whether that comes from a
> stable or unstable suburb of akpm's tree.
> 
> Mel, the VM_BUG_ON(in_hardirq()) which this adds to free_unref_page_list() 
> is not valid.  I have no appreciation of how important it is to the whole
> scheme, but as it stands, it crashes; and when I change it to a warning
> 

Thanks Hugh. Sorry for the delay in responding, I was offline for a few
days. The case where free_unref_page_list is called from IRQ context is
safe, so the VM_BUG_ON can be removed.

--8<--
mm/page_alloc: Protect PCP lists with a spinlock -fix

Hugh Dickins reported the following problem:

[  256.167040] WARNING: CPU: 0 PID: 9842 at mm/page_alloc.c:3478 free_unref_page_list+0x92/0x343
[  256.170031] CPU: 0 PID: 9842 Comm: cc1 Not tainted 5.18.0-rc7-n20 #3
[  256.171285] Hardware name: LENOVO 20HQS0EG02/20HQS0EG02, BIOS N1MET54W (1.39 ) 04/16/2019
[  256.172555] RIP: 0010:free_unref_page_list+0x92/0x343
[  256.173820] Code: ff ff 49 8b 44 24 08 4d 89 e0 4c 8d 60 f8 eb b6 48 8b 03 48 39 c3 0f 84 af 02 00 00 65 8b 05 72 7f df 7e a9 00 00 0f 00 74 02 <0f> 0b 9c 41 5d fa 41 0f ba e5 09 73 05 e8 1f 0a f9 ff e8 46 90 7b
[  256.175289] RSP: 0018:ffff88803ec07c80 EFLAGS: 00010006
[  256.176683] RAX: 0000000080010000 RBX: ffff88803ec07cf8 RCX: 000000000000002c
[  256.178122] RDX: 0000000000000000 RSI: ffff88803ec29d28 RDI: 0000000000000040
[  256.179580] RBP: ffff88803ec07cc0 R08: ffff88803ec07cf0 R09: 00000000000a401d
[  256.181031] R10: 0000000000000000 R11: ffff8880101891b8 R12: ffff88803f6dd600
[  256.182501] R13: ffff88803ec07cf8 R14: 000000000000000f R15: 0000000000000000
[  256.183957] FS:  00007ffff7fcfac0(0000) GS:ffff88803ec00000(0000) knlGS:0000000000000000
[  256.185419] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  256.186911] CR2: 0000555555710cdc CR3: 00000000240b4004 CR4: 00000000003706f0
[  256.188395] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  256.189888] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  256.191390] Call Trace:
[  256.192844]  <IRQ>
[  256.194253]  ? __mem_cgroup_uncharge_list+0x4e/0x57
[  256.195715]  release_pages+0x26f/0x27e
[  256.197150]  ? list_add_tail+0x39/0x39
[  256.198603]  pagevec_lru_move_fn+0x95/0xa4

The VM_BUG_ON was added in preparation for a time when the PCP lock would
be IRQ-unsafe. The fundamental limitation is that free_unref_page_list()
cannot be called with the PCP lock held as a normal spinlock while an IRQ
is delivered. At the moment this is impossible and, even if the PCP lock
were IRQ-unsafe, free_unref_page_list is not called from page allocator
context in an unsafe manner. Remove the VM_BUG_ON.
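
To illustrate the reasoning, a sketch (not part of the fix, names as in
the patch): every holder of pcp->lock on a given CPU has IRQs disabled
while holding it, so an interrupt can never preempt a holder on its own
CPU, and the single-page trylock path does not even rely on that:

/*
 * Task context on CPU N:
 *	local_lock_irqsave(&pagesets.lock, flags);	<- IRQs off on CPU N
 *	spin_lock(&pcp->lock);				<- cannot be interrupted
 *	...
 *	spin_unlock(&pcp->lock);
 *	local_unlock_irqrestore(&pagesets.lock, flags);
 *
 * A free arriving in hard-IRQ context (nvme completion -> release_pages
 * -> free_unref_page_list) therefore never interrupts a task on the same
 * CPU that holds pcp->lock. The single-page path additionally bails out
 * if the lock is contended:
 */
	if (!spin_trylock(&pcp->lock)) {
		pcp_trylock_finish(UP_flags);
		return false;	/* free_unref_page() then uses free_one_page() */
	}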

This is a fix for the mmotm patch
mm-page_alloc-protect-pcp-lists-with-a-spinlock.patch

Reported-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
 mm/page_alloc.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0d169aeeac6f..4c1e2a773e47 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3522,8 +3522,6 @@ void free_unref_page_list(struct list_head *list)
 	if (list_empty(list))
 		return;
 
-	VM_BUG_ON(in_hardirq());
-
 	page = lru_to_page(list);
 	locked_zone = page_zone(page);
 	pcp = pcp_spin_lock(locked_zone->per_cpu_pageset);
Mel Gorman May 24, 2022, 12:19 p.m. UTC | #3
On Tue, May 24, 2022 at 01:12:24PM +0100, Mel Gorman wrote:
> On Sat, May 21, 2022 at 07:49:10PM -0700, Hugh Dickins wrote:
> > On Mon, 9 May 2022, Mel Gorman wrote:
> > 
> > > Currently the PCP lists are protected by using local_lock_irqsave to
> > > prevent migration and IRQ reentrancy, but this is inconvenient. Remote
> > > draining of the lists is impossible, a workqueue is required instead, and
> > > every task allocation/free must disable then enable interrupts, which is
> > > expensive.
> > > 
> > > As preparation for dealing with both of those problems, protect the
> > > lists with a spinlock. The IRQ-unsafe version of the lock is used
> > > because IRQs are already disabled by local_lock_irqsave. spin_trylock
> > > is used in preparation for a time when local_lock could be used instead
> > > of local_lock_irqsave.
> > 
> > 8c580f60a145 ("mm/page_alloc: protect PCP lists with a spinlock")
> > in next-20220520: I haven't looked up whether that comes from a
> > stable or unstable suburb of akpm's tree.
> > 
> > Mel, the VM_BUG_ON(in_hardirq()) which this adds to free_unref_page_list() 
> > is not valid.  I have no appreciation of how important it is to the whole
> > scheme, but as it stands, it crashes; and when I change it to a warning
> > 
> 
> Thanks Hugh. Sorry for the delay in responding, I was offline for a few
> days. The case where free_unref_page_list is called from IRQ context is
> safe, so the VM_BUG_ON can be removed.
> 

This version has a more appropriate baseline; it'll cause a collision
later in the series but it's trivially resolved.

--8<--
mm/page_alloc: Protect PCP lists with a spinlock -fix

Hugh Dickins reported the following problem:

[  256.167040] WARNING: CPU: 0 PID: 9842 at mm/page_alloc.c:3478 free_unref_page_list+0x92/0x343
[  256.170031] CPU: 0 PID: 9842 Comm: cc1 Not tainted 5.18.0-rc7-n20 #3
[  256.171285] Hardware name: LENOVO 20HQS0EG02/20HQS0EG02, BIOS N1MET54W (1.39 ) 04/16/2019
[  256.172555] RIP: 0010:free_unref_page_list+0x92/0x343
[  256.173820] Code: ff ff 49 8b 44 24 08 4d 89 e0 4c 8d 60 f8 eb b6 48 8b 03 48 39 c3 0f 84 af 02 00 00 65 8b 05 72 7f df 7e a9 00 00 0f 00 74 02 <0f> 0b 9c 41 5d fa 41 0f ba e5 09 73 05 e8 1f 0a f9 ff e8 46 90 7b
[  256.175289] RSP: 0018:ffff88803ec07c80 EFLAGS: 00010006
[  256.176683] RAX: 0000000080010000 RBX: ffff88803ec07cf8 RCX: 000000000000002c
[  256.178122] RDX: 0000000000000000 RSI: ffff88803ec29d28 RDI: 0000000000000040
[  256.179580] RBP: ffff88803ec07cc0 R08: ffff88803ec07cf0 R09: 00000000000a401d
[  256.181031] R10: 0000000000000000 R11: ffff8880101891b8 R12: ffff88803f6dd600
[  256.182501] R13: ffff88803ec07cf8 R14: 000000000000000f R15: 0000000000000000
[  256.183957] FS:  00007ffff7fcfac0(0000) GS:ffff88803ec00000(0000) knlGS:0000000000000000
[  256.185419] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  256.186911] CR2: 0000555555710cdc CR3: 00000000240b4004 CR4: 00000000003706f0
[  256.188395] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  256.189888] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  256.191390] Call Trace:
[  256.192844]  <IRQ>
[  256.194253]  ? __mem_cgroup_uncharge_list+0x4e/0x57
[  256.195715]  release_pages+0x26f/0x27e
[  256.197150]  ? list_add_tail+0x39/0x39
[  256.198603]  pagevec_lru_move_fn+0x95/0xa4

The VM_BUG_ON was added in preparation for a time when the PCP lock would
be IRQ-unsafe. The fundamental limitation is that free_unref_page_list()
cannot be called with the PCP lock held as a normal spinlock while an IRQ
is delivered. At the moment this is impossible and, even if the PCP lock
were IRQ-unsafe, free_unref_page_list is not called from page allocator
context in an unsafe manner. Remove the VM_BUG_ON.

This is a fix to the mmotm patch
mm-page_alloc-protect-pcp-lists-with-a-spinlock.patch

Reported-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e3a6aa97ad7a..52e7fe681483 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3535,8 +3535,6 @@ void free_unref_page_list(struct list_head *list)
 	if (list_empty(list))
 		return;
 
-	VM_BUG_ON(in_hardirq());
-
 	local_lock_irqsave(&pagesets.lock, flags);
 
 	page = lru_to_page(list);

Patch

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index abe530748de6..8b5757735428 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -385,6 +385,7 @@  enum zone_watermarks {
 
 /* Fields and list protected by pagesets local_lock in page_alloc.c */
 struct per_cpu_pages {
+	spinlock_t lock;	/* Protects lists field */
 	int count;		/* number of pages in the list */
 	int high;		/* high watermark, emptying needed */
 	int batch;		/* chunk size for buddy add/remove */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dc0fdeb3795c..7be21254087c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -132,6 +132,20 @@  static DEFINE_PER_CPU(struct pagesets, pagesets) __maybe_unused = {
 	.lock = INIT_LOCAL_LOCK(lock),
 };
 
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
+/*
+ * On SMP, spin_trylock is sufficient protection.
+ * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
+ */
+#define pcp_trylock_prepare(flags)	do { } while (0)
+#define pcp_trylock_finish(flag)	do { } while (0)
+#else
+
+/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
+#define pcp_trylock_prepare(flags)	local_irq_save(flags)
+#define pcp_trylock_finish(flags)	local_irq_restore(flags)
+#endif
+
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DEFINE_PER_CPU(int, numa_node);
 EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -3082,15 +3096,22 @@  static int rmqueue_bulk(struct zone *zone, unsigned int order,
  */
 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 {
-	unsigned long flags;
 	int to_drain, batch;
 
-	local_lock_irqsave(&pagesets.lock, flags);
 	batch = READ_ONCE(pcp->batch);
 	to_drain = min(pcp->count, batch);
-	if (to_drain > 0)
+	if (to_drain > 0) {
+		unsigned long flags;
+
+		/*
+		 * free_pcppages_bulk expects IRQs disabled for zone->lock
+		 * so even though pcp->lock is not intended to be IRQ-safe,
+		 * it's needed in this context.
+		 */
+		spin_lock_irqsave(&pcp->lock, flags);
 		free_pcppages_bulk(zone, to_drain, pcp, 0);
-	local_unlock_irqrestore(&pagesets.lock, flags);
+		spin_unlock_irqrestore(&pcp->lock, flags);
+	}
 }
 #endif
 
@@ -3103,16 +3124,17 @@  void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
  */
 static void drain_pages_zone(unsigned int cpu, struct zone *zone)
 {
-	unsigned long flags;
 	struct per_cpu_pages *pcp;
 
-	local_lock_irqsave(&pagesets.lock, flags);
-
 	pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
-	if (pcp->count)
-		free_pcppages_bulk(zone, pcp->count, pcp, 0);
+	if (pcp->count) {
+		unsigned long flags;
 
-	local_unlock_irqrestore(&pagesets.lock, flags);
+		/* See drain_zone_pages on why this is disabling IRQs */
+		spin_lock_irqsave(&pcp->lock, flags);
+		free_pcppages_bulk(zone, pcp->count, pcp, 0);
+		spin_unlock_irqrestore(&pcp->lock, flags);
+	}
 }
 
 /*
@@ -3380,18 +3402,30 @@  static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
 	return min(READ_ONCE(pcp->batch) << 2, high);
 }
 
-static void free_unref_page_commit(struct page *page, int migratetype,
-				   unsigned int order)
+/* Returns true if the page was committed to the per-cpu list. */
+static bool free_unref_page_commit(struct page *page, int migratetype,
+				   unsigned int order, bool locked)
 {
 	struct zone *zone = page_zone(page);
 	struct per_cpu_pages *pcp;
 	int high;
 	int pindex;
 	bool free_high;
+	unsigned long __maybe_unused UP_flags;
 
 	__count_vm_event(PGFREE);
 	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	pindex = order_to_pindex(migratetype, order);
+
+	if (!locked) {
+		/* Protect against a parallel drain. */
+		pcp_trylock_prepare(UP_flags);
+		if (!spin_trylock(&pcp->lock)) {
+			pcp_trylock_finish(UP_flags);
+			return false;
+		}
+	}
+
 	list_add(&page->pcp_list, &pcp->lists[pindex]);
 	pcp->count += 1 << order;
 
@@ -3409,6 +3443,13 @@  static void free_unref_page_commit(struct page *page, int migratetype,
 
 		free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, pindex);
 	}
+
+	if (!locked) {
+		spin_unlock(&pcp->lock);
+		pcp_trylock_finish(UP_flags);
+	}
+
+	return true;
 }
 
 /*
@@ -3419,6 +3460,7 @@  void free_unref_page(struct page *page, unsigned int order)
 	unsigned long flags;
 	unsigned long pfn = page_to_pfn(page);
 	int migratetype;
+	bool freed_pcp = false;
 
 	if (!free_unref_page_prepare(page, pfn, order))
 		return;
@@ -3440,8 +3482,11 @@  void free_unref_page(struct page *page, unsigned int order)
 	}
 
 	local_lock_irqsave(&pagesets.lock, flags);
-	free_unref_page_commit(page, migratetype, order);
+	freed_pcp = free_unref_page_commit(page, migratetype, order, false);
 	local_unlock_irqrestore(&pagesets.lock, flags);
+
+	if (unlikely(!freed_pcp))
+		free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE);
 }
 
 /*
@@ -3450,10 +3495,19 @@  void free_unref_page(struct page *page, unsigned int order)
 void free_unref_page_list(struct list_head *list)
 {
 	struct page *page, *next;
+	struct per_cpu_pages *pcp;
+	struct zone *locked_zone;
 	unsigned long flags;
 	int batch_count = 0;
 	int migratetype;
 
+	/*
+	 * An empty list is possible. Check early so that the later
+	 * lru_to_page() does not potentially read garbage.
+	 */
+	if (list_empty(list))
+		return;
+
 	/* Prepare pages for freeing */
 	list_for_each_entry_safe(page, next, list, lru) {
 		unsigned long pfn = page_to_pfn(page);
@@ -3474,8 +3528,33 @@  void free_unref_page_list(struct list_head *list)
 		}
 	}
 
+	/*
+	 * Preparation could have drained the list due to failing to prepare
+	 * or all pages are being isolated.
+	 */
+	if (list_empty(list))
+		return;
+
+	VM_BUG_ON(in_hardirq());
+
 	local_lock_irqsave(&pagesets.lock, flags);
+
+	page = lru_to_page(list);
+	locked_zone = page_zone(page);
+	pcp = this_cpu_ptr(locked_zone->per_cpu_pageset);
+	spin_lock(&pcp->lock);
+
 	list_for_each_entry_safe(page, next, list, lru) {
+		struct zone *zone = page_zone(page);
+
+		/* Different zone, different pcp lock. */
+		if (zone != locked_zone) {
+			spin_unlock(&pcp->lock);
+			locked_zone = zone;
+			pcp = this_cpu_ptr(zone->per_cpu_pageset);
+			spin_lock(&pcp->lock);
+		}
+
 		/*
 		 * Non-isolated types over MIGRATE_PCPTYPES get added
 		 * to the MIGRATE_MOVABLE pcp list.
@@ -3485,18 +3564,33 @@  void free_unref_page_list(struct list_head *list)
 			migratetype = MIGRATE_MOVABLE;
 
 		trace_mm_page_free_batched(page);
-		free_unref_page_commit(page, migratetype, 0);
+
+		/*
+		 * If there is a parallel drain in progress, free to the buddy
+		 * allocator directly. This is expensive as the zone lock will
+		 * be acquired multiple times but if a drain is in progress
+		 * then an expensive operation is already taking place.
+		 *
+		 * TODO: Always false at the moment due to local_lock_irqsave
+		 *       and is preparation for converting to local_lock.
+		 */
+		if (unlikely(!free_unref_page_commit(page, migratetype, 0, true)))
+			free_one_page(page_zone(page), page, page_to_pfn(page), 0, migratetype, FPI_NONE);
 
 		/*
 		 * Guard against excessive IRQ disabled times when we get
 		 * a large list of pages to free.
 		 */
 		if (++batch_count == SWAP_CLUSTER_MAX) {
+			spin_unlock(&pcp->lock);
 			local_unlock_irqrestore(&pagesets.lock, flags);
 			batch_count = 0;
 			local_lock_irqsave(&pagesets.lock, flags);
+			pcp = this_cpu_ptr(locked_zone->per_cpu_pageset);
+			spin_lock(&pcp->lock);
 		}
 	}
+	spin_unlock(&pcp->lock);
 	local_unlock_irqrestore(&pagesets.lock, flags);
 }
 
@@ -3668,9 +3762,28 @@  struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
 			int migratetype,
 			unsigned int alloc_flags,
 			struct per_cpu_pages *pcp,
-			struct list_head *list)
+			struct list_head *list,
+			bool locked)
 {
 	struct page *page;
+	unsigned long __maybe_unused UP_flags;
+
+	/*
+	 * spin_trylock is not necessary right now due to
+	 * local_lock_irqsave and is a preparation step for
+	 * a conversion to local_lock using the trylock to prevent
+	 * IRQ re-entrancy. If pcp->lock cannot be acquired, the caller
+	 * uses rmqueue_buddy.
+	 *
+	 * TODO: Convert local_lock_irqsave to local_lock.
+	 */
+	if (unlikely(!locked)) {
+		pcp_trylock_prepare(UP_flags);
+		if (!spin_trylock(&pcp->lock)) {
+			pcp_trylock_finish(UP_flags);
+			return NULL;
+		}
+	}
 
 	do {
 		if (list_empty(list)) {
@@ -3691,8 +3804,10 @@  struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
 					migratetype, alloc_flags);
 
 			pcp->count += alloced << order;
-			if (unlikely(list_empty(list)))
-				return NULL;
+			if (unlikely(list_empty(list))) {
+				page = NULL;
+				goto out;
+			}
 		}
 
 		page = list_first_entry(list, struct page, lru);
@@ -3700,6 +3815,12 @@  struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
 		pcp->count -= 1 << order;
 	} while (check_new_pcp(page, order));
 
+out:
+	if (!locked) {
+		spin_unlock(&pcp->lock);
+		pcp_trylock_finish(UP_flags);
+	}
+
 	return page;
 }
 
@@ -3724,7 +3845,7 @@  static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	pcp->free_factor >>= 1;
 	list = &pcp->lists[order_to_pindex(migratetype, order)];
-	page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
+	page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list, false);
 	local_unlock_irqrestore(&pagesets.lock, flags);
 	if (page) {
 		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
@@ -3759,7 +3880,8 @@  struct page *rmqueue(struct zone *preferred_zone,
 				migratetype != MIGRATE_MOVABLE) {
 			page = rmqueue_pcplist(preferred_zone, zone, order,
 					gfp_flags, migratetype, alloc_flags);
-			goto out;
+			if (likely(page))
+				goto out;
 		}
 	}
 
@@ -5326,6 +5448,7 @@  unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 	local_lock_irqsave(&pagesets.lock, flags);
 	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
+	spin_lock(&pcp->lock);
 
 	while (nr_populated < nr_pages) {
 
@@ -5336,11 +5459,13 @@  unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 		}
 
 		page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags,
-								pcp, pcp_list);
+							pcp, pcp_list, true);
 		if (unlikely(!page)) {
 			/* Try and get at least one page */
-			if (!nr_populated)
+			if (!nr_populated) {
+				spin_unlock(&pcp->lock);
 				goto failed_irq;
+			}
 			break;
 		}
 		nr_account++;
@@ -5353,6 +5478,7 @@  unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
 		nr_populated++;
 	}
 
+	spin_unlock(&pcp->lock);
 	local_unlock_irqrestore(&pagesets.lock, flags);
 
 	__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
@@ -6992,6 +7118,7 @@  static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
 	memset(pcp, 0, sizeof(*pcp));
 	memset(pzstats, 0, sizeof(*pzstats));
 
+	spin_lock_init(&pcp->lock);
 	for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
 		INIT_LIST_HEAD(&pcp->lists[pindex]);