[3/6] mm/page_alloc: Split out buddy removal code from rmqueue into separate helper

Message ID 20220512085043.5234-4-mgorman@techsingularity.net (mailing list archive)
State New
Series Drain remote per-cpu directly v3

Commit Message

Mel Gorman May 12, 2022, 8:50 a.m. UTC
This is a preparation patch to allow the buddy removal code to be reused
in a later patch.

No functional change.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Tested-by: Minchan Kim <minchan@kernel.org>
Acked-by: Minchan Kim <minchan@kernel.org>
---
 mm/page_alloc.c | 87 ++++++++++++++++++++++++++++---------------------
 1 file changed, 50 insertions(+), 37 deletions(-)
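
In outline, the resulting rmqueue() fast/slow path split looks like this (a
simplified sketch of the code in the diff at the bottom of this page, not a
verbatim excerpt; the CMA eligibility check and the out-label handling are
omitted):

	/* Simplified sketch of rmqueue() after this patch. */
	struct page *rmqueue(struct zone *preferred_zone, struct zone *zone,
			     unsigned int order, gfp_t gfp_flags,
			     unsigned int alloc_flags, int migratetype)
	{
		struct page *page;

		WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));

		if (likely(pcp_allowed_order(order))) {
			/* Fast path: per-cpu page lists, no zone->lock. */
			page = rmqueue_pcplist(preferred_zone, zone, order,
					       gfp_flags, migratetype,
					       alloc_flags);
			if (page)
				return page;
		}

		/* Slow path: buddy removal now lives in its own helper,
		 * which takes and drops zone->lock internally and returns
		 * NULL on failure. */
		return rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
				     migratetype);
	}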

Comments

Nicolas Saenz Julienne May 13, 2022, 12:01 p.m. UTC | #1
On Thu, 2022-05-12 at 09:50 +0100, Mel Gorman wrote:
> This is a preparation patch to allow the buddy removal code to be
> reused in a later patch.
> 
> No functional change.
> 
> Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
> Tested-by: Minchan Kim <minchan@kernel.org>
> Acked-by: Minchan Kim <minchan@kernel.org>
> ---

Reviewed-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>

Thanks,

--
Nicolás Sáenz
Vlastimil Babka May 19, 2022, 9:52 a.m. UTC | #2
On 5/12/22 10:50, Mel Gorman wrote:
> This is a preparation patch to allow the buddy removal code to be reused
> in a later patch.
> 
> No functional change.
> 
> Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
> Tested-by: Minchan Kim <minchan@kernel.org>
> Acked-by: Minchan Kim <minchan@kernel.org>

Acked-by: Vlastimil Babka <vbabka@suse.cz>
Qais Yousef May 23, 2022, 4:09 p.m. UTC | #3
On 05/12/22 09:50, Mel Gorman wrote:
> This is a preparation patch to allow the buddy removal code to be reused
> in a later patch.
> 
> No functional change.
> 
> Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
> Tested-by: Minchan Kim <minchan@kernel.org>
> Acked-by: Minchan Kim <minchan@kernel.org>
> ---

I see this splat when this patch is applied on a 5.10.107 kernel:

	[  132.779332] CPU: 1 PID: 203 Comm: klogd Not tainted 5.10.107-00039-g83962808e276 #28
	[  132.782470] BUG: using __this_cpu_add_return() in preemptible [00000000] code: udhcpc/229
	[  132.787809] Hardware name: ARM Juno development board (r2) (DT)
	[  132.787841] Call trace:
	[  132.787881]  dump_backtrace+0x0/0x2c0
	[  132.787921]  show_stack+0x18/0x28
	[  132.787963]  dump_stack_lvl+0x108/0x150
	[  132.788003]  dump_stack+0x1c/0x58
	[  132.788049]  check_preemption_disabled+0xf4/0x108
	[  132.788095]  __this_cpu_preempt_check+0x20/0x2c
	[  132.788135]  __inc_numa_state+0x3c/0x120
	[  132.788177]  get_page_from_freelist+0xd6c/0x1ac8
	[  132.788220]  __alloc_pages_nodemask+0x224/0x1780
	[  132.797359] caller is __this_cpu_preempt_check+0x20/0x2c
	[  132.803579]  alloc_pages_current+0xb0/0x150
	[  132.803621]  allocate_slab+0x2d0/0x408
	[  132.803662]  ___slab_alloc+0x43c/0x640
	[  132.803704]  __slab_alloc.isra.0+0x70/0xc8
	[  132.803747]  __kmalloc_node_track_caller+0x10c/0x2d8
	[  132.803792]  __kmalloc_reserve.isra.0+0x80/0x160
	[  132.803835]  __alloc_skb+0xd0/0x2a8
	[  132.883893]  alloc_skb_with_frags+0x64/0x2a0
	[  132.888632]  sock_alloc_send_pskb+0x420/0x438
	[  132.893465]  unix_dgram_sendmsg+0x1d4/0x930
	[  132.898112]  __sys_sendto+0x16c/0x230
	[  132.902198]  __arm64_sys_sendto+0x78/0x98
	[  132.906654]  el0_svc_common.constprop.0+0xac/0x278

I could resolve it by applying this patch:

	diff --git a/mm/vmstat.c b/mm/vmstat.c
	index 80c1e0a0f094e..92fb0c08296ef 100644
	--- a/mm/vmstat.c
	+++ b/mm/vmstat.c
	@@ -957,11 +957,11 @@ void __inc_numa_state(struct zone *zone,
		u16 __percpu *p = pcp->vm_numa_stat_diff + item;
		u16 v;

	-       v = __this_cpu_inc_return(*p);
	+       v = this_cpu_inc_return(*p);

		if (unlikely(v > NUMA_STATS_THRESHOLD)) {
			zone_numa_state_add(v, zone, item);
	-               __this_cpu_write(*p, 0);
	+               this_cpu_write(*p, 0);
		}
	 }

AFAICT zone_statistics() is no longer protected by the spin_lock_irqsave(), so
preemption is no longer disabled.

You need to have CONFIG_NUMA and CONFIG_DEBUG_PREEMPT enabled to reproduce
this.
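
To illustrate the distinction (a minimal sketch with hypothetical helper
names, not code from any patch): the __this_cpu_*() ops trust the caller to
keep the task pinned to one CPU, which is what CONFIG_DEBUG_PREEMPT verifies,
while the this_cpu_*() ops are safe from preemptible context:

	/* Hypothetical helpers, for illustration only. */
	static void counter_inc_preemptible(u16 __percpu *p)
	{
		/* Safe while preemptible: the read-modify-write is atomic
		 * with respect to migration to another CPU. */
		this_cpu_inc(*p);
	}

	static void counter_inc_locked(struct zone *zone, u16 __percpu *p)
	{
		unsigned long flags;

		spin_lock_irqsave(&zone->lock, flags);
		/* Safe only because IRQs are off here; calling
		 * __this_cpu_inc() without such protection is exactly what
		 * check_preemption_disabled() splats about. */
		__this_cpu_inc(*p);
		spin_unlock_irqrestore(&zone->lock, flags);
	}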

HTH

Thanks!

--
Qais Yousef
Mel Gorman May 24, 2022, 11:55 a.m. UTC | #4
On Mon, May 23, 2022 at 05:09:21PM +0100, Qais Yousef wrote:
> On 05/12/22 09:50, Mel Gorman wrote:
> > This is a preparation patch to allow the buddy removal code to be reused
> > in a later patch.
> > 
> > No functional change.
> > 
> > Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
> > Tested-by: Minchan Kim <minchan@kernel.org>
> > Acked-by: Minchan Kim <minchan@kernel.org>
> > ---
> 
> I see this splat when this patch is applied on a 5.10.107 kernel:
> 

<SNIP>

> I could resolve it by applying this patch:
> 
> 	diff --git a/mm/vmstat.c b/mm/vmstat.c
> 	index 80c1e0a0f094e..92fb0c08296ef 100644
> 	--- a/mm/vmstat.c
> 	+++ b/mm/vmstat.c
> 	@@ -957,11 +957,11 @@ void __inc_numa_state(struct zone *zone,
> 		u16 __percpu *p = pcp->vm_numa_stat_diff + item;
> 		u16 v;
> 
> 	-       v = __this_cpu_inc_return(*p);
> 	+       v = this_cpu_inc_return(*p);
> 
> 		if (unlikely(v > NUMA_STATS_THRESHOLD)) {
> 			zone_numa_state_add(v, zone, item);
> 	-               __this_cpu_write(*p, 0);
> 	+               this_cpu_write(*p, 0);
> 		}
> 	 }
> 

5.18 does not have __inc_numa_state() so it's likely you are missing
backports, probably f19298b9516c1a031b34b4147773457e3efe743b at minimum.
Qais Yousef May 25, 2022, 11:23 a.m. UTC | #5
On 05/24/22 12:55, Mel Gorman wrote:
> On Mon, May 23, 2022 at 05:09:21PM +0100, Qais Yousef wrote:
> > On 05/12/22 09:50, Mel Gorman wrote:
> > > This is a preparation patch to allow the buddy removal code to be reused
> > > in a later patch.
> > > 
> > > No functional change.
> > > 
> > > Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
> > > Tested-by: Minchan Kim <minchan@kernel.org>
> > > Acked-by: Minchan Kim <minchan@kernel.org>
> > > ---
> > 
> > I see this splat when this patch is applied on a 5.10.107 kernel:
> > 
> 
> <SNIP>
> 
> > I could resolve it by applying this patch:
> > 
> > 	diff --git a/mm/vmstat.c b/mm/vmstat.c
> > 	index 80c1e0a0f094e..92fb0c08296ef 100644
> > 	--- a/mm/vmstat.c
> > 	+++ b/mm/vmstat.c
> > 	@@ -957,11 +957,11 @@ void __inc_numa_state(struct zone *zone,
> > 		u16 __percpu *p = pcp->vm_numa_stat_diff + item;
> > 		u16 v;
> > 
> > 	-       v = __this_cpu_inc_return(*p);
> > 	+       v = this_cpu_inc_return(*p);
> > 
> > 		if (unlikely(v > NUMA_STATS_THRESHOLD)) {
> > 			zone_numa_state_add(v, zone, item);
> > 	-               __this_cpu_write(*p, 0);
> > 	+               this_cpu_write(*p, 0);
> > 		}
> > 	 }
> > 
> 
> 5.18 does not have __inc_numa_state() so it's likely you are missing
> backports, probably f19298b9516c1a031b34b4147773457e3efe743b at minimum.

Thanks Mel. Sorry, it seems I stopped my analysis a tad too soon. The
dependency chain involves more than that commit; I couldn't backport it on its
own.

I just happened to be using that kernel (it's an Android 5.10 tree). I did
report it there too, but I thought (wrongly) that this could benefit this
upstream discussion.

I'll take it there.

Thanks!

--
Qais Yousef

Patch

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5851ee88a89c..1c4c54503a5d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3622,6 +3622,46 @@  static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
 #endif
 }
 
+static __always_inline
+struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
+			   unsigned int order, unsigned int alloc_flags,
+			   int migratetype)
+{
+	struct page *page;
+	unsigned long flags;
+
+	do {
+		page = NULL;
+		spin_lock_irqsave(&zone->lock, flags);
+		/*
+		 * order-0 request can reach here when the pcplist is skipped
+		 * due to non-CMA allocation context. HIGHATOMIC area is
+		 * reserved for high-order atomic allocation, so order-0
+		 * request should skip it.
+		 */
+		if (order > 0 && alloc_flags & ALLOC_HARDER) {
+			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+			if (page)
+				trace_mm_page_alloc_zone_locked(page, order, migratetype);
+		}
+		if (!page) {
+			page = __rmqueue(zone, order, migratetype, alloc_flags);
+			if (!page) {
+				spin_unlock_irqrestore(&zone->lock, flags);
+				return NULL;
+			}
+		}
+		__mod_zone_freepage_state(zone, -(1 << order),
+					  get_pcppage_migratetype(page));
+		spin_unlock_irqrestore(&zone->lock, flags);
+	} while (check_new_pages(page, order));
+
+	__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+	zone_statistics(preferred_zone, zone, 1);
+
+	return page;
+}
+
 /* Remove page from the per-cpu list, caller must protect the list */
 static inline
 struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
@@ -3702,9 +3742,14 @@  struct page *rmqueue(struct zone *preferred_zone,
 			gfp_t gfp_flags, unsigned int alloc_flags,
 			int migratetype)
 {
-	unsigned long flags;
 	struct page *page;
 
+	/*
+	 * We most definitely don't want callers attempting to
+	 * allocate greater than order-1 page units with __GFP_NOFAIL.
+	 */
+	WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
+
 	if (likely(pcp_allowed_order(order))) {
 		/*
 		 * MIGRATE_MOVABLE pcplist could have the pages on CMA area and
@@ -3718,38 +3763,10 @@  struct page *rmqueue(struct zone *preferred_zone,
 		}
 	}
 
-	/*
-	 * We most definitely don't want callers attempting to
-	 * allocate greater than order-1 page units with __GFP_NOFAIL.
-	 */
-	WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
-
-	do {
-		page = NULL;
-		spin_lock_irqsave(&zone->lock, flags);
-		/*
-		 * order-0 request can reach here when the pcplist is skipped
-		 * due to non-CMA allocation context. HIGHATOMIC area is
-		 * reserved for high-order atomic allocation, so order-0
-		 * request should skip it.
-		 */
-		if (order > 0 && alloc_flags & ALLOC_HARDER) {
-			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
-			if (page)
-				trace_mm_page_alloc_zone_locked(page, order, migratetype);
-		}
-		if (!page) {
-			page = __rmqueue(zone, order, migratetype, alloc_flags);
-			if (!page)
-				goto failed;
-		}
-		__mod_zone_freepage_state(zone, -(1 << order),
-					  get_pcppage_migratetype(page));
-		spin_unlock_irqrestore(&zone->lock, flags);
-	} while (check_new_pages(page, order));
-
-	__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
-	zone_statistics(preferred_zone, zone, 1);
+	page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
+							migratetype);
+	if (unlikely(!page))
+		return NULL;
 
 out:
 	/* Separate test+clear to avoid unnecessary atomics */
@@ -3760,10 +3777,6 @@  struct page *rmqueue(struct zone *preferred_zone,
 
 	VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
 	return page;
-
-failed:
-	spin_unlock_irqrestore(&zone->lock, flags);
-	return NULL;
 }
 
 #ifdef CONFIG_FAIL_PAGE_ALLOC