[v2] mm: Stop kswapd early when nothing's waiting for it to free pages
diff mbox series

Message ID 20200221043052.3305-1-sultan@kerneltoast.com
State New
Headers show
Series
  • [v2] mm: Stop kswapd early when nothing's waiting for it to free pages
Related show

Commit Message

Sultan Alsawaf Feb. 21, 2020, 4:30 a.m. UTC
From: Sultan Alsawaf <sultan@kerneltoast.com>

Keeping kswapd running when all the failed allocations that invoked it
are satisfied incurs a high overhead due to unnecessary page eviction
and writeback, as well as spurious VM pressure events to various
registered shrinkers. When kswapd doesn't need to work to make an
allocation succeed anymore, stop it prematurely to save resources.

Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
---
 include/linux/mmzone.h |  1 +
 mm/page_alloc.c        | 17 ++++++++++++++---
 mm/vmscan.c            |  3 ++-
 3 files changed, 17 insertions(+), 4 deletions(-)

Comments

Ira Weiny Feb. 21, 2020, 6:22 p.m. UTC | #1
On Thu, Feb 20, 2020 at 08:30:52PM -0800, Sultan Alsawaf wrote:
> From: Sultan Alsawaf <sultan@kerneltoast.com>
 
[snip]

> @@ -4640,9 +4647,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
>  		goto retry;
>  	}
>  fail:
> -	warn_alloc(gfp_mask, ac->nodemask,
> -			"page allocation failure: order:%u", order);
>  got_pg:

I have no insight into if this is masking a deeper problem or if this fixes
something but doesn't the above result in 'fail' and 'got_pg' being the same
label?

Ira

> +	if (woke_kswapd)
> +		atomic_dec(&pgdat->kswapd_waiters);
> +	if (!page)
> +		warn_alloc(gfp_mask, ac->nodemask,
> +				"page allocation failure: order:%u", order);
>  	return page;
>  }
  
[snip]
Sultan Alsawaf Feb. 21, 2020, 8 p.m. UTC | #2
On Fri, Feb 21, 2020 at 10:22:02AM -0800, Ira Weiny wrote:
> On Thu, Feb 20, 2020 at 08:30:52PM -0800, Sultan Alsawaf wrote:
> > From: Sultan Alsawaf <sultan@kerneltoast.com>
>  
> [snip]
> 
> > @@ -4640,9 +4647,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
> >  		goto retry;
> >  	}
> >  fail:
> > -	warn_alloc(gfp_mask, ac->nodemask,
> > -			"page allocation failure: order:%u", order);
> >  got_pg:
> 
> I have no insight into if this is masking a deeper problem or if this fixes
> something but doesn't the above result in 'fail' and 'got_pg' being the same
> label?
> 
> Ira
> 
> > +	if (woke_kswapd)
> > +		atomic_dec(&pgdat->kswapd_waiters);
> > +	if (!page)
> > +		warn_alloc(gfp_mask, ac->nodemask,
> > +				"page allocation failure: order:%u", order);
> >  	return page;
> >  }
>   
> [snip]

Yes. This was to reduce the patch delta for the initial submission so it's
clearer what's going on; it can of course be altered to coalesce the labels into
a single one. I typically write my patches to upstream components to be as
non-invasive as possible, to aid in backporting and forward-porting in case a
patch is rejected and I want to keep it for myself.

Sultan

Patch
diff mbox series

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 462f6873905a..23861cdaab7f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -735,6 +735,7 @@  typedef struct pglist_data {
 	unsigned long node_spanned_pages; /* total size of physical page
 					     range, including holes */
 	int node_id;
+	atomic_t kswapd_waiters;
 	wait_queue_head_t kswapd_wait;
 	wait_queue_head_t pfmemalloc_wait;
 	struct task_struct *kswapd;	/* Protected by
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3c4eb750a199..923b994c38c8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4401,6 +4401,8 @@  __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	int no_progress_loops;
 	unsigned int cpuset_mems_cookie;
 	int reserve_flags;
+	pg_data_t *pgdat = ac->preferred_zoneref->zone->zone_pgdat;
+	bool woke_kswapd = false;
 
 	/*
 	 * We also sanity check to catch abuse of atomic reserves being used by
@@ -4434,8 +4436,13 @@  __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	if (!ac->preferred_zoneref->zone)
 		goto nopage;
 
-	if (alloc_flags & ALLOC_KSWAPD)
+	if (alloc_flags & ALLOC_KSWAPD) {
+		if (!woke_kswapd) {
+			atomic_inc(&pgdat->kswapd_waiters);
+			woke_kswapd = true;
+		}
 		wake_all_kswapds(order, gfp_mask, ac);
+	}
 
 	/*
 	 * The adjusted alloc_flags might result in immediate success, so try
@@ -4640,9 +4647,12 @@  __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 		goto retry;
 	}
 fail:
-	warn_alloc(gfp_mask, ac->nodemask,
-			"page allocation failure: order:%u", order);
 got_pg:
+	if (woke_kswapd)
+		atomic_dec(&pgdat->kswapd_waiters);
+	if (!page)
+		warn_alloc(gfp_mask, ac->nodemask,
+				"page allocation failure: order:%u", order);
 	return page;
 }
 
@@ -6711,6 +6721,7 @@  static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
 	pgdat_page_ext_init(pgdat);
 	spin_lock_init(&pgdat->lru_lock);
 	lruvec_init(&pgdat->__lruvec);
+	pgdat->kswapd_waiters = (atomic_t)ATOMIC_INIT(0);
 }
 
 static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c05eb9efec07..59d9f3dd14f6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3694,7 +3694,8 @@  static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 		__fs_reclaim_release();
 		ret = try_to_freeze();
 		__fs_reclaim_acquire();
-		if (ret || kthread_should_stop())
+		if (ret || kthread_should_stop() ||
+		    !atomic_read(&pgdat->kswapd_waiters))
 			break;
 
 		/*