
[5/6] mm/page_alloc: Limit the number of pages on PCP lists when reclaim is active

Message ID 20210521102826.28552-6-mgorman@techsingularity.net
State New, archived
Series Calculate pcp->high based on zone sizes and active CPUs

Commit Message

Mel Gorman May 21, 2021, 10:28 a.m. UTC
When kswapd is active, direct reclaim is potentially active as well. In
either case, it is possible that a zone would be balanced if pages were
not trapped on PCP lists. Instead of draining remote pages, simply limit
the size of the PCP lists while kswapd is active.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
 include/linux/mmzone.h |  1 +
 mm/page_alloc.c        | 19 ++++++++++++++++++-
 mm/vmscan.c            | 35 +++++++++++++++++++++++++++++++++++
 3 files changed, 54 insertions(+), 1 deletion(-)

Comments

Dave Hansen May 21, 2021, 10:44 p.m. UTC | #1
On 5/21/21 3:28 AM, Mel Gorman wrote:
> +static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone)
> +{
> +	int high = READ_ONCE(pcp->high);
> +
> +	if (unlikely(!high))
> +		return 0;
> +
> +	if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
> +		return high;
> +
> +	/*
> +	 * If reclaim is active, limit the number of pages that can be
> +	 * stored on pcp lists
> +	 */
> +	return READ_ONCE(pcp->batch) << 2;
> +}

Should there be a sanity check on this?  Let's say we had one of those
weirdo zones with tons of CPUs and a small low_wmark_pages().  Could we
have a case where:

	pcp->high < pcp->batch<<2

and this effectively *raises* nr_pcp_high()?

It's not possible with the current pcp->high calculation, but does
anything prevent it now?
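
To make that scenario concrete, a worked example with purely hypothetical
numbers (none of them come from the patch or this thread): if a zone ended
up with pcp->high = 100 while pcp->batch = 63, then pcp->batch << 2 = 252,
so the "limit" applied while reclaim is active would actually be higher
than the normal high. A minimal userspace sketch of the arithmetic:

#include <stdio.h>

int main(void)
{
	int high  = 100;        /* hypothetical small pcp->high */
	int batch = 63;         /* hypothetical pcp->batch */
	int cap   = batch << 2; /* cap applied while reclaim is active */

	/* Without a clamp, cap (252) exceeds the normal high (100). */
	printf("normal high %d, reclaim-active cap %d\n", high, cap);
	return 0;
}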
Mel Gorman May 24, 2021, 9:22 a.m. UTC | #2
On Fri, May 21, 2021 at 03:44:49PM -0700, Dave Hansen wrote:
> On 5/21/21 3:28 AM, Mel Gorman wrote:
> > +static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone)
> > +{
> > +	int high = READ_ONCE(pcp->high);
> > +
> > +	if (unlikely(!high))
> > +		return 0;
> > +
> > +	if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
> > +		return high;
> > +
> > +	/*
> > +	 * If reclaim is active, limit the number of pages that can be
> > +	 * stored on pcp lists
> > +	 */
> > +	return READ_ONCE(pcp->batch) << 2;
> > +}
> 
> Should there be a sanity check on this?  Let's say we had one of those
> weirdo zones with tons of CPUs and a small low_wmark_pages().  Could we
> have a case where:
> 
> 	pcp->high < pcp->batch<<2
> 
> and this effectively *raises* nr_pcp_high()?
> 
> It's not possible with the current pcp->high calculation, but does
> anything prevent it now?

I don't think it would happen as pcp->batch is reduced for small zones,
but a sanity check does not hurt, so I added one.
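
A minimal sketch of what such a sanity check could look like, clamping the
reclaim-active cap so it never exceeds the normal high (this is an
assumption based on the reply above, not the follow-up revision itself):

static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone)
{
	int high = READ_ONCE(pcp->high);

	if (unlikely(!high))
		return 0;

	if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
		return high;

	/*
	 * If reclaim is active, limit the number of pages that can be
	 * stored on pcp lists, but never allow the limit to exceed the
	 * normal pcp->high.
	 */
	return min(READ_ONCE(pcp->batch) << 2, high);
}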

Patch

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 92182e0299b2..a0606239a167 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -647,6 +647,7 @@ enum zone_flags {
 	ZONE_BOOSTED_WATERMARK,		/* zone recently boosted watermarks.
 					 * Cleared when kswapd is woken.
 					 */
+	ZONE_RECLAIM_ACTIVE,		/* kswapd may be scanning the zone. */
 };
 
 static inline unsigned long zone_managed_pages(struct zone *zone)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c3da6401f138..d8f8044781c4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3286,6 +3286,23 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch)
 	return batch;
 }
 
+static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone)
+{
+	int high = READ_ONCE(pcp->high);
+
+	if (unlikely(!high))
+		return 0;
+
+	if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags))
+		return high;
+
+	/*
+	 * If reclaim is active, limit the number of pages that can be
+	 * stored on pcp lists
+	 */
+	return READ_ONCE(pcp->batch) << 2;
+}
+
 static void free_unref_page_commit(struct page *page, unsigned long pfn,
 				   int migratetype)
 {
@@ -3297,7 +3314,7 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn,
 	pcp = this_cpu_ptr(zone->per_cpu_pageset);
 	list_add(&page->lru, &pcp->lists[migratetype]);
 	pcp->count++;
-	high = READ_ONCE(pcp->high);
+	high = nr_pcp_high(pcp, zone);
 	if (pcp->count >= high) {
 		int batch = READ_ONCE(pcp->batch);
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5199b9696bab..c3c2100a80b8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3722,6 +3722,38 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
 	return sc->nr_scanned >= sc->nr_to_reclaim;
 }
 
+/* Page allocator PCP high watermark is lowered if reclaim is active. */
+static inline void
+update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active)
+{
+	int i;
+	struct zone *zone;
+
+	for (i = 0; i <= highest_zoneidx; i++) {
+		zone = pgdat->node_zones + i;
+
+		if (!managed_zone(zone))
+			continue;
+
+		if (active)
+			set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
+		else
+			clear_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
+	}
+}
+
+static inline void
+set_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
+{
+	update_reclaim_active(pgdat, highest_zoneidx, true);
+}
+
+static inline void
+clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
+{
+	update_reclaim_active(pgdat, highest_zoneidx, false);
+}
+
 /*
  * For kswapd, balance_pgdat() will reclaim pages across a node from zones
  * that are eligible for use by the caller until at least one zone is
@@ -3774,6 +3806,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
 	boosted = nr_boost_reclaim;
 
 restart:
+	set_reclaim_active(pgdat, highest_zoneidx);
 	sc.priority = DEF_PRIORITY;
 	do {
 		unsigned long nr_reclaimed = sc.nr_reclaimed;
@@ -3907,6 +3940,8 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
 		pgdat->kswapd_failures++;
 
 out:
+	clear_reclaim_active(pgdat, highest_zoneidx);
+
 	/* If reclaim was boosted, account for the reclaim done in this pass */
 	if (boosted) {
 		unsigned long flags;