[3/5] Decouple reclaim and allocation for toptier nodes

Message ID	b45b9bf7cd3e21bca61d82dcd1eb692cd32c122c.1637778851.git.hasanalmaruf@fb.com (mailing list archive)
State	New
Headers	show Return-Path: <owner-linux-mm@kvack.org> From: Hasan Al Maruf <hasan3050@gmail.com> To: dave.hansen@linux.intel.com, ying.huang@intel.com, yang.shi@linux.alibaba.com, mgorman@techsingularity.net, riel@surriel.com, hannes@cmpxchg.org Cc: linux-mm@kvack.org, linux-kernel@vger.kernel.org Subject: [PATCH 3/5] Decouple reclaim and allocation for toptier nodes Date: Wed, 24 Nov 2021 13:58:28 -0500 Message-Id: <b45b9bf7cd3e21bca61d82dcd1eb692cd32c122c.1637778851.git.hasanalmaruf@fb.com> In-Reply-To: <cover.1637778851.git.hasanalmaruf@fb.com> References: <cover.1637778851.git.hasanalmaruf@fb.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Sender: owner-linux-mm@kvack.org Precedence: bulk
Series	Transparent Page Placement for Tiered-Memory \| expand [0/5] Transparent Page Placement for Tiered-Memory [1/5] Promotion and demotion related statistics [2/5] NUMA balancing for tiered-memory system [3/5] Decouple reclaim and allocation for toptier nodes [4/5] Reclaim to satisfy WMARK_DEMOTE on toptier nodes [5/5] active LRU-based promotion to avoid ping-pong

diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 586cd4b86428..027b1f31fec1 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -74,6 +74,7 @@ Currently, these files are in /proc/sys/vm: - vfs_cache_pressure - watermark_boost_factor - watermark_scale_factor +- demote_scale_factor - zone_reclaim_mode @@ -961,6 +962,17 @@ that the number of free pages kswapd maintains for latency reasons is too small for the allocation bursts occurring in the system. This knob can then be used to tune kswapd aggressiveness accordingly. +demote_scale_factor +=================== + +This factor controls when kswapd wakes up to demote pages from toptier +nodes. It defines the amount of memory left in a toptier node/system +before kswapd is woken up and how much memory needs to be free on those +nodes before kswapd goes back to sleep. + +The unit is in fractions of 10,000. The default value of 200 means that if +less than 2% of the toptier memory in a node/system is free, we will start to +demote pages from that node. 
zone_reclaim_mode ================= diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index ab57b6a82e0a..0a76ac103b17 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -145,6 +145,7 @@ extern void numa_default_policy(void); extern void numa_policy_init(void); extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new); extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); +extern void check_toptier_balanced(void); extern int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, @@ -299,6 +300,10 @@ static inline nodemask_t *policy_nodemask_current(gfp_t gfp) return NULL; } +static inline void check_toptier_balanced(void) +{ +} + #define numa_demotion_enabled false #define numa_promotion_tiered_enabled false #endif /* CONFIG_NUMA */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 9a226787464e..4748e57b7c68 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3153,6 +3153,10 @@ static inline bool debug_guardpage_enabled(void) { return false; } static inline bool page_is_guard(struct page *page) { return false; } #endif /* CONFIG_DEBUG_PAGEALLOC */ +#ifdef CONFIG_MIGRATION +extern int demote_scale_factor; +#endif + #if MAX_NUMNODES > 1 void __init setup_nr_node_ids(void); #else diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 47946cec7584..070284feac03 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -329,12 +329,14 @@ enum zone_watermarks { WMARK_MIN, WMARK_LOW, WMARK_HIGH, + WMARK_DEMOTE, NR_WMARK }; #define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost) #define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost) #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost) +#define demote_wmark_pages(z) (z->_watermark[WMARK_DEMOTE] + z->watermark_boost) #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost) struct per_cpu_pages { @@ -884,6 +886,7 @@ bool 
zone_watermark_ok(struct zone *z, unsigned int order, unsigned int alloc_flags); bool zone_watermark_ok_safe(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx); +bool pgdat_toptier_balanced(pg_data_t *pgdat, int order, int zone_idx); /* * Memory initialization context, use to differentiate memory added by * the platform statically or via memory hotplug interface. @@ -1011,6 +1014,8 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); +int demote_scale_factor_sysctl_handler(struct ctl_table *, int, void __user *, + size_t *, loff_t *); extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES]; int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 45e39832a2b1..6cada31f7265 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -21,6 +21,8 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ #include "sched.h" +#include <trace/events/sched.h> +#include <linux/mempolicy.h> /* * Targeted preemption latency for CPU-bound tasks: @@ -10802,6 +10804,7 @@ void trigger_load_balance(struct rq *rq) raise_softirq(SCHED_SOFTIRQ); nohz_balancer_kick(rq); + check_toptier_balanced(); } static void rq_online_fair(struct rq *rq) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 751b52062eb4..7d2995045a94 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -112,6 +112,7 @@ static int sixty = 60; #endif static int __maybe_unused neg_one = -1; +static int __maybe_unused one = 1; static int __maybe_unused two = 2; static int __maybe_unused three = 3; static int __maybe_unused four = 4; @@ -121,8 +122,8 @@ static unsigned long long_max = LONG_MAX; static int one_hundred = 100; static int two_hundred = 200; static int one_thousand = 1000; -#ifdef CONFIG_PRINTK static int ten_thousand = 10000; +#ifdef CONFIG_PRINTK #endif 
#ifdef CONFIG_PERF_EVENTS static int six_hundred_forty_kb = 640 * 1024; @@ -3000,6 +3001,15 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ONE, .extra2 = &one_thousand, }, + { + .procname = "demote_scale_factor", + .data = &demote_scale_factor, + .maxlen = sizeof(demote_scale_factor), + .mode = 0644, + .proc_handler = demote_scale_factor_sysctl_handler, + .extra1 = &one, + .extra2 = &ten_thousand, + }, { .procname = "percpu_pagelist_fraction", .data = &percpu_pagelist_fraction, diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 580e76ae58e6..ba9b1322bd48 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1042,6 +1042,29 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, return err; } +void check_toptier_balanced(void) +{ + int nid; + int balanced; + + if (!numa_promotion_tiered_enabled) + return; + + for_each_node_state(nid, N_MEMORY) { + pg_data_t *pgdat = NODE_DATA(nid); + + if (!node_is_toptier(nid)) + continue; + + balanced = pgdat_toptier_balanced(pgdat, 0, ZONE_MOVABLE); + if (!balanced) { + pgdat->kswapd_order = 0; + pgdat->kswapd_highest_zoneidx = ZONE_NORMAL; + wakeup_kswapd(pgdat->node_zones + ZONE_NORMAL, 0, 1, ZONE_NORMAL); + } + } +} + #ifdef CONFIG_MIGRATION /* * page migration, thp tail pages can be passed. 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5f1dd104cf8e..8638e24e1b2f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3599,7 +3599,8 @@ struct page *rmqueue(struct zone *preferred_zone, if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) { clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); wakeup_kswapd(zone, 0, 0, zone_idx(zone)); - } + } else if (!pgdat_toptier_balanced(zone->zone_pgdat, order, zone_idx(zone))) + wakeup_kswapd(zone, 0, 0, zone_idx(zone)); VM_BUG_ON_PAGE(page && bad_range(zone, page), page); return page; @@ -8047,6 +8048,22 @@ static void __setup_per_zone_wmarks(void) zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; + if (numa_promotion_tiered_enabled) { + tmp = mult_frac(zone_managed_pages(zone), demote_scale_factor, 10000); + + /* + * Clamp demote watermark between twice high watermark + * and max managed pages. + */ + if (tmp < 2 * zone->_watermark[WMARK_HIGH]) + tmp = 2 * zone->_watermark[WMARK_HIGH]; + if (tmp > zone_managed_pages(zone)) + tmp = zone_managed_pages(zone); + zone->_watermark[WMARK_DEMOTE] = tmp; + + zone->watermark_boost = 0; + } + spin_unlock_irqrestore(&zone->lock, flags); } @@ -8163,6 +8180,21 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, return 0; } +int demote_scale_factor_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int rc; + + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (rc) + return rc; + + if (write) + setup_per_zone_wmarks(); + + return 0; +} + #ifdef CONFIG_NUMA static void setup_min_unmapped_ratio(void) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 47c868d2ecfd..c39b217effa9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -41,6 +41,7 @@ #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/memcontrol.h> +#include <linux/mempolicy.h> #include <linux/migrate.h> #include <linux/delayacct.h> 
#include <linux/sysctl.h> @@ -190,6 +191,7 @@ static void set_task_reclaim_state(struct task_struct *task, static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); +int demote_scale_factor = 200; #ifdef CONFIG_MEMCG /* @@ -3598,6 +3600,30 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) return false; } +bool pgdat_toptier_balanced(pg_data_t *pgdat, int order, int zone_idx) +{ + unsigned long mark; + struct zone *zone; + + if (!node_is_toptier(pgdat->node_id) || + !numa_promotion_tiered_enabled || + order > 0 || zone_idx < ZONE_NORMAL) { + return true; + } + + zone = pgdat->node_zones + ZONE_NORMAL; + + if (!managed_zone(zone)) + return true; + + mark = min(demote_wmark_pages(zone), zone_managed_pages(zone)); + + if (zone_page_state(zone, NR_FREE_PAGES) < mark) + return false; + + return true; +} + /* Clear pgdat state for congested, dirty or under writeback. */ static void clear_pgdat_congested(pg_data_t *pgdat) { diff --git a/mm/vmstat.c b/mm/vmstat.c index cda2505bb21f..4309f79a6132 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -28,6 +28,7 @@ #include <linux/mm_inline.h> #include <linux/page_ext.h> #include <linux/page_owner.h> +#include <linux/migrate.h> #include "internal.h" @@ -1649,7 +1650,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone) { int i; - seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); + seq_printf(m, "Node %d, zone %8s, toptier %d next_demotion_node %d", + pgdat->node_id, zone->name, node_is_toptier(pgdat->node_id), + next_demotion_node(pgdat->node_id)); if (is_zone_first_populated(pgdat, zone)) { seq_printf(m, "\n per-node stats"); for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { @@ -1666,6 +1669,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n min %lu" "\n low %lu" "\n high %lu" + "\n demote %lu" "\n spanned %lu" "\n present %lu" "\n managed %lu" @@ -1674,6 +1678,7 @@ static void zoneinfo_show_print(struct seq_file 
*m, pg_data_t *pgdat, min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), + node_is_toptier(pgdat->node_id) ? demote_wmark_pages(zone) : 0, zone->spanned_pages, zone->present_pages, zone_managed_pages(zone),

[3/5] Decouple reclaim and allocation for toptier nodes

Commit Message

Patch