@@ -74,6 +74,7 @@ Currently, these files are in /proc/sys/vm:
- vfs_cache_pressure
- watermark_boost_factor
- watermark_scale_factor
+- demote_scale_factor
- zone_reclaim_mode
@@ -961,6 +962,17 @@ that the number of free pages kswapd maintains for latency reasons is
too small for the allocation bursts occurring in the system. This knob
can then be used to tune kswapd aggressiveness accordingly.
+demote_scale_factor
+===================
+
+This factor controls when kswapd wakes up to demote pages from toptier
+nodes. It defines the amount of free memory left in a toptier node/system
+below which kswapd is woken up, and how much memory needs to be free on
+those nodes before kswapd goes back to sleep.
+
+The unit is in fractions of 10,000. The default value of 200 means that when
+less than 2% of the toptier memory in a node/system is free, kswapd starts
+to demote pages from that node. Valid values range from 1 to 10,000.
zone_reclaim_mode
=================
@@ -145,6 +145,7 @@ extern void numa_default_policy(void);
extern void numa_policy_init(void);
extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new);
extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
+extern void check_toptier_balanced(void);
extern int huge_node(struct vm_area_struct *vma,
unsigned long addr, gfp_t gfp_flags,
@@ -299,6 +300,10 @@ static inline nodemask_t *policy_nodemask_current(gfp_t gfp)
return NULL;
}
+/* No-op stub used when CONFIG_NUMA is disabled. */
+static inline void check_toptier_balanced(void)
+{
+}
+
#define numa_demotion_enabled false
#define numa_promotion_tiered_enabled false
#endif /* CONFIG_NUMA */
@@ -3153,6 +3153,10 @@ static inline bool debug_guardpage_enabled(void) { return false; }
static inline bool page_is_guard(struct page *page) { return false; }
#endif /* CONFIG_DEBUG_PAGEALLOC */
+/*
+ * Scale factor (in units of 1/10000 of a zone's managed pages) for the demote
+ * watermark. Declared unconditionally: its definition and its users in the
+ * sysctl table and the watermark setup are not guarded by CONFIG_MIGRATION.
+ */
+extern int demote_scale_factor;
+
#if MAX_NUMNODES > 1
void __init setup_nr_node_ids(void);
#else
@@ -329,12 +329,14 @@ enum zone_watermarks {
WMARK_MIN,
WMARK_LOW,
WMARK_HIGH,
+ WMARK_DEMOTE,
NR_WMARK
};
#define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost)
#define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost)
#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
+#define demote_wmark_pages(z) (z->_watermark[WMARK_DEMOTE] + z->watermark_boost)
#define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
struct per_cpu_pages {
@@ -884,6 +886,7 @@ bool zone_watermark_ok(struct zone *z, unsigned int order,
unsigned int alloc_flags);
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
unsigned long mark, int highest_zoneidx);
+bool pgdat_toptier_balanced(pg_data_t *pgdat, int order, int zone_idx);
/*
* Memory initialization context, use to differentiate memory added by
* the platform statically or via memory hotplug interface.
@@ -1011,6 +1014,8 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void *, size_t *,
loff_t *);
int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *,
size_t *, loff_t *);
+/* proc handler for vm.demote_scale_factor; takes a kernel buffer pointer. */
+int demote_scale_factor_sysctl_handler(struct ctl_table *, int, void *,
+		size_t *, loff_t *);
extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES];
int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *,
size_t *, loff_t *);
@@ -21,6 +21,8 @@
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
*/
#include "sched.h"
+#include <trace/events/sched.h>
+#include <linux/mempolicy.h>
/*
* Targeted preemption latency for CPU-bound tasks:
@@ -10802,6 +10804,7 @@ void trigger_load_balance(struct rq *rq)
raise_softirq(SCHED_SOFTIRQ);
nohz_balancer_kick(rq);
+ check_toptier_balanced();
}
static void rq_online_fair(struct rq *rq)
@@ -112,6 +112,7 @@ static int sixty = 60;
#endif
static int __maybe_unused neg_one = -1;
+static int __maybe_unused one = 1;
static int __maybe_unused two = 2;
static int __maybe_unused three = 3;
static int __maybe_unused four = 4;
@@ -121,8 +122,8 @@ static unsigned long long_max = LONG_MAX;
static int one_hundred = 100;
static int two_hundred = 200;
static int one_thousand = 1000;
-#ifdef CONFIG_PRINTK
static int ten_thousand = 10000;
-#endif
#ifdef CONFIG_PERF_EVENTS
static int six_hundred_forty_kb = 640 * 1024;
@@ -3000,6 +3001,15 @@ static struct ctl_table vm_table[] = {
.extra1 = SYSCTL_ONE,
.extra2 = &one_thousand,
},
+	{
+		/* Demote watermark scale, in units of 1/10000; range 1..10000. */
+		.procname	= "demote_scale_factor",
+		.data		= &demote_scale_factor,
+		.maxlen		= sizeof(demote_scale_factor),
+		.mode		= 0644,
+		.proc_handler	= demote_scale_factor_sysctl_handler,
+		.extra1		= SYSCTL_ONE,
+		.extra2		= &ten_thousand,
+	},
{
.procname = "percpu_pagelist_fraction",
.data = &percpu_pagelist_fraction,
@@ -1042,6 +1042,29 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
return err;
}
+/*
+ * Walk every node in the N_MEMORY state and, for each toptier node whose
+ * pgdat_toptier_balanced() check fails, overwrite the node's pending kswapd
+ * request (order 0, highest zone ZONE_NORMAL) and wake its kswapd through
+ * the node's ZONE_NORMAL zone.
+ *
+ * Does nothing unless numa_promotion_tiered_enabled is set.
+ *
+ * NOTE(review): kswapd_order is set to 0 but the third argument passed to
+ * wakeup_kswapd() is 1; if that argument is the order, confirm which value
+ * is intended.
+ */
+void check_toptier_balanced(void)
+{
+	int nid;
+
+	if (!numa_promotion_tiered_enabled)
+		return;
+
+	for_each_node_state(nid, N_MEMORY) {
+		pg_data_t *pgdat;
+		struct zone *normal;
+
+		if (!node_is_toptier(nid))
+			continue;
+
+		pgdat = NODE_DATA(nid);
+		if (pgdat_toptier_balanced(pgdat, 0, ZONE_MOVABLE))
+			continue;
+
+		normal = pgdat->node_zones + ZONE_NORMAL;
+		pgdat->kswapd_order = 0;
+		pgdat->kswapd_highest_zoneidx = ZONE_NORMAL;
+		wakeup_kswapd(normal, 0, 1, ZONE_NORMAL);
+	}
+}
+
#ifdef CONFIG_MIGRATION
/*
* page migration, thp tail pages can be passed.
@@ -3599,7 +3599,8 @@ struct page *rmqueue(struct zone *preferred_zone,
if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
wakeup_kswapd(zone, 0, 0, zone_idx(zone));
- }
+ } else if (!pgdat_toptier_balanced(zone->zone_pgdat, order, zone_idx(zone)))
+ wakeup_kswapd(zone, 0, 0, zone_idx(zone));
VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
return page;
@@ -8047,6 +8048,22 @@ static void __setup_per_zone_wmarks(void)
zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
+ if (numa_promotion_tiered_enabled) {
+ tmp = mult_frac(zone_managed_pages(zone), demote_scale_factor, 10000);
+
+ /*
+ * Clamp demote watermark between twice high watermark
+ * and max managed pages.
+ */
+ if (tmp < 2 * zone->_watermark[WMARK_HIGH])
+ tmp = 2 * zone->_watermark[WMARK_HIGH];
+ if (tmp > zone_managed_pages(zone))
+ tmp = zone_managed_pages(zone);
+ zone->_watermark[WMARK_DEMOTE] = tmp;
+
+ zone->watermark_boost = 0;
+ }
+
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -8163,6 +8180,21 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
return 0;
}
+/*
+ * sysctl handler for vm.demote_scale_factor.
+ *
+ * Reads or writes the value through proc_dointvec_minmax(), which is given
+ * the table entry carrying the extra1/extra2 bounds. After a successful
+ * write it calls setup_per_zone_wmarks() so the watermarks are recomputed
+ * with the new factor.
+ *
+ * The buffer is a kernel pointer, matching the other watermark sysctl
+ * handlers and proc_dointvec_minmax().
+ *
+ * Returns 0 on success or the error returned by proc_dointvec_minmax().
+ */
+int demote_scale_factor_sysctl_handler(struct ctl_table *table, int write,
+		void *buffer, size_t *length, loff_t *ppos)
+{
+	int rc;
+
+	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (rc)
+		return rc;
+
+	if (write)
+		setup_per_zone_wmarks();
+
+	return 0;
+}
+
#ifdef CONFIG_NUMA
static void setup_min_unmapped_ratio(void)
{
@@ -41,6 +41,7 @@
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
+#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
@@ -190,6 +191,7 @@ static void set_task_reclaim_state(struct task_struct *task,
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
+/*
+ * Demote watermark scale factor, in units of 1/10000 of a zone's managed
+ * pages; exposed as /proc/sys/vm/demote_scale_factor.
+ */
+int demote_scale_factor = 200;
#ifdef CONFIG_MEMCG
/*
@@ -3598,6 +3600,30 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
return false;
}
+/*
+ * Report whether a toptier node has enough free memory that no demotion is
+ * needed.
+ *
+ * Only order-0 requests with zone_idx of ZONE_NORMAL or above on a toptier
+ * node are checked, and only while numa_promotion_tiered_enabled is set;
+ * every other case is reported as balanced. The check compares the free
+ * pages of the node's ZONE_NORMAL against its demote watermark, capped at
+ * the zone's managed pages.
+ *
+ * Returns true if balanced, false if free pages are below the mark.
+ */
+bool pgdat_toptier_balanced(pg_data_t *pgdat, int order, int zone_idx)
+{
+	struct zone *normal = pgdat->node_zones + ZONE_NORMAL;
+	unsigned long wmark;
+
+	if (!node_is_toptier(pgdat->node_id))
+		return true;
+	if (!numa_promotion_tiered_enabled)
+		return true;
+	if (order > 0 || zone_idx < ZONE_NORMAL)
+		return true;
+
+	/* Treat the node as balanced when managed_zone() rejects ZONE_NORMAL. */
+	if (!managed_zone(normal))
+		return true;
+
+	wmark = min(demote_wmark_pages(normal), zone_managed_pages(normal));
+
+	return zone_page_state(normal, NR_FREE_PAGES) >= wmark;
+}
+
/* Clear pgdat state for congested, dirty or under writeback. */
static void clear_pgdat_congested(pg_data_t *pgdat)
{
@@ -28,6 +28,7 @@
#include <linux/mm_inline.h>
#include <linux/page_ext.h>
#include <linux/page_owner.h>
+#include <linux/migrate.h>
#include "internal.h"
@@ -1649,7 +1650,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
struct zone *zone)
{
int i;
- seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
+ seq_printf(m, "Node %d, zone %8s, toptier %d next_demotion_node %d",
+ pgdat->node_id, zone->name, node_is_toptier(pgdat->node_id),
+ next_demotion_node(pgdat->node_id));
if (is_zone_first_populated(pgdat, zone)) {
seq_printf(m, "\n per-node stats");
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
@@ -1666,6 +1669,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n min %lu"
"\n low %lu"
"\n high %lu"
+ "\n demote %lu"
"\n spanned %lu"
"\n present %lu"
"\n managed %lu"
@@ -1674,6 +1678,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
min_wmark_pages(zone),
low_wmark_pages(zone),
high_wmark_pages(zone),
+ node_is_toptier(pgdat->node_id) ? demote_wmark_pages(zone) : 0,
zone->spanned_pages,
zone->present_pages,
zone_managed_pages(zone),
With a tight memory constraint, we need to proactively keep some free memory in the toptier node, such that 1) new allocations, which are mainly for request processing, can be directly put in the toptier node and 2) the toptier node is able to accept hot pages promoted from non-toptier nodes. To achieve that, we decouple the reclamation and allocation mechanisms, i.e. reclamation gets triggered at a different watermark -- WMARK_DEMOTE, while allocation checks for the traditional WMARK_HIGH. In this way, toptier nodes can maintain some free space to accept both new allocation and promotion from non-toptier nodes. On each toptier memory node, the kswapd daemon is woken up to demote memory when free memory on the node falls below the following fraction: demote_scale_factor/10000. The default value of demote_scale_factor is 200 (i.e. 2%), so kswapd will be woken up when available free memory on a node falls below 2%. The demote_scale_factor can be adjusted higher if we need kswapd to keep more free memory around by updating the sysctl variable /proc/sys/vm/demote_scale_factor. Signed-off-by: Hasan Al Maruf <hasanalmaruf@fb.com> --- Documentation/admin-guide/sysctl/vm.rst | 12 +++++++++ include/linux/mempolicy.h | 5 ++++ include/linux/mm.h | 4 +++ include/linux/mmzone.h | 5 ++++ kernel/sched/fair.c | 3 +++ kernel/sysctl.c | 12 ++++++++- mm/mempolicy.c | 23 +++++++++++++++++ mm/page_alloc.c | 34 ++++++++++++++++++++++++- mm/vmscan.c | 26 +++++++++++++++++++ mm/vmstat.c | 7 ++++- 10 files changed, 128 insertions(+), 3 deletions(-)