@@ -247,6 +247,9 @@ enum node_stat_item {
NR_DIRTIED, /* page dirtyings since bootup */
NR_WRITTEN, /* page writings since bootup */
NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */
+#ifdef CONFIG_NUMA_BALANCING
+ NUMA_TRY_MIGRATE, /* pages to try to migrate via NUMA balancing */
+#endif
NR_VM_NODE_STAT_ITEMS
};
@@ -766,6 +769,10 @@ typedef struct pglist_data {
unsigned long split_queue_len;
#endif
+#ifdef CONFIG_NUMA_BALANCING
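+ /* Start of the current NUMA balancing rate limit window, in jiffies */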
+ unsigned long autonuma_jiffies;
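+ /* NUMA_TRY_MIGRATE count at the start of the current window */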
+ unsigned long autonuma_try_migrate;
+#endif
/* Fields commonly accessed by the page reclaim scanner */
struct lruvec lruvec;
@@ -42,6 +42,7 @@ extern unsigned int sysctl_numa_balancing_scan_delay;
extern unsigned int sysctl_numa_balancing_scan_period_min;
extern unsigned int sysctl_numa_balancing_scan_period_max;
extern unsigned int sysctl_numa_balancing_scan_size;
+extern unsigned int sysctl_numa_balancing_rate_limit;
#ifdef CONFIG_SCHED_DEBUG
extern __read_mostly unsigned int sysctl_sched_migration_cost;
@@ -1047,6 +1047,12 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
+/*
+ * Restrict the NUMA migration throughput (MB/s) for each target node
+ * if the target node does not have enough free space.
+ */
+unsigned int sysctl_numa_balancing_rate_limit;
+
struct numa_group {
refcount_t refcount;
@@ -1397,6 +1403,44 @@ static inline unsigned long group_weight(struct task_struct *p, int nid,
return 1000 * faults / total_faults;
}
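+/*
+ * Return true if at least one populated zone of @pgdat keeps its free
+ * pages above the high watermark plus two seconds worth of the
+ * promotion rate limit, so that promotion need not be rate limited.
+ */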
+static bool pgdat_free_space_enough(struct pglist_data *pgdat)
+{
+ int z;
+ unsigned long rate_limit;
+
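+ /* Convert the MB/s rate limit into pages per second */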
+ rate_limit = sysctl_numa_balancing_rate_limit << (20 - PAGE_SHIFT);
+ for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+ struct zone *zone = pgdat->node_zones + z;
+
+ if (!populated_zone(zone))
+ continue;
+
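+ /*
+ * Require headroom of two seconds worth of promotion above
+ * the high watermark before declaring the zone to have
+ * enough free space.
+ */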
+ if (zone_watermark_ok(zone, 0,
+ high_wmark_pages(zone) + rate_limit * 2,
+ ZONE_MOVABLE, 0))
+ return true;
+ }
+ return false;
+}
+
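+/*
+ * Return true if migrating @nr more pages to @pgdat keeps the NUMA
+ * balancing migration throughput within @rate_limit pages per second.
+ * The counter snapshot in pgdat->autonuma_try_migrate is advanced at
+ * most once per second.
+ */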
+static bool numa_migration_check_rate_limit(struct pglist_data *pgdat,
+ unsigned long rate_limit, int nr)
+{
+ unsigned long try_migrate;
+ unsigned long now = jiffies, last_jiffies;
+
+ mod_node_page_state(pgdat, NUMA_TRY_MIGRATE, nr);
+ try_migrate = node_page_state(pgdat, NUMA_TRY_MIGRATE);
+ last_jiffies = pgdat->autonuma_jiffies;
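+ /* At most once per second, advance the window and snapshot the counter */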
+ if (time_after(now, last_jiffies + HZ) &&
+ cmpxchg(&pgdat->autonuma_jiffies, last_jiffies, now) ==
+ last_jiffies)
+ pgdat->autonuma_try_migrate = try_migrate;
+ if (try_migrate - pgdat->autonuma_try_migrate > rate_limit)
+ return false;
+ return true;
+}
+
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int src_nid, int dst_cpu)
{
@@ -1404,6 +1448,25 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
int dst_nid = cpu_to_node(dst_cpu);
int last_cpupid, this_cpupid;
+ /*
+ * If memory tiering mode is enabled, try to promote pages from
+ * slow memory nodes to fast memory nodes.
+ */
+ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
+ next_promotion_node(src_nid) != -1) {
+ struct pglist_data *pgdat;
+ unsigned long rate_limit;
+
+ pgdat = NODE_DATA(dst_nid);
+ if (pgdat_free_space_enough(pgdat))
+ return true;
+
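+ /*
+ * The target node is short of free memory, so throttle the
+ * promotion throughput to the configured rate limit.
+ */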
+ rate_limit = sysctl_numa_balancing_rate_limit <<
+ (20 - PAGE_SHIFT);
+ return numa_migration_check_rate_limit(pgdat, rate_limit,
+ hpage_nr_pages(page));
+ }
+
this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
@@ -421,6 +421,14 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = &one,
},
+ {
+ .procname = "numa_balancing_rate_limit_mbps",
+ .data = &sysctl_numa_balancing_rate_limit,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
{
.procname = "numa_balancing",
.data = &sysctl_numa_balancing_mode,
@@ -1198,6 +1198,9 @@ const char * const vmstat_text[] = {
"nr_dirtied",
"nr_written",
"nr_kernel_misc_reclaimable",
+#ifdef CONFIG_NUMA_BALANCING
+ "numa_try_migrate",
+#endif
/* enum writeback_stat_item counters */
"nr_dirty_threshold",