@@ -184,7 +184,7 @@ extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);
/* Check if a vma is migratable */
extern bool vma_migratable(struct vm_area_struct *vma);
-extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
+extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long, int);
extern void mpol_put_task_policy(struct task_struct *);
extern bool numa_demotion_enabled;
@@ -284,7 +284,7 @@ static inline int mpol_parse_str(char *str, struct mempolicy **mpol)
#endif
static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
- unsigned long address)
+ unsigned long address, int flags)
{
return -1; /* no node preference */
}
@@ -137,6 +137,9 @@ enum pageflags {
#endif
#ifdef CONFIG_64BIT
PG_arch_2,
+#ifdef CONFIG_NUMA_BALANCING
+ PG_demoted,
+#endif
#endif
__NR_PAGEFLAGS,
@@ -443,6 +446,12 @@ TESTCLEARFLAG(Young, young, PF_ANY)
PAGEFLAG(Idle, idle, PF_ANY)
#endif
+#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_64BIT)
+TESTPAGEFLAG(Demoted, demoted, PF_NO_TAIL)
+SETPAGEFLAG(Demoted, demoted, PF_NO_TAIL)
+TESTCLEARFLAG(Demoted, demoted, PF_NO_TAIL)
+#endif
+
/*
* PageReported() is used to track reported free pages within the Buddy
* allocator. We can use the non-atomic version of the test and set
@@ -23,6 +23,9 @@ enum page_ext_flags {
PAGE_EXT_YOUNG,
PAGE_EXT_IDLE,
#endif
+#if defined(CONFIG_NUMA_BALANCING) && !defined(CONFIG_64BIT)
+ PAGE_EXT_DEMOTED,
+#endif
};
/*
@@ -8,12 +8,14 @@
*/
#include <linux/sched.h>
+#include <linux/page-flags.h>
#define TNF_MIGRATED 0x01
#define TNF_NO_GROUP 0x02
#define TNF_SHARED 0x04
#define TNF_FAULT_LOCAL 0x08
#define TNF_MIGRATE_FAIL 0x10
+#define TNF_DEMOTED 0x40
#ifdef CONFIG_NUMA_BALANCING
extern void task_numa_fault(int last_node, int node, int pages, int flags);
@@ -21,7 +23,53 @@ extern pid_t task_numa_group_id(struct task_struct *p);
extern void set_numabalancing_state(bool enabled);
extern void task_numa_free(struct task_struct *p, bool final);
extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
- int src_nid, int dst_cpu);
+ int src_nid, int dst_cpu, int flags);
+#ifdef CONFIG_64BIT
+/* Report whether @page currently has the PG_demoted page flag set. */
+static inline bool page_is_demoted(struct page *page)
+{
+	bool demoted = PageDemoted(page);
+
+	return demoted;
+}
+
+/* Set the PG_demoted page flag on @page. */
+static inline void set_page_demoted(struct page *page)
+{
+	SetPageDemoted(page);
+}
+
+/*
+ * Clear the PG_demoted page flag on @page and report whether it was set
+ * beforehand.
+ */
+static inline bool test_and_clear_page_demoted(struct page *page)
+{
+	bool was_demoted = TestClearPageDemoted(page);
+
+	return was_demoted;
+}
+#else /* !CONFIG_64BIT */
+/*
+ * Report whether PAGE_EXT_DEMOTED is set in the page_ext flags of @page.
+ * A page for which lookup_page_ext() yields nothing is reported as not
+ * demoted.
+ */
+static inline bool page_is_demoted(struct page *page)
+{
+	struct page_ext *ext = lookup_page_ext(page);
+
+	if (likely(ext))
+		return test_bit(PAGE_EXT_DEMOTED, &ext->flags);
+
+	return false;
+}
+
+/*
+ * Mark @page as demoted by setting PAGE_EXT_DEMOTED in its page_ext flags.
+ *
+ * Does nothing when lookup_page_ext() yields no page_ext for @page.
+ */
+static inline void set_page_demoted(struct page *page)
+{
+	struct page_ext *page_ext = lookup_page_ext(page);
+
+	if (unlikely(!page_ext))
+		return;
+
+	/* This function is void, so neither exit path may carry a value. */
+	set_bit(PAGE_EXT_DEMOTED, &page_ext->flags);
+}
+
+/*
+ * Clear PAGE_EXT_DEMOTED in the page_ext flags of @page and report whether
+ * it was set beforehand. Returns false when lookup_page_ext() yields no
+ * page_ext for @page.
+ */
+static inline bool test_and_clear_page_demoted(struct page *page)
+{
+	struct page_ext *ext = lookup_page_ext(page);
+
+	if (likely(ext))
+		return test_and_clear_bit(PAGE_EXT_DEMOTED, &ext->flags);
+
+	return false;
+}
+#endif /* !CONFIG_64BIT */
#else
static inline void task_numa_fault(int last_node, int node, int pages,
int flags)
@@ -38,10 +86,21 @@ static inline void task_numa_free(struct task_struct *p, bool final)
{
}
static inline bool should_numa_migrate_memory(struct task_struct *p,
- struct page *page, int src_nid, int dst_cpu)
+ struct page *page, int src_nid, int dst_cpu, int flags)
{
return true;
}
+/* Stub for the #else (no NUMA balancing) branch: no page is ever demoted. */
+static inline bool page_is_demoted(struct page *page)
+{
+	return false;
+}
+/* Stub for the #else (no NUMA balancing) branch: nothing is recorded. */
+static inline void set_page_demoted(struct page *page)
+{
+}
+/* Stub for the #else (no NUMA balancing) branch: always reports "not set". */
+static inline bool test_and_clear_page_demoted(struct page *page)
+{
+	return false;
+}
#endif
#endif /* _LINUX_SCHED_NUMA_BALANCING_H */
@@ -35,6 +35,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
PGSTEAL_DIRECT,
PGDEMOTE_KSWAPD,
PGDEMOTE_DIRECT,
+ PGDEMOTE_FILE,
+ PGDEMOTE_ANON,
PGSCAN_KSWAPD,
PGSCAN_DIRECT,
PGSCAN_DIRECT_THROTTLE,
@@ -56,9 +58,20 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
NUMA_HINT_FAULTS,
NUMA_HINT_FAULTS_LOCAL,
NUMA_PAGE_MIGRATE,
+ PGPROMOTE_CANDIDATE, /* candidates get selected for promotion */
+ PGPROMOTE_CANDIDATE_DEMOTED,/* promotion candidate that got demoted earlier */
+ PGPROMOTE_CANDIDATE_ANON, /* promotion candidate that are anon */
+ PGPROMOTE_CANDIDATE_FILE, /* promotion candidate that are file */
+ PGPROMOTE_TRIED, /* tried to migrate via NUMA balancing */
+ PGPROMOTE_FILE, /* successfully promoted file pages */
+ PGPROMOTE_ANON, /* successfully promoted anon pages */
#endif
#ifdef CONFIG_MIGRATION
PGMIGRATE_SUCCESS, PGMIGRATE_FAIL,
+ PGMIGRATE_DST_NODE_FULL_FAIL, /* failed as the target node is full */
+ PGMIGRATE_NUMA_ISOLATE_FAIL, /* failed in isolating numa page */
+ PGMIGRATE_NOMEM_FAIL, /* failed as no memory left */
+ PGMIGRATE_REFCOUNT_FAIL, /* failed in ref count */
THP_MIGRATION_SUCCESS,
THP_MIGRATION_FAIL,
THP_MIGRATION_SPLIT,
@@ -85,6 +85,13 @@
#define IF_HAVE_PG_ARCH_2(flag,string)
#endif
+#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_64BIT)
+#define IF_HAVE_PG_DEMOTED(flag, string) ,{1UL << flag, string}
+#else
+#define IF_HAVE_PG_DEMOTED(flag, string)
+#endif
+
+
#define __def_pageflag_names \
{1UL << PG_locked, "locked" }, \
{1UL << PG_waiters, "waiters" }, \
@@ -112,7 +119,8 @@ IF_HAVE_PG_UNCACHED(PG_uncached, "uncached" ) \
IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \
IF_HAVE_PG_IDLE(PG_young, "young" ) \
IF_HAVE_PG_IDLE(PG_idle, "idle" ) \
-IF_HAVE_PG_ARCH_2(PG_arch_2, "arch_2" )
+IF_HAVE_PG_ARCH_2(PG_arch_2, "arch_2") \
+IF_HAVE_PG_DEMOTED(PG_demoted, "demoted")
#define show_page_flags(flags) \
(flags) ? __print_flags(flags, "|", \
@@ -1416,12 +1416,22 @@ static inline unsigned long group_weight(struct task_struct *p, int nid,
}
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
- int src_nid, int dst_cpu)
+ int src_nid, int dst_cpu, int flags)
{
struct numa_group *ng = deref_curr_numa_group(p);
int dst_nid = cpu_to_node(dst_cpu);
int last_cpupid, this_cpupid;
+ count_vm_numa_event(PGPROMOTE_CANDIDATE);
+
+ if (flags & TNF_DEMOTED)
+ count_vm_numa_event(PGPROMOTE_CANDIDATE_DEMOTED);
+
+ if (page_is_file_lru(page))
+ count_vm_numa_event(PGPROMOTE_CANDIDATE_FILE);
+ else
+ count_vm_numa_event(PGPROMOTE_CANDIDATE_ANON);
+
this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
@@ -51,6 +51,7 @@
#include <linux/kthread.h>
#include <linux/membarrier.h>
#include <linux/migrate.h>
+#include <linux/mm_inline.h>
#include <linux/mmu_context.h>
#include <linux/nmi.h>
#include <linux/proc_fs.h>
@@ -1475,7 +1475,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
* page_table_lock if at all possible
*/
page_locked = trylock_page(page);
- target_nid = mpol_misplaced(page, vma, haddr);
+ target_nid = mpol_misplaced(page, vma, haddr, flags);
if (target_nid == NUMA_NO_NODE) {
/* If the page was locked, there are no parallel migrations */
if (page_locked)
@@ -4131,7 +4131,7 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
*flags |= TNF_FAULT_LOCAL;
}
- return mpol_misplaced(page, vma, addr);
+ return mpol_misplaced(page, vma, addr, *flags);
}
static vm_fault_t do_numa_page(struct vm_fault *vmf)
@@ -2466,7 +2466,7 @@ static void sp_free(struct sp_node *n)
* Policy determination "mimics" alloc_page_vma().
* Called from fault path where we know the vma and faulting address.
*/
-int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
+int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr, int flags)
{
struct mempolicy *pol;
struct zoneref *z;
@@ -2477,6 +2477,9 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
int polnid = NUMA_NO_NODE;
int ret = -1;
+ if (test_and_clear_page_demoted(page))
+ flags |= TNF_DEMOTED;
+
pol = get_vma_policy(vma, addr);
if (!(pol->flags & MPOL_F_MOF))
goto out;
@@ -2526,7 +2529,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
if (pol->flags & MPOL_F_MORON) {
polnid = thisnid;
- if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
+ if (!should_numa_migrate_memory(current, page, curnid, thiscpu, flags))
goto out;
}
@@ -50,6 +50,7 @@
#include <linux/ptrace.h>
#include <linux/oom.h>
#include <linux/memory.h>
+#include <linux/sched/numa_balancing.h>
#include <asm/tlbflush.h>
@@ -264,6 +265,15 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
} else
#endif
{
+#ifdef CONFIG_NUMA_BALANCING
+ if (page_is_demoted(page) && vma_migratable(vma)) {
+ bool writable = pte_write(pte);
+
+ pte = pte_modify(pte, PAGE_NONE);
+ if (writable)
+ pte = pte_mk_savedwrite(pte);
+ }
+#endif
set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
if (PageAnon(new))
@@ -406,6 +416,9 @@ int migrate_page_move_mapping(struct address_space *mapping,
int expected_count = expected_page_refs(mapping, page) + extra_count;
int nr = thp_nr_pages(page);
+ if (page_count(page) != expected_count)
+ count_vm_events(PGMIGRATE_REFCOUNT_FAIL, thp_nr_pages(page));
+
if (!mapping) {
/* Anonymous page without mapping */
if (page_count(page) != expected_count)
@@ -1260,6 +1273,10 @@ static int unmap_and_move(new_page_t get_new_page,
if (!newpage)
return -ENOMEM;
+ /* TODO: check whether Ksm pages can be demoted? */
+ if (reason == MR_DEMOTION && !PageKsm(page))
+ set_page_demoted(newpage);
+
rc = __unmap_and_move(page, newpage, force, mode);
if (rc == MIGRATEPAGE_SUCCESS)
set_page_owner_migrate_reason(newpage, reason);
@@ -1590,6 +1607,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
goto out;
}
nr_failed++;
+ count_vm_events(PGMIGRATE_NOMEM_FAIL, thp_nr_pages(page));
goto out;
case -EAGAIN:
if (is_thp) {
@@ -2141,8 +2159,10 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
/* Avoid migrating to a node that is nearly full */
- if (!migrate_balanced_pgdat(pgdat, compound_nr(page)))
+ if (!migrate_balanced_pgdat(pgdat, compound_nr(page))) {
+ count_vm_events(PGMIGRATE_DST_NODE_FULL_FAIL, thp_nr_pages(page));
return 0;
+ }
if (isolate_lru_page(page))
return 0;
@@ -2200,6 +2220,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
pg_data_t *pgdat = NODE_DATA(node);
int isolated;
int nr_remaining;
+ bool is_file;
LIST_HEAD(migratepages);
/*
@@ -2209,18 +2230,15 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
if (is_shared_exec_page(vma, page))
goto out;
- /*
- * Also do not migrate dirty pages as not all filesystems can move
- * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles.
- */
- if (page_is_file_lru(page) && PageDirty(page))
- goto out;
-
isolated = numamigrate_isolate_page(pgdat, page);
- if (!isolated)
+ if (!isolated) {
+ count_vm_events(PGMIGRATE_NUMA_ISOLATE_FAIL, thp_nr_pages(page));
goto out;
+ }
+ is_file = page_is_file_lru(page);
list_add(&page->lru, &migratepages);
+ count_vm_numa_event(PGPROMOTE_TRIED);
nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
NULL, node, MIGRATE_ASYNC,
MR_NUMA_MISPLACED, NULL);
@@ -2232,8 +2250,13 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
putback_lru_page(page);
}
isolated = 0;
- } else
+ } else {
count_vm_numa_event(NUMA_PAGE_MIGRATE);
+ if (is_file)
+ count_vm_numa_event(PGPROMOTE_FILE);
+ else
+ count_vm_numa_event(PGPROMOTE_ANON);
+ }
BUG_ON(!list_empty(&migratepages));
return isolated;
@@ -2267,13 +2290,16 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
new_page = alloc_pages_node(node,
(GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
HPAGE_PMD_ORDER);
- if (!new_page)
+ if (!new_page) {
+ count_vm_events(PGMIGRATE_NOMEM_FAIL, HPAGE_PMD_NR);
goto out_fail;
+ }
prep_transhuge_page(new_page);
isolated = numamigrate_isolate_page(pgdat, page);
if (!isolated) {
put_page(new_page);
+ count_vm_events(PGMIGRATE_NUMA_ISOLATE_FAIL, HPAGE_PMD_NR);
goto out_fail;
}
@@ -1142,6 +1142,7 @@ static unsigned int demote_page_list(struct list_head *demote_pages,
int target_nid = next_demotion_node(pgdat->node_id);
unsigned int nr_succeeded;
int err;
+ bool file_lru;
if (list_empty(demote_pages))
return 0;
@@ -1149,6 +1150,8 @@ static unsigned int demote_page_list(struct list_head *demote_pages,
if (target_nid == NUMA_NO_NODE)
return 0;
+ file_lru = page_is_file_lru(lru_to_page(demote_pages));
+
/* Demotion ignores all cpuset and mempolicy settings */
err = migrate_pages(demote_pages, alloc_demote_page, NULL,
target_nid, MIGRATE_ASYNC, MR_DEMOTION,
@@ -1159,6 +1162,11 @@ static unsigned int demote_page_list(struct list_head *demote_pages,
else
__count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
+ if (file_lru)
+ __count_vm_events(PGDEMOTE_FILE, nr_succeeded);
+ else
+ __count_vm_events(PGDEMOTE_ANON, nr_succeeded);
+
return nr_succeeded;
}
@@ -1261,6 +1261,8 @@ const char * const vmstat_text[] = {
"pgsteal_direct",
"pgdemote_kswapd",
"pgdemote_direct",
+ "pgdemote_file",
+ "pgdemote_anon",
"pgscan_kswapd",
"pgscan_direct",
"pgscan_direct_throttle",
@@ -1291,10 +1293,21 @@ const char * const vmstat_text[] = {
"numa_hint_faults",
"numa_hint_faults_local",
"numa_pages_migrated",
+ "pgpromote_candidate",
+ "pgpromote_candidate_demoted",
+ "pgpromote_candidate_anon",
+ "pgpromote_candidate_file",
+ "pgpromote_tried",
+ "pgpromote_file",
+ "pgpromote_anon",
#endif
#ifdef CONFIG_MIGRATION
"pgmigrate_success",
"pgmigrate_fail",
+ "pgmigrate_fail_dst_node_full",
+ "pgmigrate_fail_numa_isolate",
+ "pgmigrate_fail_nomem",
+ "pgmigrate_fail_refcount",
"thp_migration_success",
"thp_migration_fail",
"thp_migration_split",
Promotion and demotion related statistics can help us better understand the effectiveness of the page placement mechanism. We want to know, among the promoted/demoted pages, what the distribution of anon and file pages is. How many of the demoted pages become promotion candidates gives us insight into whether the page placement mechanism is thrashing among the NUMA nodes. We can also use this information to rate limit the migration across the NUMA nodes. Promotion can fail for many reasons, e.g., target node having low memory, page refcount being abnormal, whole system being low on memory, etc. Adding counters to track the failure reasons will give detailed info about why and where it fails, and help debug the system. To track the demoted pages, a PG_demoted bit is introduced for pages that get demoted. Upon demotion, the PG_demoted bit is set in the page flags. Upon promotion, the bit gets reset for that page. Promotion related statistics: ============================= pgpromote_candidate - candidates that get selected for promotion pgpromote_candidate_demoted - promotion candidates that got demoted earlier pgpromote_candidate_anon - promotion candidates that are anon pgpromote_candidate_file - promotion candidates that are file pgpromote_tried - pages that had a try to migrate via NUMA Balancing pgpromote_file - successfully promoted file pages pgpromote_anon - successfully promoted anon pages Promotion failure related statistics: ===================================== pgmigrate_fail_dst_node_full - failed as the target node is full pgmigrate_fail_numa_isolate - failed in isolating numa page pgmigrate_fail_nomem - failed as no memory left in the system pgmigrate_fail_refcount - failed as ref count mismatched Demotion related statistics: ============================ pgdemote_file - successfully demoted file pages pgdemote_anon - successfully demoted anon pages Signed-off-by: Hasan Al Maruf <hasanalmaruf@fb.com> --- include/linux/mempolicy.h | 4 +- include/linux/page-flags.h 
| 9 ++++ include/linux/page_ext.h | 3 ++ include/linux/sched/numa_balancing.h | 63 +++++++++++++++++++++++++++- include/linux/vm_event_item.h | 13 ++++++ include/trace/events/mmflags.h | 10 ++++- kernel/sched/fair.c | 12 +++++- kernel/sched/sched.h | 1 + mm/huge_memory.c | 2 +- mm/memory.c | 2 +- mm/mempolicy.c | 7 +++- mm/migrate.c | 48 ++++++++++++++++----- mm/vmscan.c | 8 ++++ mm/vmstat.c | 13 ++++++ 14 files changed, 174 insertions(+), 21 deletions(-)