diff mbox series

[04/10] mm: numa: promote pages to DRAM when it is accessed twice

Message ID 1553316275-21985-5-git-send-email-yang.shi@linux.alibaba.com (mailing list archive)
State New, archived
Headers show
Series Another Approach to Use PMEM as NUMA Node | expand

Commit Message

Yang Shi March 23, 2019, 4:44 a.m. UTC
NUMA balancing would promote a page to DRAM once it is accessed, but
that might be just a one-off access.  To reduce migration thrashing and
memory bandwidth pressure, introduce the PG_promote flag to mark a
promotion candidate.  The page will be promoted to DRAM when it is
accessed twice.  This might be a good way to filter out one-off-access
pages.

The PG_promote flag will be inherited by tail pages when a THP gets split.
But it will not be copied to the new page once the migration is done.

This approach is definitely not the optimal way to distinguish
hot pages from cold pages.  A much more sophisticated algorithm may be
needed to distinguish hot and cold pages accurately.  The kernel may not
be the best place to implement such an algorithm considering the
complexity and potential overhead.  But the kernel may still need such a
capability.

With NUMA balancing the whole working set of the process may end up being
promoted to DRAM eventually.  This relies on page reclaim, implemented by
the following patch, to demote inactive pages to PMEM.

Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com>
---
 include/linux/page-flags.h     |  4 ++++
 include/trace/events/mmflags.h |  3 ++-
 mm/huge_memory.c               | 10 ++++++++++
 mm/memory.c                    |  8 ++++++++
 4 files changed, 24 insertions(+), 1 deletion(-)

Comments

kernel test robot March 29, 2019, 12:31 a.m. UTC | #1
Hi Yang,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v5.1-rc2 next-20190328]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Yang-Shi/Another-Approach-to-Use-PMEM-as-NUMA-Node/20190326-034920
config: i386-randconfig-x076-201912 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-1) 7.3.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All errors (new ones prefixed by >>):

   In file included from include/linux/memcontrol.h:29:0,
                    from include/linux/swap.h:9,
                    from include/linux/suspend.h:5,
                    from arch/x86/kernel/asm-offsets.c:13:
>> include/linux/mm.h:862:2: error: #error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
    #error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
     ^~~~~
   make[2]: *** [arch/x86/kernel/asm-offsets.s] Error 1
   make[2]: Target '__build' not remade because of errors.
   make[1]: *** [prepare0] Error 2
   make[1]: Target 'prepare' not remade because of errors.
   make: *** [sub-make] Error 2

vim +862 include/linux/mm.h

348f8b6c4 Dave Hansen       2005-06-23  860  
9223b4190 Christoph Lameter 2008-04-28  861  #if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
9223b4190 Christoph Lameter 2008-04-28 @862  #error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
348f8b6c4 Dave Hansen       2005-06-23  863  #endif
348f8b6c4 Dave Hansen       2005-06-23  864  

:::::: The code at line 862 was first introduced by commit
:::::: 9223b4190fa1297a59f292f3419fc0285321d0ea pageflags: get rid of FLAGS_RESERVED

:::::: TO: Christoph Lameter <clameter@sgi.com>
:::::: CC: Linus Torvalds <torvalds@linux-foundation.org>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
diff mbox series

Patch

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 9f8712a..2d53166 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -131,6 +131,7 @@  enum pageflags {
 	PG_young,
 	PG_idle,
 #endif
+	PG_promote,		/* Promote candidate for NUMA balancing */
 	__NR_PAGEFLAGS,
 
 	/* Filesystems */
@@ -348,6 +349,9 @@  static inline void page_init_poison(struct page *page, size_t size)
 PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
 	TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
 
+PAGEFLAG(Promote, promote, PF_ANY) __SETPAGEFLAG(Promote, promote, PF_ANY)
+	__CLEARPAGEFLAG(Promote, promote, PF_ANY)
+
 /*
  * Only test-and-set exist for PG_writeback.  The unconditional operators are
  * risky: they bypass page accounting.
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index a1675d4..f13c2a1 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -100,7 +100,8 @@ 
 	{1UL << PG_mappedtodisk,	"mappedtodisk"	},		\
 	{1UL << PG_reclaim,		"reclaim"	},		\
 	{1UL << PG_swapbacked,		"swapbacked"	},		\
-	{1UL << PG_unevictable,		"unevictable"	}		\
+	{1UL << PG_unevictable,		"unevictable"	},		\
+	{1UL << PG_promote,		"promote"	}		\
 IF_HAVE_PG_MLOCK(PG_mlocked,		"mlocked"	)		\
 IF_HAVE_PG_UNCACHED(PG_uncached,	"uncached"	)		\
 IF_HAVE_PG_HWPOISON(PG_hwpoison,	"hwpoison"	)		\
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 404acdc..8268a3c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1589,6 +1589,15 @@  vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
 					      haddr + HPAGE_PMD_SIZE);
 	}
 
+	/* Promote page to DRAM when referenced twice */
+	if (!(node_isset(page_nid, def_alloc_nodemask)) &&
+	    !PagePromote(page)) {
+		SetPagePromote(page);
+		put_page(page);
+		page_nid = -1;
+		goto clear_pmdnuma;
+	}
+
 	/*
 	 * Migrate the THP to the requested node, returns with page unlocked
 	 * and access rights restored.
@@ -2396,6 +2405,7 @@  static void __split_huge_page_tail(struct page *head, int tail,
 			 (1L << PG_workingset) |
 			 (1L << PG_locked) |
 			 (1L << PG_unevictable) |
+			 (1L << PG_promote) |
 			 (1L << PG_dirty)));
 
 	/* ->mapping in first tail page is compound_mapcount */
diff --git a/mm/memory.c b/mm/memory.c
index 47fe250..2494c11 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3680,6 +3680,14 @@  static vm_fault_t do_numa_page(struct vm_fault *vmf)
 		goto out;
 	}
 
+	/* Promote the non-DRAM page when it is referenced twice */
+	if (!(node_isset(page_nid, def_alloc_nodemask)) &&
+	    !PagePromote(page)) {
+		SetPagePromote(page);
+		put_page(page);
+		goto out;
+	}
+
 	/* Migrate to the requested node */
 	migrated = migrate_misplaced_page(page, vma, target_nid);
 	if (migrated) {