diff mbox series

[3/3] drivers/base/node: create a partial offline hint under each node

Message ID 1537327066-27852-4-git-send-email-kernelfans@gmail.com (mailing list archive)
State New, archived
Headers show
Series introduce a new state 'isolate' for memblock to split the isolation and migration steps | expand

Commit Message

Pingfan Liu Sept. 19, 2018, 3:17 a.m. UTC
When offlining memory, there are two cases: 1st, offline all memblocks under a
node; 2nd, offline and replace only part of the memory under a node. For the 2nd
case, there is no need to allocate new pages from other nodes, which may incur
extra NUMA faults to resolve the misplaced-page issue, and place unnecessary memory
pressure on other nodes. This patch introduces an interface,
 /sys/../node/nodeX/partial_offline, to let the user choose how to
allocate a new page, i.e. from the local node or from other nodes.

Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Pavel Tatashin <pasha.tatashin@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Bharata B Rao <bharata@linux.vnet.ibm.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 drivers/base/node.c    | 33 +++++++++++++++++++++++++++++++++
 include/linux/mmzone.h |  1 +
 mm/memory_hotplug.c    | 31 +++++++++++++++++++------------
 3 files changed, 53 insertions(+), 12 deletions(-)

Comments

kernel test robot Sept. 19, 2018, 4:36 a.m. UTC | #1
Hi Pingfan,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on linus/master]
[also build test WARNING on v4.19-rc4 next-20180918]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Pingfan-Liu/introduce-a-new-state-isolate-for-memblock-to-split-the-isolation-and-migration-steps/20180919-112650
config: x86_64-randconfig-x018-201837 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-1) 7.3.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All warnings (new ones prefixed by >>):

   mm/memory_hotplug.c: In function 'do_migrate_range':
>> mm/memory_hotplug.c:1442:53: warning: passing argument 4 of 'migrate_pages' makes integer from pointer without a cast [-Wint-conversion]
      ret = migrate_pages(&source, new_node_page, NULL, &nmask,
                                                        ^
   In file included from mm/memory_hotplug.c:27:0:
   include/linux/migrate.h:68:12: note: expected 'long unsigned int' but argument is of type 'nodemask_t * {aka struct <anonymous> *}'
    extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
               ^~~~~~~~~~~~~

vim +/migrate_pages +1442 mm/memory_hotplug.c

  1356	
  1357	#define NR_OFFLINE_AT_ONCE_PAGES	(256)
  1358	static int
  1359	do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
  1360	{
  1361		unsigned long pfn;
  1362		struct page *page;
  1363		int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
  1364		int not_managed = 0;
  1365		int ret = 0;
  1366		LIST_HEAD(source);
  1367		int nid;
  1368		nodemask_t nmask = node_states[N_MEMORY];
  1369	
  1370		for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
  1371			if (!pfn_valid(pfn))
  1372				continue;
  1373			page = pfn_to_page(pfn);
  1374	
  1375			if (PageHuge(page)) {
  1376				struct page *head = compound_head(page);
  1377				pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
  1378				if (compound_order(head) > PFN_SECTION_SHIFT) {
  1379					ret = -EBUSY;
  1380					break;
  1381				}
  1382				if (isolate_huge_page(page, &source))
  1383					move_pages -= 1 << compound_order(head);
  1384				continue;
  1385			} else if (PageTransHuge(page))
  1386				pfn = page_to_pfn(compound_head(page))
  1387					+ hpage_nr_pages(page) - 1;
  1388	
  1389			if (!get_page_unless_zero(page))
  1390				continue;
  1391			/*
  1392			 * We can skip free pages. And we can deal with pages on
  1393			 * LRU and non-lru movable pages.
  1394			 */
  1395			if (PageLRU(page))
  1396				ret = isolate_lru_page(page);
  1397			else
  1398				ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
  1399			if (!ret) { /* Success */
  1400				put_page(page);
  1401				list_add_tail(&page->lru, &source);
  1402				move_pages--;
  1403				if (!__PageMovable(page))
  1404					inc_node_page_state(page, NR_ISOLATED_ANON +
  1405							    page_is_file_cache(page));
  1406	
  1407			} else {
  1408	#ifdef CONFIG_DEBUG_VM
  1409				pr_alert("failed to isolate pfn %lx\n", pfn);
  1410				dump_page(page, "isolation failed");
  1411	#endif
  1412				put_page(page);
  1413				/* Because we don't have big zone->lock. we should
  1414				   check this again here. */
  1415				if (page_count(page)) {
  1416					not_managed++;
  1417					ret = -EBUSY;
  1418					break;
  1419				}
  1420			}
  1421		}
  1422		if (!list_empty(&source)) {
  1423			if (not_managed) {
  1424				putback_movable_pages(&source);
  1425				goto out;
  1426			}
  1427	
  1428			page = list_entry(source.next, struct page, lru);
  1429			nid = page_to_nid(page);
  1430			if (!NODE_DATA(nid)->partial_offline) {
  1431				/*
  1432				 * try to allocate from a different node but reuse this
  1433				 * node if there are no other online nodes to be used
  1434				 * (e.g. we are offlining a part of the only existing
  1435				 * node)
  1436				 */
  1437				node_clear(nid, nmask);
  1438				if (nodes_empty(nmask))
  1439					node_set(nid, nmask);
  1440			}
  1441			/* Allocate a new page from the nearest neighbor node */
> 1442			ret = migrate_pages(&source, new_node_page, NULL, &nmask,
  1443						MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
  1444			if (ret)
  1445				putback_movable_pages(&source);
  1446		}
  1447	out:
  1448		return ret;
  1449	}
  1450	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
diff mbox series

Patch

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 1ac4c36..64b0cb8 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -25,6 +25,36 @@  static struct bus_type node_subsys = {
 	.dev_name = "node",
 };
 
+static ssize_t read_partial_offline(struct device *dev,
+	struct device_attribute *attr, char *buf)
+{
+	int nid = dev->id;
+	struct pglist_data *pgdat = NODE_DATA(nid);
+	ssize_t len = 0;
+
+	if (pgdat->partial_offline)
+		len = sprintf(buf, "1\n");
+	else
+		len = sprintf(buf, "0\n");
+
+	return len;
+}
+
+static ssize_t write_partial_offline(struct device *dev,
+	struct device_attribute *attr, const char *buf, size_t count)
+{
+	int nid = dev->id;
+	struct pglist_data *pgdat = NODE_DATA(nid);
+
+	if (sysfs_streq(buf, "1"))
+		pgdat->partial_offline = true;
+	else if (sysfs_streq(buf, "0"))
+		pgdat->partial_offline = false;
+	else
+		return -EINVAL;
+
+	return strlen(buf);
+}
 
 static ssize_t node_read_cpumap(struct device *dev, bool list, char *buf)
 {
@@ -56,6 +86,8 @@  static inline ssize_t node_read_cpulist(struct device *dev,
 	return node_read_cpumap(dev, true, buf);
 }
 
+static DEVICE_ATTR(partial_offline, 0600, read_partial_offline,
+	write_partial_offline);
 static DEVICE_ATTR(cpumap,  S_IRUGO, node_read_cpumask, NULL);
 static DEVICE_ATTR(cpulist, S_IRUGO, node_read_cpulist, NULL);
 
@@ -235,6 +267,7 @@  static struct attribute *node_dev_attrs[] = {
 	&dev_attr_numastat.attr,
 	&dev_attr_distance.attr,
 	&dev_attr_vmstat.attr,
+	&dev_attr_partial_offline.attr,
 	NULL
 };
 ATTRIBUTE_GROUPS(node_dev);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 1e22d96..80c44c8 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -722,6 +722,7 @@  typedef struct pglist_data {
 	/* Per-node vmstats */
 	struct per_cpu_nodestat __percpu *per_cpu_nodestats;
 	atomic_long_t		vm_stat[NR_VM_NODE_STAT_ITEMS];
+	bool	partial_offline;
 } pg_data_t;
 
 #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 228de4d..3c66075 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1346,18 +1346,10 @@  static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
 
 static struct page *new_node_page(struct page *page, unsigned long private)
 {
-	int nid = page_to_nid(page);
-	nodemask_t nmask = node_states[N_MEMORY];
-
-	/*
-	 * try to allocate from a different node but reuse this node if there
-	 * are no other online nodes to be used (e.g. we are offlining a part
-	 * of the only existing node)
-	 */
-	node_clear(nid, nmask);
-	if (nodes_empty(nmask))
-		node_set(nid, nmask);
+	nodemask_t nmask = *(nodemask_t *)private;
+	int nid;
 
+	nid = page_to_nid(page);
 	return new_page_nodemask(page, nid, &nmask);
 }
 
@@ -1371,6 +1363,8 @@  do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 	int not_managed = 0;
 	int ret = 0;
 	LIST_HEAD(source);
+	int nid;
+	nodemask_t nmask = node_states[N_MEMORY];
 
 	for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
 		if (!pfn_valid(pfn))
@@ -1430,8 +1424,21 @@  do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
 			goto out;
 		}
 
+		page = list_entry(source.next, struct page, lru);
+		nid = page_to_nid(page);
+		if (!NODE_DATA(nid)->partial_offline) {
+			/*
+			 * try to allocate from a different node but reuse this
+			 * node if there are no other online nodes to be used
+			 * (e.g. we are offlining a part of the only existing
+			 * node)
+			 */
+			node_clear(nid, nmask);
+			if (nodes_empty(nmask))
+				node_set(nid, nmask);
+		}
 		/* Allocate a new page from the nearest neighbor node */
-		ret = migrate_pages(&source, new_node_page, NULL, 0,
+		ret = migrate_pages(&source, new_node_page, NULL, &nmask,
 					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
 		if (ret)
 			putback_movable_pages(&source);