[06/10] mm: vmscan: demote anon DRAM pages to PMEM node

Message ID: 1553316275-21985-7-git-send-email-yang.shi@linux.alibaba.com
State: New, archived
Series: Another Approach to Use PMEM as NUMA Node

Commit Message

Yang Shi March 23, 2019, 4:44 a.m. UTC
Since PMEM provides larger capacity than DRAM and much lower access
latency than disk, it is a good choice for a middle tier between DRAM
and disk in the page reclaim path.

With PMEM nodes, the demotion path of anonymous pages could be:

DRAM -> PMEM -> swap device

This patch demotes anonymous pages only for the time being, and demotes
THP to PMEM as a whole.  However, this may cause expensive page reclaim
and/or compaction on the PMEM node if it comes under memory pressure.
But, considering the capacity of PMEM and that allocations land on PMEM
only when it is specified explicitly, such cases should not be that
frequent.  So, it seems worth keeping THP whole instead of splitting it.

Demote pages to the closest non-DRAM node even when the system is
swapless.  The current page reclaim logic only scans the anon LRU when
swap is on and swappiness is set properly.  Demoting to PMEM does not
need to care whether swap is available, but reclaiming from PMEM still
skips the anon LRU if swap is not available.

The demotion only happens between a DRAM node and its closest PMEM
node.  Demoting to a remote PMEM node is not allowed for now.

Also, define a new migration reason for demotion, called MR_DEMOTE.
Demote pages via async migration to avoid blocking.

Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com>
---
 include/linux/migrate.h        |  1 +
 include/trace/events/migrate.h |  3 +-
 mm/debug.c                     |  1 +
 mm/internal.h                  | 22 ++++++++++
 mm/vmscan.c                    | 99 ++++++++++++++++++++++++++++++++++--------
 5 files changed, 107 insertions(+), 19 deletions(-)

Comments

Zi Yan March 23, 2019, 6:03 a.m. UTC | #1
On 22 Mar 2019, at 21:44, Yang Shi wrote:

> Since PMEM provides larger capacity than DRAM and has much lower
> access latency than disk, so it is a good choice to use as a middle
> tier between DRAM and disk in page reclaim path.
>
> With PMEM nodes, the demotion path of anonymous pages could be:
>
> DRAM -> PMEM -> swap device
>
> This patch demotes anonymous pages only for the time being and demote
> THP to PMEM in a whole.  However this may cause expensive page reclaim
> and/or compaction on PMEM node if there is memory pressure on it.  But,
> considering the capacity of PMEM and allocation only happens on PMEM
> when PMEM is specified explicity, such cases should be not that often.
> So, it sounds worth keeping THP in a whole instead of splitting it.
>
> Demote pages to the cloest non-DRAM node even though the system is
> swapless.  The current logic of page reclaim just scan anon LRU when
> swap is on and swappiness is set properly.  Demoting to PMEM doesn't
> need care whether swap is available or not.  But, reclaiming from PMEM
> still skip anon LRU is swap is not available.
>
> The demotion just happens between DRAM node and its cloest PMEM node.
> Demoting to a remote PMEM node is not allowed for now.
>
> And, define a new migration reason for demotion, called MR_DEMOTE.
> Demote page via async migration to avoid blocking.
>
> Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com>
> ---
>  include/linux/migrate.h        |  1 +
>  include/trace/events/migrate.h |  3 +-
>  mm/debug.c                     |  1 +
>  mm/internal.h                  | 22 ++++++++++
>  mm/vmscan.c                    | 99 ++++++++++++++++++++++++++++++++++--------
>  5 files changed, 107 insertions(+), 19 deletions(-)
>
> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> index e13d9bf..78c8dda 100644
> --- a/include/linux/migrate.h
> +++ b/include/linux/migrate.h
> @@ -25,6 +25,7 @@ enum migrate_reason {
>  	MR_MEMPOLICY_MBIND,
>  	MR_NUMA_MISPLACED,
>  	MR_CONTIG_RANGE,
> +	MR_DEMOTE,
>  	MR_TYPES
>  };
>
> diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
> index 705b33d..c1d5b36 100644
> --- a/include/trace/events/migrate.h
> +++ b/include/trace/events/migrate.h
> @@ -20,7 +20,8 @@
>  	EM( MR_SYSCALL,		"syscall_or_cpuset")		\
>  	EM( MR_MEMPOLICY_MBIND,	"mempolicy_mbind")		\
>  	EM( MR_NUMA_MISPLACED,	"numa_misplaced")		\
> -	EMe(MR_CONTIG_RANGE,	"contig_range")
> +	EM( MR_CONTIG_RANGE,	"contig_range")			\
> +	EMe(MR_DEMOTE,		"demote")
>
>  /*
>   * First define the enums in the above macros to be exported to userspace
> diff --git a/mm/debug.c b/mm/debug.c
> index c0b31b6..cc0d7df 100644
> --- a/mm/debug.c
> +++ b/mm/debug.c
> @@ -25,6 +25,7 @@
>  	"mempolicy_mbind",
>  	"numa_misplaced",
>  	"cma",
> +	"demote",
>  };
>
>  const struct trace_print_flags pageflag_names[] = {
> diff --git a/mm/internal.h b/mm/internal.h
> index 46ad0d8..0152300 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -303,6 +303,19 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask,
>  }
>  #endif
>
> +static inline bool has_nonram_online(void)
> +{
> +	int i = 0;
> +
> +	for_each_online_node(i) {
> +		/* Have PMEM node online? */
> +		if (!node_isset(i, def_alloc_nodemask))
> +			return true;
> +	}
> +
> +	return false;
> +}
> +
>  /* mm/util.c */
>  void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
>  		struct vm_area_struct *prev, struct rb_node *rb_parent);
> @@ -565,5 +578,14 @@ static inline bool is_migrate_highatomic_page(struct page *page)
>  }
>
>  void setup_zone_pageset(struct zone *zone);
> +
> +#ifdef CONFIG_NUMA
>  extern struct page *alloc_new_node_page(struct page *page, unsigned long node);
> +#else
> +static inline struct page *alloc_new_node_page(struct page *page,
> +					       unsigned long node)
> +{
> +	return NULL;
> +}
> +#endif
>  #endif	/* __MM_INTERNAL_H */
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index a5ad0b3..bdcab6b 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1094,6 +1094,19 @@ static void page_check_dirty_writeback(struct page *page,
>  		mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
>  }
>
> +static inline bool is_demote_ok(struct pglist_data *pgdat)
> +{
> +	/* Current node is not DRAM node */
> +	if (!node_isset(pgdat->node_id, def_alloc_nodemask))
> +		return false;
> +
> +	/* No online PMEM node */
> +	if (!has_nonram_online())
> +		return false;
> +
> +	return true;
> +}
> +
>  /*
>   * shrink_page_list() returns the number of reclaimed pages
>   */
> @@ -1106,6 +1119,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
>  {
>  	LIST_HEAD(ret_pages);
>  	LIST_HEAD(free_pages);
> +	LIST_HEAD(demote_pages);
>  	unsigned nr_reclaimed = 0;
>
>  	memset(stat, 0, sizeof(*stat));
> @@ -1262,6 +1276,22 @@ static unsigned long shrink_page_list(struct list_head *page_list,
>  		}
>
>  		/*
> +		 * Demote DRAM pages regardless the mempolicy.
> +		 * Demot anonymous pages only for now and skip MADV_FREE

s/Demot/Demote

> +		 * pages.
> +		 */
> +		if (PageAnon(page) && !PageSwapCache(page) &&
> +		    (node_isset(page_to_nid(page), def_alloc_nodemask)) &&
> +		    PageSwapBacked(page)) {
> +
> +			if (has_nonram_online()) {
> +				list_add(&page->lru, &demote_pages);
> +				unlock_page(page);
> +				continue;
> +			}
> +		}
> +
> +		/*
>  		 * Anonymous process memory has backing store?
>  		 * Try to allocate it some swap space here.
>  		 * Lazyfree page could be freed directly
> @@ -1477,6 +1507,25 @@ static unsigned long shrink_page_list(struct list_head *page_list,
>  		VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
>  	}
>
> +	/* Demote pages to PMEM */
> +	if (!list_empty(&demote_pages)) {
> +		int err, target_nid;
> +		nodemask_t used_mask;
> +
> +		nodes_clear(used_mask);
> +		target_nid = find_next_best_node(pgdat->node_id, &used_mask,
> +						 true);
> +
> +		err = migrate_pages(&demote_pages, alloc_new_node_page, NULL,
> +				    target_nid, MIGRATE_ASYNC, MR_DEMOTE);
> +
> +		if (err) {
> +			putback_movable_pages(&demote_pages);
> +
> +			list_splice(&ret_pages, &demote_pages);
> +		}
> +	}
> +

I like your approach here. It reuses the existing migrate_pages() interface without
adding extra code. I also would like to be CC’d in your future versions.

Thank you.

--
Best Regards,
Yan Zi
Keith Busch March 24, 2019, 10:20 p.m. UTC | #2
On Sat, Mar 23, 2019 at 12:44:31PM +0800, Yang Shi wrote:
>  		/*
> +		 * Demote DRAM pages regardless the mempolicy.
> +		 * Demot anonymous pages only for now and skip MADV_FREE
> +		 * pages.
> +		 */
> +		if (PageAnon(page) && !PageSwapCache(page) &&
> +		    (node_isset(page_to_nid(page), def_alloc_nodemask)) &&
> +		    PageSwapBacked(page)) {
> +
> +			if (has_nonram_online()) {
> +				list_add(&page->lru, &demote_pages);
> +				unlock_page(page);
> +				continue;
> +			}
> +		}
> +
> +		/*
>  		 * Anonymous process memory has backing store?
>  		 * Try to allocate it some swap space here.
>  		 * Lazyfree page could be freed directly
> @@ -1477,6 +1507,25 @@ static unsigned long shrink_page_list(struct list_head *page_list,
>  		VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
>  	}
>  
> +	/* Demote pages to PMEM */
> +	if (!list_empty(&demote_pages)) {
> +		int err, target_nid;
> +		nodemask_t used_mask;
> +
> +		nodes_clear(used_mask);
> +		target_nid = find_next_best_node(pgdat->node_id, &used_mask,
> +						 true);
> +
> +		err = migrate_pages(&demote_pages, alloc_new_node_page, NULL,
> +				    target_nid, MIGRATE_ASYNC, MR_DEMOTE);
> +
> +		if (err) {
> +			putback_movable_pages(&demote_pages);
> +
> +			list_splice(&ret_pages, &demote_pages);
> +		}
> +	}
> +
>  	mem_cgroup_uncharge_list(&free_pages);
>  	try_to_unmap_flush();
>  	free_unref_page_list(&free_pages);

How do these pages eventually get to swap when migration fails? Looks
like that's skipped.

And page cache demotion is useful too; we shouldn't consider only
anonymous pages for this feature.
Yang Shi March 25, 2019, 7:49 p.m. UTC | #3
On 3/24/19 3:20 PM, Keith Busch wrote:
> On Sat, Mar 23, 2019 at 12:44:31PM +0800, Yang Shi wrote:
>>   		/*
>> +		 * Demote DRAM pages regardless the mempolicy.
>> +		 * Demot anonymous pages only for now and skip MADV_FREE
>> +		 * pages.
>> +		 */
>> +		if (PageAnon(page) && !PageSwapCache(page) &&
>> +		    (node_isset(page_to_nid(page), def_alloc_nodemask)) &&
>> +		    PageSwapBacked(page)) {
>> +
>> +			if (has_nonram_online()) {
>> +				list_add(&page->lru, &demote_pages);
>> +				unlock_page(page);
>> +				continue;
>> +			}
>> +		}
>> +
>> +		/*
>>   		 * Anonymous process memory has backing store?
>>   		 * Try to allocate it some swap space here.
>>   		 * Lazyfree page could be freed directly
>> @@ -1477,6 +1507,25 @@ static unsigned long shrink_page_list(struct list_head *page_list,
>>   		VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
>>   	}
>>   
>> +	/* Demote pages to PMEM */
>> +	if (!list_empty(&demote_pages)) {
>> +		int err, target_nid;
>> +		nodemask_t used_mask;
>> +
>> +		nodes_clear(used_mask);
>> +		target_nid = find_next_best_node(pgdat->node_id, &used_mask,
>> +						 true);
>> +
>> +		err = migrate_pages(&demote_pages, alloc_new_node_page, NULL,
>> +				    target_nid, MIGRATE_ASYNC, MR_DEMOTE);
>> +
>> +		if (err) {
>> +			putback_movable_pages(&demote_pages);
>> +
>> +			list_splice(&ret_pages, &demote_pages);
>> +		}
>> +	}
>> +
>>   	mem_cgroup_uncharge_list(&free_pages);
>>   	try_to_unmap_flush();
>>   	free_unref_page_list(&free_pages);
> How do these pages eventually get to swap when migration fails? Looks
> like that's skipped.

Yes, they will just be put back on the LRU. Actually, I don't expect
migration to fail very often at this stage (though I have no test data to
support this hypothesis), since the pages have already been isolated from
the LRU, so other reclaim paths should not find them anymore.

If a page is locked by someone else right before migration, it has likely
been referenced again, so putting it back on the LRU is not bad.

A potential improvement is to have sync migration for kswapd.
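
A minimal sketch of that idea (illustrative only, not part of this series),
assuming we simply pick the migration mode based on whether we are running in
kswapd context:

        /*
         * Hypothetical: kswapd can afford to wait briefly on a page lock,
         * so let it use MIGRATE_SYNC_LIGHT, while direct reclaim keeps
         * using MIGRATE_ASYNC to avoid blocking the allocating task.
         */
        enum migrate_mode mode = current_is_kswapd() ?
                                 MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC;

        err = migrate_pages(&demote_pages, alloc_new_node_page, NULL,
                            target_nid, mode, MR_DEMOTE);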

>
> And page cache demotion is useful too, we shouldn't consider only
> anonymous for this feature.

Yes, definitely. I'm looking into the page cache case now. Any 
suggestion is welcome.

Thanks,
Yang
Yang Shi March 25, 2019, 9:49 p.m. UTC | #4
On 3/22/19 11:03 PM, Zi Yan wrote:
> On 22 Mar 2019, at 21:44, Yang Shi wrote:
>
>> Since PMEM provides larger capacity than DRAM and has much lower
>> access latency than disk, so it is a good choice to use as a middle
>> tier between DRAM and disk in page reclaim path.
>>
>> With PMEM nodes, the demotion path of anonymous pages could be:
>>
>> DRAM -> PMEM -> swap device
>>
>> This patch demotes anonymous pages only for the time being and demote
>> THP to PMEM in a whole.  However this may cause expensive page reclaim
>> and/or compaction on PMEM node if there is memory pressure on it.  But,
>> considering the capacity of PMEM and allocation only happens on PMEM
>> when PMEM is specified explicity, such cases should be not that often.
>> So, it sounds worth keeping THP in a whole instead of splitting it.
>>
>> Demote pages to the cloest non-DRAM node even though the system is
>> swapless.  The current logic of page reclaim just scan anon LRU when
>> swap is on and swappiness is set properly.  Demoting to PMEM doesn't
>> need care whether swap is available or not.  But, reclaiming from PMEM
>> still skip anon LRU is swap is not available.
>>
>> The demotion just happens between DRAM node and its cloest PMEM node.
>> Demoting to a remote PMEM node is not allowed for now.
>>
>> And, define a new migration reason for demotion, called MR_DEMOTE.
>> Demote page via async migration to avoid blocking.
>>
>> Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com>
>> ---
>>   include/linux/migrate.h        |  1 +
>>   include/trace/events/migrate.h |  3 +-
>>   mm/debug.c                     |  1 +
>>   mm/internal.h                  | 22 ++++++++++
>>   mm/vmscan.c                    | 99 ++++++++++++++++++++++++++++++++++--------
>>   5 files changed, 107 insertions(+), 19 deletions(-)
>>
>> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
>> index e13d9bf..78c8dda 100644
>> --- a/include/linux/migrate.h
>> +++ b/include/linux/migrate.h
>> @@ -25,6 +25,7 @@ enum migrate_reason {
>>   	MR_MEMPOLICY_MBIND,
>>   	MR_NUMA_MISPLACED,
>>   	MR_CONTIG_RANGE,
>> +	MR_DEMOTE,
>>   	MR_TYPES
>>   };
>>
>> diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
>> index 705b33d..c1d5b36 100644
>> --- a/include/trace/events/migrate.h
>> +++ b/include/trace/events/migrate.h
>> @@ -20,7 +20,8 @@
>>   	EM( MR_SYSCALL,		"syscall_or_cpuset")		\
>>   	EM( MR_MEMPOLICY_MBIND,	"mempolicy_mbind")		\
>>   	EM( MR_NUMA_MISPLACED,	"numa_misplaced")		\
>> -	EMe(MR_CONTIG_RANGE,	"contig_range")
>> +	EM( MR_CONTIG_RANGE,	"contig_range")			\
>> +	EMe(MR_DEMOTE,		"demote")
>>
>>   /*
>>    * First define the enums in the above macros to be exported to userspace
>> diff --git a/mm/debug.c b/mm/debug.c
>> index c0b31b6..cc0d7df 100644
>> --- a/mm/debug.c
>> +++ b/mm/debug.c
>> @@ -25,6 +25,7 @@
>>   	"mempolicy_mbind",
>>   	"numa_misplaced",
>>   	"cma",
>> +	"demote",
>>   };
>>
>>   const struct trace_print_flags pageflag_names[] = {
>> diff --git a/mm/internal.h b/mm/internal.h
>> index 46ad0d8..0152300 100644
>> --- a/mm/internal.h
>> +++ b/mm/internal.h
>> @@ -303,6 +303,19 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask,
>>   }
>>   #endif
>>
>> +static inline bool has_nonram_online(void)
>> +{
>> +	int i = 0;
>> +
>> +	for_each_online_node(i) {
>> +		/* Have PMEM node online? */
>> +		if (!node_isset(i, def_alloc_nodemask))
>> +			return true;
>> +	}
>> +
>> +	return false;
>> +}
>> +
>>   /* mm/util.c */
>>   void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
>>   		struct vm_area_struct *prev, struct rb_node *rb_parent);
>> @@ -565,5 +578,14 @@ static inline bool is_migrate_highatomic_page(struct page *page)
>>   }
>>
>>   void setup_zone_pageset(struct zone *zone);
>> +
>> +#ifdef CONFIG_NUMA
>>   extern struct page *alloc_new_node_page(struct page *page, unsigned long node);
>> +#else
>> +static inline struct page *alloc_new_node_page(struct page *page,
>> +					       unsigned long node)
>> +{
>> +	return NULL;
>> +}
>> +#endif
>>   #endif	/* __MM_INTERNAL_H */
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index a5ad0b3..bdcab6b 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -1094,6 +1094,19 @@ static void page_check_dirty_writeback(struct page *page,
>>   		mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
>>   }
>>
>> +static inline bool is_demote_ok(struct pglist_data *pgdat)
>> +{
>> +	/* Current node is not DRAM node */
>> +	if (!node_isset(pgdat->node_id, def_alloc_nodemask))
>> +		return false;
>> +
>> +	/* No online PMEM node */
>> +	if (!has_nonram_online())
>> +		return false;
>> +
>> +	return true;
>> +}
>> +
>>   /*
>>    * shrink_page_list() returns the number of reclaimed pages
>>    */
>> @@ -1106,6 +1119,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
>>   {
>>   	LIST_HEAD(ret_pages);
>>   	LIST_HEAD(free_pages);
>> +	LIST_HEAD(demote_pages);
>>   	unsigned nr_reclaimed = 0;
>>
>>   	memset(stat, 0, sizeof(*stat));
>> @@ -1262,6 +1276,22 @@ static unsigned long shrink_page_list(struct list_head *page_list,
>>   		}
>>
>>   		/*
>> +		 * Demote DRAM pages regardless the mempolicy.
>> +		 * Demot anonymous pages only for now and skip MADV_FREE
> s/Demot/Demote

Thanks for catching this. Will fix.

>
>> +		 * pages.
>> +		 */
>> +		if (PageAnon(page) && !PageSwapCache(page) &&
>> +		    (node_isset(page_to_nid(page), def_alloc_nodemask)) &&
>> +		    PageSwapBacked(page)) {
>> +
>> +			if (has_nonram_online()) {
>> +				list_add(&page->lru, &demote_pages);
>> +				unlock_page(page);
>> +				continue;
>> +			}
>> +		}
>> +
>> +		/*
>>   		 * Anonymous process memory has backing store?
>>   		 * Try to allocate it some swap space here.
>>   		 * Lazyfree page could be freed directly
>> @@ -1477,6 +1507,25 @@ static unsigned long shrink_page_list(struct list_head *page_list,
>>   		VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
>>   	}
>>
>> +	/* Demote pages to PMEM */
>> +	if (!list_empty(&demote_pages)) {
>> +		int err, target_nid;
>> +		nodemask_t used_mask;
>> +
>> +		nodes_clear(used_mask);
>> +		target_nid = find_next_best_node(pgdat->node_id, &used_mask,
>> +						 true);
>> +
>> +		err = migrate_pages(&demote_pages, alloc_new_node_page, NULL,
>> +				    target_nid, MIGRATE_ASYNC, MR_DEMOTE);
>> +
>> +		if (err) {
>> +			putback_movable_pages(&demote_pages);
>> +
>> +			list_splice(&ret_pages, &demote_pages);
>> +		}
>> +	}
>> +
> I like your approach here. It reuses the existing migrate_pages() interface without
> adding extra code. I also would like to be CC’d in your future versions.

Yes, sure.

Thanks,
Yang

>
> Thank you.
>
> --
> Best Regards,
> Yan Zi
Keith Busch March 27, 2019, 12:35 a.m. UTC | #5
On Mon, Mar 25, 2019 at 12:49:21PM -0700, Yang Shi wrote:
> On 3/24/19 3:20 PM, Keith Busch wrote:
> > How do these pages eventually get to swap when migration fails? Looks
> > like that's skipped.
> 
> Yes, they will be just put back to LRU. Actually, I don't expect it would be
> very often to have migration fail at this stage (but I have no test data to
> support this hypothesis) since the pages have been isolated from LRU, so
> other reclaim path should not find them anymore.
> 
> If it is locked by someone else right before migration, it is likely
> referenced again, so putting back to LRU sounds not bad.
> 
> A potential improvement is to have sync migration for kswapd.

Well, it's not that migration fails only if the page is recently
referenced. Migration would fail if there isn't available memory in
the migration node, so this implementation carries an expectation that
migration nodes have higher free capacity than source nodes. And since
you're attempting THPs without ever splitting them, that also requires
lower fragmentation for a successful migration.

Applications, however, may allocate and pin pages directly out of that
migration node to the point it does not have so much free capacity or
physical contiguity, so we probably shouldn't assume it's the only way
to reclaim pages.
Yang Shi March 27, 2019, 3:41 a.m. UTC | #6
On 3/26/19 5:35 PM, Keith Busch wrote:
> On Mon, Mar 25, 2019 at 12:49:21PM -0700, Yang Shi wrote:
>> On 3/24/19 3:20 PM, Keith Busch wrote:
>>> How do these pages eventually get to swap when migration fails? Looks
>>> like that's skipped.
>> Yes, they will be just put back to LRU. Actually, I don't expect it would be
>> very often to have migration fail at this stage (but I have no test data to
>> support this hypothesis) since the pages have been isolated from LRU, so
>> other reclaim path should not find them anymore.
>>
>> If it is locked by someone else right before migration, it is likely
>> referenced again, so putting back to LRU sounds not bad.
>>
>> A potential improvement is to have sync migration for kswapd.
> Well, it's not that migration fails only if the page is recently
> referenced. Migration would fail if there isn't available memory in
> the migration node, so this implementation carries an expectation that
> migration nodes have higher free capacity than source nodes. And since
> your attempting THP's without ever splitting them, that also requires
> lower fragmentation for a successful migration.

Yes, it is possible. However, migrate_pages() already has logic to
handle such cases. If the target node does not have enough space to migrate
a THP as a whole, it will split the THP and then retry with base pages.
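
The fallback being referred to is the -ENOMEM handling in migrate_pages()'s
retry loop; a simplified, paraphrased sketch (not the verbatim kernel source)
of that split-and-retry logic:

        switch (rc) {
        case -ENOMEM:
                /*
                 * THP migration may be unsupported, or the target node may
                 * lack a contiguous free area: split the THP back onto the
                 * source list and let the base pages be retried.
                 */
                if (PageTransHuge(page)) {
                        lock_page(page);
                        rc = split_huge_page_to_list(page, from);
                        unlock_page(page);
                        if (!rc) {
                                list_safe_reset_next(page, page2, lru);
                                goto retry;
                        }
                }
                nr_failed++;
                goto out;
        /* -EAGAIN, success, etc. are handled by the other cases */
        }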

Swapping out THP has been optimized to go out as a whole too. It tries to
add the THP into the swap cache as a whole, splits the THP if that attempt
fails, then adds the base pages into the swap cache.

So, I think we can leave this to migrate_pages() without splitting in 
advance all the time.

Thanks,
Yang

>
> Applications, however, may allocate and pin pages directly out of that
> migration node to the point it does not have so much free capacity or
> physical continuity, so we probably shouldn't assume it's the only way
> to reclaim pages.
Keith Busch March 27, 2019, 1:08 p.m. UTC | #7
On Tue, Mar 26, 2019 at 08:41:15PM -0700, Yang Shi wrote:
> On 3/26/19 5:35 PM, Keith Busch wrote:
> > migration nodes have higher free capacity than source nodes. And since
> > your attempting THP's without ever splitting them, that also requires
> > lower fragmentation for a successful migration.
> 
> Yes, it is possible. However, migrate_pages() already has logic to 
> handle such case. If the target node has not enough space for migrating 
> THP in a whole, it would split THP then retry with base pages.

Oh, you're right, my mistake on splitting. So you have a good best-effort
migration, but I still think it can fail for legitimate reasons that should
have a swap fallback.
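
One possible shape of such a fallback, sketched with hypothetical names
(demote_page_list(), do_demote_pass and the retry label are not in this patch):
pages that fail to demote stay on the reclaim list and continue down the
normal swap path instead of being put straight back on the LRU.

        if (do_demote_pass && !list_empty(&demote_pages)) {
                /*
                 * Hypothetical helper wrapping find_next_best_node() +
                 * migrate_pages(); pages that could not be demoted are
                 * left on demote_pages.
                 */
                nr_reclaimed += demote_page_list(&demote_pages, pgdat);

                if (!list_empty(&demote_pages)) {
                        /*
                         * Give the leftovers a second pass through the
                         * ordinary reclaim path so they can be swapped
                         * out, and disable demotion for that pass.
                         */
                        list_splice_init(&demote_pages, page_list);
                        do_demote_pass = false;
                        goto retry;
                }
        }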
Zi Yan March 27, 2019, 5 p.m. UTC | #8
On 27 Mar 2019, at 6:08, Keith Busch wrote:

> On Tue, Mar 26, 2019 at 08:41:15PM -0700, Yang Shi wrote:
>> On 3/26/19 5:35 PM, Keith Busch wrote:
>>> migration nodes have higher free capacity than source nodes. And since
>>> your attempting THP's without ever splitting them, that also requires
>>> lower fragmentation for a successful migration.
>>
>> Yes, it is possible. However, migrate_pages() already has logic to
>> handle such case. If the target node has not enough space for migrating
>> THP in a whole, it would split THP then retry with base pages.
>
> Oh, you're right, my mistake on splitting. So you have a good best effort
> migrate, but I still think it can fail for legitimate reasons that should
> have a swap fallback.

Does this mean we might want to factor out the page reclaim code in shrink_page_list()
and call it for each page that fails to migrate to PMEM? Or do you still prefer
to migrate one page at a time, as you did in your patch?

I ask this because I observe that migrating a list of pages can achieve higher
throughput than migrating pages individually. For example, migrating 512 4KB
pages can achieve ~750MB/s throughput, whereas migrating one 4KB page might only
achieve ~40MB/s throughput. The experiments were done on a two-socket machine
with two Xeon E5-2650 v3 @ 2.30GHz across the QPI link.
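
For clarity, the two call patterns being compared look roughly like this (nid,
the page list and the reason code are just placeholders):

        /* migrating one page at a time: one migrate_pages() call per page */
        list_for_each_entry_safe(page, page2, &pages, lru) {
                LIST_HEAD(single);

                list_move(&page->lru, &single);
                migrate_pages(&single, alloc_new_node_page, NULL,
                              nid, MIGRATE_SYNC, MR_SYSCALL);
        }

        /* migrating a list of pages: one call for the whole list */
        migrate_pages(&pages, alloc_new_node_page, NULL,
                      nid, MIGRATE_SYNC, MR_SYSCALL);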


--
Best Regards,
Yan Zi
Dave Hansen March 27, 2019, 5:05 p.m. UTC | #9
On 3/27/19 10:00 AM, Zi Yan wrote:
> I ask this because I observe that migrating a list of pages can
> achieve higher throughput compared to migrating individual page.
> For example, migrating 512 4KB pages can achieve ~750MB/s
> throughput, whereas migrating one 4KB page might only achieve
> ~40MB/s throughput. The experiments were done on a two-socket
> machine with two Xeon E5-2650 v3 @ 2.30GHz across the QPI link.

What kind of migration?

If you're talking about doing sys_migrate_pages() one page at a time,
that's a world away from doing something inside of the kernel one page
at a time.
Zi Yan March 27, 2019, 5:48 p.m. UTC | #10
On 27 Mar 2019, at 10:05, Dave Hansen wrote:

> On 3/27/19 10:00 AM, Zi Yan wrote:
>> I ask this because I observe that migrating a list of pages can
>> achieve higher throughput compared to migrating individual page.
>> For example, migrating 512 4KB pages can achieve ~750MB/s
>> throughput, whereas migrating one 4KB page might only achieve
>> ~40MB/s throughput. The experiments were done on a two-socket
>> machine with two Xeon E5-2650 v3 @ 2.30GHz across the QPI link.
>
> What kind of migration?
>
> If you're talking about doing sys_migrate_pages() one page at a time,
> that's a world away from doing something inside of the kernel one page
> at a time.

The 40MB/s vs 750MB/s numbers were measured through sys_migrate_pages(); sorry
about the confusion there. When I measure only the in-kernel migrate_pages(),
the throughput becomes:
migrating one 4KB page: 0.312GB/s vs migrating 512 4KB pages: 0.854GB/s.
That is still a >2x difference.

Furthermore, if we only consider the migrate_page_copy() in mm/migrate.c,
which only calls copy_highpage() and migrate_page_states(), the throughput
becomes:
migrating 4KB page: 1.385GB/s vs migrating 512 4KB pages: 1.983GB/s.
The gap is smaller, but migrating 512 4KB pages still achieves 40% more
throughput.

Do these numbers make sense to you?

--
Best Regards,
Yan Zi
Dave Hansen March 27, 2019, 6 p.m. UTC | #11
On 3/27/19 10:48 AM, Zi Yan wrote:
> For 40MB/s vs 750MB/s, they were using sys_migrate_pages(). Sorry
> about the confusion there. As I measure only the migrate_pages() in
> the kernel, the throughput becomes: migrating 4KB page: 0.312GB/s
> vs migrating 512 4KB pages: 0.854GB/s. They are still >2x
> difference.
> 
> Furthermore, if we only consider the migrate_page_copy() in
> mm/migrate.c, which only calls copy_highpage() and
> migrate_page_states(), the throughput becomes: migrating 4KB page:
> 1.385GB/s vs migrating 512 4KB pages: 1.983GB/s. The gap is
> smaller, but migrating 512 4KB pages still achieves 40% more 
> throughput.
> 
> Do these numbers make sense to you?

Yes.  It would be very interesting to batch the migrations in the
kernel and see how it affects the code.  A 50% boost is interesting,
but not if it's only in microbenchmarks and takes 2k lines of code.

50% is *very* interesting if it happens in the real world and we can
do it in 10 lines of code.

So, let's see what the code looks like.
Zi Yan March 27, 2019, 8:37 p.m. UTC | #12
On 27 Mar 2019, at 11:00, Dave Hansen wrote:

> On 3/27/19 10:48 AM, Zi Yan wrote:
>> For 40MB/s vs 750MB/s, they were using sys_migrate_pages(). Sorry
>> about the confusion there. As I measure only the migrate_pages() in
>> the kernel, the throughput becomes: migrating 4KB page: 0.312GB/s
>> vs migrating 512 4KB pages: 0.854GB/s. They are still >2x
>> difference.
>>
>> Furthermore, if we only consider the migrate_page_copy() in
>> mm/migrate.c, which only calls copy_highpage() and
>> migrate_page_states(), the throughput becomes: migrating 4KB page:
>> 1.385GB/s vs migrating 512 4KB pages: 1.983GB/s. The gap is
>> smaller, but migrating 512 4KB pages still achieves 40% more
>> throughput.
>>
>> Do these numbers make sense to you?
>
> Yes.  It would be very interesting to batch the migrations in the
> kernel and see how it affects the code.  A 50% boost is interesting,
> but not if it's only in microbenchmarks and takes 2k lines of code.
>
> 50% is *very* interesting if it happens in the real world and we can
> do it in 10 lines of code.
>
> So, let's see what the code looks like.

Actually, the migration throughput difference does not come from any kernel
changes; it is a pure comparison between migrate_pages(single 4KB page) and
migrate_pages(a list of 4KB pages). The point I wanted to make is that
Yang’s approach, which migrates a list of pages at the end of shrink_page_list(),
can achieve higher throughput than Keith’s approach, which migrates one page
at a time in the while loop inside shrink_page_list().

In addition to the above, migrating a single THP can get us even higher throughput.
Here are the throughput numbers comparing all three cases:
                             |  migrate_pages() |    migrate_page_copy()
migrating single 4KB page:   |  0.312GB/s       |   1.385GB/s
migrating 512 4KB pages:     |  0.854GB/s       |   1.983GB/s
migrating single 2MB THP:    |  2.387GB/s       |   2.481GB/s

Obviously, we would like to migrate THPs as a whole instead of 512 4KB pages
individually. Of course, this assumes we have free space in PMEM for THPs and
all subpages in the THPs are cold.


To batch the migration, I posted some code a while ago: https://lwn.net/Articles/714991/,
which shows good throughput improvement when microbenchmarking sys_migrate_pages().
It also included using multiple threads to copy a page, aggregating multiple
migrate_page_copy() calls, and even using DMA instead of the CPU to copy data.
We could revisit that code if necessary.

In terms of end-to-end results, I also have some numbers from my paper:
http://www.cs.yale.edu/homes/abhishek/ziyan-asplos19.pdf (Figures 8 to 11 show the
microbenchmark results and Figure 12 shows the end-to-end results). I basically called
shrink_active/inactive_list() every 5 seconds to track page hotness and used all of the
page migration optimizations above, which gets a 40% application runtime speedup on
average. The experiments were done on a two-socket NUMA machine where one node was
slowed down to 1/2 the bandwidth and 2x the access latency of the other node. I can
discuss it more if you are interested.


--
Best Regards,
Yan Zi
Dave Hansen March 27, 2019, 8:42 p.m. UTC | #13
On 3/27/19 1:37 PM, Zi Yan wrote:
> Actually, the migration throughput difference does not come from
> any kernel changes, it is a pure comparison between
> migrate_pages(single 4KB page) and migrate_pages(a list of 4KB
> pages). The point I wanted to make is that Yang’s approach, which
> migrates a list of pages at the end of shrink_page_list(), can
> achieve higher throughput than Keith’s approach, which migrates one
> page at a time in the while loop inside shrink_page_list().

I look forward to seeing the patches.
Yang Shi March 28, 2019, 9:59 p.m. UTC | #14
On 3/27/19 6:08 AM, Keith Busch wrote:
> On Tue, Mar 26, 2019 at 08:41:15PM -0700, Yang Shi wrote:
>> On 3/26/19 5:35 PM, Keith Busch wrote:
>>> migration nodes have higher free capacity than source nodes. And since
>>> your attempting THP's without ever splitting them, that also requires
>>> lower fragmentation for a successful migration.
>> Yes, it is possible. However, migrate_pages() already has logic to
>> handle such case. If the target node has not enough space for migrating
>> THP in a whole, it would split THP then retry with base pages.
> Oh, you're right, my mistake on splitting. So you have a good best effort
> migrate, but I still think it can fail for legitimate reasons that should
> have a swap fallback.

Yes, it still could fail. I can't tell which way is better for now. Off
the top of my head, I just thought scanning another round and then
migrating should still be faster than swapping.

Thanks,
Yang
Keith Busch March 28, 2019, 10:45 p.m. UTC | #15
On Thu, Mar 28, 2019 at 02:59:30PM -0700, Yang Shi wrote:
> Yes, it still could fail. I can't tell which way is better for now. I 
> just thought scanning another round then migrating should be still 
> faster than swapping off the top of my head.

I think it depends on the relative capacities between your primary and
migration tiers and how it's used. Applications may allocate and pin
directly out of pmem if they wish, so it's not a dedicated fallback
memory space like swap.

Patch

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index e13d9bf..78c8dda 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -25,6 +25,7 @@  enum migrate_reason {
 	MR_MEMPOLICY_MBIND,
 	MR_NUMA_MISPLACED,
 	MR_CONTIG_RANGE,
+	MR_DEMOTE,
 	MR_TYPES
 };
 
diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
index 705b33d..c1d5b36 100644
--- a/include/trace/events/migrate.h
+++ b/include/trace/events/migrate.h
@@ -20,7 +20,8 @@ 
 	EM( MR_SYSCALL,		"syscall_or_cpuset")		\
 	EM( MR_MEMPOLICY_MBIND,	"mempolicy_mbind")		\
 	EM( MR_NUMA_MISPLACED,	"numa_misplaced")		\
-	EMe(MR_CONTIG_RANGE,	"contig_range")
+	EM( MR_CONTIG_RANGE,	"contig_range")			\
+	EMe(MR_DEMOTE,		"demote")
 
 /*
  * First define the enums in the above macros to be exported to userspace
diff --git a/mm/debug.c b/mm/debug.c
index c0b31b6..cc0d7df 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -25,6 +25,7 @@ 
 	"mempolicy_mbind",
 	"numa_misplaced",
 	"cma",
+	"demote",
 };
 
 const struct trace_print_flags pageflag_names[] = {
diff --git a/mm/internal.h b/mm/internal.h
index 46ad0d8..0152300 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -303,6 +303,19 @@  static inline int find_next_best_node(int node, nodemask_t *used_node_mask,
 }
 #endif
 
+static inline bool has_nonram_online(void)
+{
+	int i = 0;
+
+	for_each_online_node(i) {
+		/* Have PMEM node online? */
+		if (!node_isset(i, def_alloc_nodemask))
+			return true;
+	}
+
+	return false;
+}
+
 /* mm/util.c */
 void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
 		struct vm_area_struct *prev, struct rb_node *rb_parent);
@@ -565,5 +578,14 @@  static inline bool is_migrate_highatomic_page(struct page *page)
 }
 
 void setup_zone_pageset(struct zone *zone);
+
+#ifdef CONFIG_NUMA
 extern struct page *alloc_new_node_page(struct page *page, unsigned long node);
+#else
+static inline struct page *alloc_new_node_page(struct page *page,
+					       unsigned long node)
+{
+	return NULL;
+}
+#endif
 #endif	/* __MM_INTERNAL_H */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a5ad0b3..bdcab6b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1094,6 +1094,19 @@  static void page_check_dirty_writeback(struct page *page,
 		mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
 }
 
+static inline bool is_demote_ok(struct pglist_data *pgdat)
+{
+	/* Current node is not DRAM node */
+	if (!node_isset(pgdat->node_id, def_alloc_nodemask))
+		return false;
+
+	/* No online PMEM node */
+	if (!has_nonram_online())
+		return false;
+
+	return true;
+}
+
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
@@ -1106,6 +1119,7 @@  static unsigned long shrink_page_list(struct list_head *page_list,
 {
 	LIST_HEAD(ret_pages);
 	LIST_HEAD(free_pages);
+	LIST_HEAD(demote_pages);
 	unsigned nr_reclaimed = 0;
 
 	memset(stat, 0, sizeof(*stat));
@@ -1262,6 +1276,22 @@  static unsigned long shrink_page_list(struct list_head *page_list,
 		}
 
 		/*
+		 * Demote DRAM pages regardless of the mempolicy.
+		 * Demote anonymous pages only for now and skip MADV_FREE
+		 * pages.
+		 */
+		if (PageAnon(page) && !PageSwapCache(page) &&
+		    (node_isset(page_to_nid(page), def_alloc_nodemask)) &&
+		    PageSwapBacked(page)) {
+
+			if (has_nonram_online()) {
+				list_add(&page->lru, &demote_pages);
+				unlock_page(page);
+				continue;
+			}
+		}
+
+		/*
 		 * Anonymous process memory has backing store?
 		 * Try to allocate it some swap space here.
 		 * Lazyfree page could be freed directly
@@ -1477,6 +1507,25 @@  static unsigned long shrink_page_list(struct list_head *page_list,
 		VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
 	}
 
+	/* Demote pages to PMEM */
+	if (!list_empty(&demote_pages)) {
+		int err, target_nid;
+		nodemask_t used_mask;
+
+		nodes_clear(used_mask);
+		target_nid = find_next_best_node(pgdat->node_id, &used_mask,
+						 true);
+
+		err = migrate_pages(&demote_pages, alloc_new_node_page, NULL,
+				    target_nid, MIGRATE_ASYNC, MR_DEMOTE);
+
+		if (err) {
+			putback_movable_pages(&demote_pages);
+
+			list_splice(&ret_pages, &demote_pages);
+		}
+	}
+
 	mem_cgroup_uncharge_list(&free_pages);
 	try_to_unmap_flush();
 	free_unref_page_list(&free_pages);
@@ -2188,10 +2237,11 @@  static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
 	unsigned long gb;
 
 	/*
-	 * If we don't have swap space, anonymous page deactivation
-	 * is pointless.
+	 * If we don't have swap space or PMEM online, anonymous page
+	 * deactivation is pointless.
 	 */
-	if (!file && !total_swap_pages)
+	if (!file && !total_swap_pages &&
+	    !is_demote_ok(pgdat))
 		return false;
 
 	inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
@@ -2271,22 +2321,34 @@  static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	unsigned long ap, fp;
 	enum lru_list lru;
 
-	/* If we have no swap space, do not bother scanning anon pages. */
-	if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
-		scan_balance = SCAN_FILE;
-		goto out;
-	}
-
 	/*
-	 * Global reclaim will swap to prevent OOM even with no
-	 * swappiness, but memcg users want to use this knob to
-	 * disable swapping for individual groups completely when
-	 * using the memory controller's swap limit feature would be
-	 * too expensive.
+	 * Anon pages can be demoted to PMEM. If there is a PMEM node online,
+	 * still scan the anonymous LRU even though the system is swapless or
+	 * swapping is disabled by memcg.
+	 *
+	 * If current node is already PMEM node, demotion is not applicable.
 	 */
-	if (!global_reclaim(sc) && !swappiness) {
-		scan_balance = SCAN_FILE;
-		goto out;
+	if (!is_demote_ok(pgdat)) {
+		/*
+		 * If we have no swap space, do not bother scanning
+		 * anon pages.
+		 */
+		if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
+			scan_balance = SCAN_FILE;
+			goto out;
+		}
+
+		/*
+		 * Global reclaim will swap to prevent OOM even with no
+		 * swappiness, but memcg users want to use this knob to
+		 * disable swapping for individual groups completely when
+		 * using the memory controller's swap limit feature would be
+		 * too expensive.
+		 */
+		if (!global_reclaim(sc) && !swappiness) {
+			scan_balance = SCAN_FILE;
+			goto out;
+		}
 	}
 
 	/*
@@ -3332,7 +3394,8 @@  static void age_active_anon(struct pglist_data *pgdat,
 {
 	struct mem_cgroup *memcg;
 
-	if (!total_swap_pages)
+	/* Aging anon page as long as demotion is fine */
+	if (!total_swap_pages && !is_demote_ok(pgdat))
 		return;
 
 	memcg = mem_cgroup_iter(NULL, NULL, NULL);