diff mbox series

mm: sort freelist by rank number

Message ID 1596435031-41837-1-git-send-email-pullip.cho@samsung.com (mailing list archive)
State New, archived
Headers show
Series mm: sort freelist by rank number | expand

Commit Message

Cho KyongHo Aug. 3, 2020, 6:10 a.m. UTC
From: Cho KyongHo <pullip.cho@samsung.com>

LPDDR5 introduces rank switch delay. If three successive DRAM accesses
happens and the first and the second ones access one rank and the last
access happens on the other rank, the latency of the last access will
be longer than the second one.
To address this panelty, we can sort the freelist so that a specific
rank is allocated prior to another rank. We expect the page allocator
can allocate the pages from the same rank successively with this
change. It will hopefully improves the proportion of the consecutive
memory accesses to the same rank.

Signed-off-by: Cho KyongHo <pullip.cho@samsung.com>
---
 mm/Kconfig      |  23 +++++++++++
 mm/compaction.c |   6 +--
 mm/internal.h   |  11 ++++++
 mm/page_alloc.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++-------
 mm/shuffle.h    |   6 ++-
 5 files changed, 144 insertions(+), 21 deletions(-)

Comments

David Hildenbrand Aug. 3, 2020, 7:57 a.m. UTC | #1
On 03.08.20 08:10, pullip.cho@samsung.com wrote:
> From: Cho KyongHo <pullip.cho@samsung.com>
> 
> LPDDR5 introduces rank switch delay. If three successive DRAM accesses
> happens and the first and the second ones access one rank and the last
> access happens on the other rank, the latency of the last access will
> be longer than the second one.
> To address this panelty, we can sort the freelist so that a specific
> rank is allocated prior to another rank. We expect the page allocator
> can allocate the pages from the same rank successively with this
> change. It will hopefully improves the proportion of the consecutive
> memory accesses to the same rank.

This certainly needs performance numbers to justify ... and I am sorry,
"hopefully improves" is not a valid justification :)

I can imagine that this works well initially, when there hasn't been a
lot of memory fragmentation going on. But quickly after your system is
under stress, I doubt this will be very useful. Proof me wrong. ;)

... I dislike this manual setting of "dram_rank_granule". Yet another mm
feature that can only be enabled by a magic command line parameter where
users have to guess the right values.

(side note, there have been similar research approaches to improve
energy consumption by switching off ranks when not needed).

> 
> Signed-off-by: Cho KyongHo <pullip.cho@samsung.com>
> ---
>  mm/Kconfig      |  23 +++++++++++
>  mm/compaction.c |   6 +--
>  mm/internal.h   |  11 ++++++
>  mm/page_alloc.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++-------
>  mm/shuffle.h    |   6 ++-
>  5 files changed, 144 insertions(+), 21 deletions(-)
> 
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 6c97488..789c02b 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -868,4 +868,27 @@ config ARCH_HAS_HUGEPD
>  config MAPPING_DIRTY_HELPERS
>          bool
>  
> +config RANK_SORTED_FREELIST
> +	bool "Prefer allocating free pages in a half of DRAM ranks"
> +
> +	help
> +	  Rank switch delay in DRAM access become larger in LPDDR5 than before.
> +	  If two successive memory accesses happen on different ranks in LPDDR5,
> +	  the latency of the second access becomes longer due to the rank switch
> +	  overhead. This is a power-performance tradeoff achieved in LPDDR5.
> +	  By default, sorting freelist by rank number is disabled even though
> +	  RANK_SORTED_FREELIST is set to y. To enable, set "dram_rank_granule"
> +	  boot argument to a larger or an equal value to pageblock_nr_pages. The
> +	  values should be the exact the rank interleaving granule that your
> +	  system is using. The rank interleaving granule is 2^(the lowest CS bit
> +	  number). CS stands for Chip Select and is also called SS which stands
> +	  for Slave Select.
> +	  This is not beneficial to single rank memory system. Also this is not
> +	  necessary to quad rank and octal rank memory systems because they are
> +	  not in LPDDR5 specifications.
> +
> +	  This is marked experimental because this disables freelist shuffling
> +	  (SHUFFLE_PAGE_ALLOCATOR). Also you should set the correct rank
> +	  interleaving granule.
> +
>  endmenu
> diff --git a/mm/compaction.c b/mm/compaction.c
> index 061dacf..bee9567 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -1218,8 +1218,7 @@ move_freelist_head(struct list_head *freelist, struct page *freepage)
>  
>  	if (!list_is_last(freelist, &freepage->lru)) {
>  		list_cut_before(&sublist, freelist, &freepage->lru);
> -		if (!list_empty(&sublist))
> -			list_splice_tail(&sublist, freelist);
> +		freelist_splice_tail(&sublist, freelist);
>  	}
>  }
>  
> @@ -1236,8 +1235,7 @@ move_freelist_tail(struct list_head *freelist, struct page *freepage)
>  
>  	if (!list_is_first(freelist, &freepage->lru)) {
>  		list_cut_position(&sublist, freelist, &freepage->lru);
> -		if (!list_empty(&sublist))
> -			list_splice_tail(&sublist, freelist);
> +		freelist_splice_tail(&sublist, freelist);
>  	}
>  }
>  
> diff --git a/mm/internal.h b/mm/internal.h
> index 10c6776..c945b3d 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -266,6 +266,17 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
>  
>  #endif
>  
> +#ifdef CONFIG_RANK_SORTED_FREELIST
> +void freelist_splice_tail(struct list_head *sublist, struct list_head *freelist);
> +#else
> +#include <linux/list.h>
> +static inline
> +void freelist_splice_tail(struct list_head *sublist, struct list_head *freelist)
> +{
> +	if (!list_empty(sublist))
> +		list_splice_tail(sublist, freelist);
> +}
> +#endif
>  /*
>   * This function returns the order of a free page in the buddy system. In
>   * general, page_zone(page)->lock must be held by the caller to prevent the
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 2824e116..7823a3b 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -854,6 +854,69 @@ compaction_capture(struct capture_control *capc, struct page *page,
>  }
>  #endif /* CONFIG_COMPACTION */
>  
> +#ifdef CONFIG_RANK_SORTED_FREELIST
> +static unsigned long dram_rank_nr_pages __read_mostly;
> +
> +static inline bool preferred_rank_enabled(void)
> +{
> +	return dram_rank_nr_pages >= pageblock_nr_pages;
> +}
> +
> +static int __init dram_rank_granule(char *buf)
> +{
> +	unsigned long val = (unsigned long)(memparse(buf, NULL) / PAGE_SIZE);
> +
> +	if (val < pageblock_nr_pages) {
> +		pr_err("too small rank granule %lu\n", val);
> +		return -EINVAL;
> +	}
> +
> +	dram_rank_nr_pages = val;
> +
> +	return 0;
> +}
> +
> +early_param("dram_rank_granule", dram_rank_granule);
> +
> +static inline bool __preferred_rank(struct page *page)
> +{
> +	return !(page_to_pfn(page) & dram_rank_nr_pages);
> +}
> +
> +static inline bool preferred_rank(struct page *page)
> +{
> +	return !preferred_rank_enabled() || __preferred_rank(page);
> +}
> +
> +void freelist_splice_tail(struct list_head *sublist, struct list_head *freelist)
> +{
> +	while (!list_empty(sublist)) {
> +		struct page *page;
> +
> +		page = list_first_entry(sublist, struct page, lru);
> +		if (!preferred_rank_enabled() || !__preferred_rank(page))
> +			list_move_tail(&page->lru, freelist);
> +		else
> +			list_move(&page->lru, freelist);
> +	}
> +}
> +#else
> +static inline bool __preferred_rank(struct page *page)
> +{
> +	return true;
> +}
> +
> +static inline bool preferred_rank(struct page *page)
> +{
> +	return true;
> +}
> +
> +static inline bool preferred_rank_enabled(void)
> +{
> +	return false;
> +}
> +#endif
> +
>  /* Used for pages not on another list */
>  static inline void add_to_free_list(struct page *page, struct zone *zone,
>  				    unsigned int order, int migratetype)
> @@ -880,7 +943,10 @@ static inline void move_to_free_list(struct page *page, struct zone *zone,
>  {
>  	struct free_area *area = &zone->free_area[order];
>  
> -	list_move(&page->lru, &area->free_list[migratetype]);
> +	if (preferred_rank(page))
> +		list_move(&page->lru, &area->free_list[migratetype]);
> +	else
> +		list_move_tail(&page->lru, &area->free_list[migratetype]);
>  }
>  
>  static inline void del_page_from_free_list(struct page *page, struct zone *zone,
> @@ -1029,7 +1095,9 @@ static inline void __free_one_page(struct page *page,
>  done_merging:
>  	set_page_order(page, order);
>  
> -	if (is_shuffle_order(order))
> +	if (preferred_rank_enabled())
> +		to_tail = !__preferred_rank(page);
> +	else if (is_shuffle_order(order))
>  		to_tail = shuffle_pick_tail();
>  	else
>  		to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
> @@ -2257,20 +2325,29 @@ static __always_inline
>  struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
>  						int migratetype)
>  {
> +	int retry = preferred_rank_enabled() ? 2 : 1;
>  	unsigned int current_order;
>  	struct free_area *area;
>  	struct page *page;
>  
> -	/* Find a page of the appropriate size in the preferred list */
> -	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
> -		area = &(zone->free_area[current_order]);
> -		page = get_page_from_free_area(area, migratetype);
> -		if (!page)
> -			continue;
> -		del_page_from_free_list(page, zone, current_order);
> -		expand(zone, page, order, current_order, migratetype);
> -		set_pcppage_migratetype(page, migratetype);
> -		return page;
> +	while (retry-- > 0) {
> +		/* Find a page of the appropriate size in the preferred list */
> +		for (current_order = order; current_order < MAX_ORDER; ++current_order) {
> +			area = &zone->free_area[current_order];
> +			page = get_page_from_free_area(area, migratetype);
> +			if (!page)
> +				continue;
> +			/*
> +			 * In the first try, search for a page in the preferred
> +			 * rank upward even though a free page is found.
> +			 */
> +			if (retry > 0 && !preferred_rank(page))
> +				continue;
> +			del_page_from_free_list(page, zone, current_order);
> +			expand(zone, page, order, current_order, migratetype);
> +			set_pcppage_migratetype(page, migratetype);
> +			return page;
> +		}
>  	}
>  
>  	return NULL;
> @@ -2851,8 +2928,14 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
>  		 * head, thus also in the physical page order. This is useful
>  		 * for IO devices that can merge IO requests if the physical
>  		 * pages are ordered properly.
> +		 * However, preferred_rank_enabled() is true, we always sort
> +		 * freelists in the buddy and the pcp in the order of rank
> +		 * number for the performance reason.
>  		 */
> -		list_add_tail(&page->lru, list);
> +		if (preferred_rank_enabled() && __preferred_rank(page))
> +			list_add(&page->lru, list);
> +		else
> +			list_add_tail(&page->lru, list);
>  		alloced++;
>  		if (is_migrate_cma(get_pcppage_migratetype(page)))
>  			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
> @@ -3136,7 +3219,10 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn)
>  	}
>  
>  	pcp = &this_cpu_ptr(zone->pageset)->pcp;
> -	list_add(&page->lru, &pcp->lists[migratetype]);
> +	if (preferred_rank(page))
> +		list_add(&page->lru, &pcp->lists[migratetype]);
> +	else
> +		list_add_tail(&page->lru, &pcp->lists[migratetype]);
>  	pcp->count++;
>  	if (pcp->count >= pcp->high) {
>  		unsigned long batch = READ_ONCE(pcp->batch);
> @@ -8813,7 +8899,10 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page,
>  			continue;
>  
>  		if (current_buddy != target) {
> -			add_to_free_list(current_buddy, zone, high, migratetype);
> +			if (preferred_rank(current_buddy))
> +				add_to_free_list(current_buddy, zone, high, migratetype);
> +			else
> +				add_to_free_list_tail(current_buddy, zone, high, migratetype);
>  			set_page_order(current_buddy, high);
>  			page = next_page;
>  		}
> diff --git a/mm/shuffle.h b/mm/shuffle.h
> index 71b784f..59cbfde 100644
> --- a/mm/shuffle.h
> +++ b/mm/shuffle.h
> @@ -12,7 +12,8 @@ extern void __shuffle_free_memory(pg_data_t *pgdat);
>  extern bool shuffle_pick_tail(void);
>  static inline void shuffle_free_memory(pg_data_t *pgdat)
>  {
> -	if (!static_branch_unlikely(&page_alloc_shuffle_key))
> +	if (!static_branch_unlikely(&page_alloc_shuffle_key) ||
> +	    preferred_rank_enabled())
>  		return;
>  	__shuffle_free_memory(pgdat);
>  }
> @@ -20,7 +21,8 @@ static inline void shuffle_free_memory(pg_data_t *pgdat)
>  extern void __shuffle_zone(struct zone *z);
>  static inline void shuffle_zone(struct zone *z)
>  {
> -	if (!static_branch_unlikely(&page_alloc_shuffle_key))
> +	if (!static_branch_unlikely(&page_alloc_shuffle_key) ||
> +	    preferred_rank_enabled())
>  		return;
>  	__shuffle_zone(z);
>  }
>
Vlastimil Babka Aug. 3, 2020, 3:45 p.m. UTC | #2
On 8/3/20 9:57 AM, David Hildenbrand wrote:
> On 03.08.20 08:10, pullip.cho@samsung.com wrote:
>> From: Cho KyongHo <pullip.cho@samsung.com>
>> 
>> LPDDR5 introduces rank switch delay. If three successive DRAM accesses
>> happens and the first and the second ones access one rank and the last
>> access happens on the other rank, the latency of the last access will
>> be longer than the second one.
>> To address this panelty, we can sort the freelist so that a specific
>> rank is allocated prior to another rank. We expect the page allocator
>> can allocate the pages from the same rank successively with this
>> change. It will hopefully improves the proportion of the consecutive
>> memory accesses to the same rank.
> 
> This certainly needs performance numbers to justify ... and I am sorry,
> "hopefully improves" is not a valid justification :)
> 
> I can imagine that this works well initially, when there hasn't been a
> lot of memory fragmentation going on. But quickly after your system is
> under stress, I doubt this will be very useful. Proof me wrong. ;)

Agreed. The implementation of __preferred_rank() seems to be very simple and
optimistic.
I think these systems could perhaps better behave as NUMA with (interleaved)
nodes for each rank, then you immediately have all the mempolicies support etc
to achieve what you need? Of course there's some cost as well, but not the costs
of adding hacks to page allocator core?
Cho KyongHo Aug. 4, 2020, 2:35 a.m. UTC | #3
On Mon, Aug 03, 2020 at 05:45:55PM +0200, Vlastimil Babka wrote:
> On 8/3/20 9:57 AM, David Hildenbrand wrote:
> > On 03.08.20 08:10, pullip.cho@samsung.com wrote:
> >> From: Cho KyongHo <pullip.cho@samsung.com>
> >> 
> >> LPDDR5 introduces rank switch delay. If three successive DRAM accesses
> >> happens and the first and the second ones access one rank and the last
> >> access happens on the other rank, the latency of the last access will
> >> be longer than the second one.
> >> To address this panelty, we can sort the freelist so that a specific
> >> rank is allocated prior to another rank. We expect the page allocator
> >> can allocate the pages from the same rank successively with this
> >> change. It will hopefully improves the proportion of the consecutive
> >> memory accesses to the same rank.
> > 
> > This certainly needs performance numbers to justify ... and I am sorry,
> > "hopefully improves" is not a valid justification :)
> > 
> > I can imagine that this works well initially, when there hasn't been a
> > lot of memory fragmentation going on. But quickly after your system is
> > under stress, I doubt this will be very useful. Proof me wrong. ;)
> 
> Agreed. The implementation of __preferred_rank() seems to be very simple and
> optimistic.

DRAM rank is selected by CS bits from DRAM controllers. In the most systems
CS bits are alloated to specific bit fields in BUS address. For example,
If CS bit is allocated to bit[16] in bus (physical) address in two rank
system, all 16KiB with bit[16] = 1 are in the rank 1 and the others are
in the rank 0.
This patch is not beneficial to other system than the mobile devices
with LPDDR5. That is why the default behavior of this patch is noop.

> I think these systems could perhaps better behave as NUMA with (interleaved)
> nodes for each rank, then you immediately have all the mempolicies support etc
> to achieve what you need? Of course there's some cost as well, but not the costs
> of adding hacks to page allocator core?

Thank you for the proposal. NUMA will be helpful to allocate pages from
a specific rank programmatically. I should consider NUMA if rank
affinity should be also required.
However, page allocation overhead by this policy (page migration and
reclamation ect.) will give the users worse responsiveness. The intend
of this patch is to reduce rank switch delay optimistically without
hurting page allocation speed.
Vlastimil Babka Aug. 4, 2020, 9:12 a.m. UTC | #4
On 8/4/20 4:35 AM, Cho KyongHo wrote:
> On Mon, Aug 03, 2020 at 05:45:55PM +0200, Vlastimil Babka wrote:
>> On 8/3/20 9:57 AM, David Hildenbrand wrote:
>> > On 03.08.20 08:10, pullip.cho@samsung.com wrote:
>> >> From: Cho KyongHo <pullip.cho@samsung.com>
>> >> 
>> >> LPDDR5 introduces rank switch delay. If three successive DRAM accesses
>> >> happens and the first and the second ones access one rank and the last
>> >> access happens on the other rank, the latency of the last access will
>> >> be longer than the second one.
>> >> To address this panelty, we can sort the freelist so that a specific
>> >> rank is allocated prior to another rank. We expect the page allocator
>> >> can allocate the pages from the same rank successively with this
>> >> change. It will hopefully improves the proportion of the consecutive
>> >> memory accesses to the same rank.
>> > 
>> > This certainly needs performance numbers to justify ... and I am sorry,
>> > "hopefully improves" is not a valid justification :)
>> > 
>> > I can imagine that this works well initially, when there hasn't been a
>> > lot of memory fragmentation going on. But quickly after your system is
>> > under stress, I doubt this will be very useful. Proof me wrong. ;)
>> 
>> Agreed. The implementation of __preferred_rank() seems to be very simple and
>> optimistic.
> 
> DRAM rank is selected by CS bits from DRAM controllers. In the most systems
> CS bits are alloated to specific bit fields in BUS address. For example,
> If CS bit is allocated to bit[16] in bus (physical) address in two rank
> system, all 16KiB with bit[16] = 1 are in the rank 1 and the others are
> in the rank 0.
> This patch is not beneficial to other system than the mobile devices
> with LPDDR5. That is why the default behavior of this patch is noop.

Hmm, the patch requires at least pageblock_nr_pages, which is 2MB on x86 (dunno
about ARM), so 16KiB would be way too small. What are the actual granularities then?

>> I think these systems could perhaps better behave as NUMA with (interleaved)
>> nodes for each rank, then you immediately have all the mempolicies support etc
>> to achieve what you need? Of course there's some cost as well, but not the costs
>> of adding hacks to page allocator core?
> 
> Thank you for the proposal. NUMA will be helpful to allocate pages from
> a specific rank programmatically. I should consider NUMA if rank
> affinity should be also required.
> However, page allocation overhead by this policy (page migration and
> reclamation ect.) will give the users worse responsiveness. The intend
> of this patch is to reduce rank switch delay optimistically without
> hurting page allocation speed.

The problem is, without some control of page migration and reclaim, the simple
preference approach will not work after some uptime, as David suggested. It will
just mean that the preferred rank will be allocated first, then the
non-preferred rank (Linux will fill all unused memory with page cache if
possible), then reclaim will free memory from both ranks without any special
care, and new allocations will thus come from both ranks.
Cho KyongHo Aug. 4, 2020, 10:20 a.m. UTC | #5
On Tue, Aug 04, 2020 at 11:12:55AM +0200, Vlastimil Babka wrote:
> On 8/4/20 4:35 AM, Cho KyongHo wrote:
> > On Mon, Aug 03, 2020 at 05:45:55PM +0200, Vlastimil Babka wrote:
> >> On 8/3/20 9:57 AM, David Hildenbrand wrote:
> >> > On 03.08.20 08:10, pullip.cho@samsung.com wrote:
> >> >> From: Cho KyongHo <pullip.cho@samsung.com>
> >> >> 
> >> >> LPDDR5 introduces rank switch delay. If three successive DRAM accesses
> >> >> happens and the first and the second ones access one rank and the last
> >> >> access happens on the other rank, the latency of the last access will
> >> >> be longer than the second one.
> >> >> To address this panelty, we can sort the freelist so that a specific
> >> >> rank is allocated prior to another rank. We expect the page allocator
> >> >> can allocate the pages from the same rank successively with this
> >> >> change. It will hopefully improves the proportion of the consecutive
> >> >> memory accesses to the same rank.
> >> > 
> >> > This certainly needs performance numbers to justify ... and I am sorry,
> >> > "hopefully improves" is not a valid justification :)
> >> > 
> >> > I can imagine that this works well initially, when there hasn't been a
> >> > lot of memory fragmentation going on. But quickly after your system is
> >> > under stress, I doubt this will be very useful. Proof me wrong. ;)
> >> 
> >> Agreed. The implementation of __preferred_rank() seems to be very simple and
> >> optimistic.
> > 
> > DRAM rank is selected by CS bits from DRAM controllers. In the most systems
> > CS bits are alloated to specific bit fields in BUS address. For example,
> > If CS bit is allocated to bit[16] in bus (physical) address in two rank
> > system, all 16KiB with bit[16] = 1 are in the rank 1 and the others are
> > in the rank 0.
> > This patch is not beneficial to other system than the mobile devices
> > with LPDDR5. That is why the default behavior of this patch is noop.
> 
> Hmm, the patch requires at least pageblock_nr_pages, which is 2MB on x86 (dunno
> about ARM), so 16KiB would be way too small. What are the actual granularities then?

16KiB is just an example. pageblock_nr_pages is 4Mb on both ARM and
ARM64. __perferred_rank() works if rank granule >= 4MB.

> >> I think these systems could perhaps better behave as NUMA with (interleaved)
> >> nodes for each rank, then you immediately have all the mempolicies support etc
> >> to achieve what you need? Of course there's some cost as well, but not the costs
> >> of adding hacks to page allocator core?
> > 
> > Thank you for the proposal. NUMA will be helpful to allocate pages from
> > a specific rank programmatically. I should consider NUMA if rank
> > affinity should be also required.
> > However, page allocation overhead by this policy (page migration and
> > reclamation ect.) will give the users worse responsiveness. The intend
> > of this patch is to reduce rank switch delay optimistically without
> > hurting page allocation speed.
> 
> The problem is, without some control of page migration and reclaim, the simple
> preference approach will not work after some uptime, as David suggested. It will
> just mean that the preferred rank will be allocated first, then the
> non-preferred rank (Linux will fill all unused memory with page cache if
> possible), then reclaim will free memory from both ranks without any special
> care, and new allocations will thus come from both ranks.
> 
In fact, I did't considered about NUMA in that way. I now need to check
NUMA if it gives us the same result with this patch.

Thank you again for your comments about NUMA :)
NUMA
Pekka Enberg Aug. 7, 2020, 7:08 a.m. UTC | #6
Hi Cho and David,

On Mon, Aug 3, 2020 at 10:57 AM David Hildenbrand <david@redhat.com> wrote:
>
> On 03.08.20 08:10, pullip.cho@samsung.com wrote:
> > From: Cho KyongHo <pullip.cho@samsung.com>
> >
> > LPDDR5 introduces rank switch delay. If three successive DRAM accesses
> > happens and the first and the second ones access one rank and the last
> > access happens on the other rank, the latency of the last access will
> > be longer than the second one.
> > To address this panelty, we can sort the freelist so that a specific
> > rank is allocated prior to another rank. We expect the page allocator
> > can allocate the pages from the same rank successively with this
> > change. It will hopefully improves the proportion of the consecutive
> > memory accesses to the same rank.
>
> This certainly needs performance numbers to justify ... and I am sorry,
> "hopefully improves" is not a valid justification :)
>
> I can imagine that this works well initially, when there hasn't been a
> lot of memory fragmentation going on. But quickly after your system is
> under stress, I doubt this will be very useful. Proof me wrong. ;)
>
> ... I dislike this manual setting of "dram_rank_granule". Yet another mm
> feature that can only be enabled by a magic command line parameter where
> users have to guess the right values.
>
> (side note, there have been similar research approaches to improve
> energy consumption by switching off ranks when not needed).

I was thinking of the exact same thing. PALLOC [1] comes to mind, but
perhaps there are more recent ones?

I also dislike the manual knob, but is there a way for the OS to
detect this by itself? My (perhaps outdated) understanding was that
the DRAM address mapping scheme, for example, is not exposed to the
OS.

I think having more knowledge of DRAM controller details in the OS
would be potentially beneficial for better page allocation policy, so
maybe try come up with something more generic, even if the fallback to
providing this information is a kernel command line option.

[1] http://cs-people.bu.edu/rmancuso/files/papers/palloc-rtas2014.pdf

- Pekka
David Hildenbrand Aug. 10, 2020, 7:32 a.m. UTC | #7
On 07.08.20 09:08, Pekka Enberg wrote:
> Hi Cho and David,
> 
> On Mon, Aug 3, 2020 at 10:57 AM David Hildenbrand <david@redhat.com> wrote:
>>
>> On 03.08.20 08:10, pullip.cho@samsung.com wrote:
>>> From: Cho KyongHo <pullip.cho@samsung.com>
>>>
>>> LPDDR5 introduces rank switch delay. If three successive DRAM accesses
>>> happens and the first and the second ones access one rank and the last
>>> access happens on the other rank, the latency of the last access will
>>> be longer than the second one.
>>> To address this panelty, we can sort the freelist so that a specific
>>> rank is allocated prior to another rank. We expect the page allocator
>>> can allocate the pages from the same rank successively with this
>>> change. It will hopefully improves the proportion of the consecutive
>>> memory accesses to the same rank.
>>
>> This certainly needs performance numbers to justify ... and I am sorry,
>> "hopefully improves" is not a valid justification :)
>>
>> I can imagine that this works well initially, when there hasn't been a
>> lot of memory fragmentation going on. But quickly after your system is
>> under stress, I doubt this will be very useful. Proof me wrong. ;)
>>
>> ... I dislike this manual setting of "dram_rank_granule". Yet another mm
>> feature that can only be enabled by a magic command line parameter where
>> users have to guess the right values.
>>
>> (side note, there have been similar research approaches to improve
>> energy consumption by switching off ranks when not needed).
> 
> I was thinking of the exact same thing. PALLOC [1] comes to mind, but
> perhaps there are more recent ones?

A more recent one is "Footprint-Based DIMM Hotplug"
(https://dl.acm.org/doi/abs/10.1109/TC.2019.2945562), which triggers
memory onlinng/offlining from the kernel to disable banks where possible
(I don't think the approach is upstream material in that form).

Also, I stumbled over "Towards Practical Page Placement for a Green
Memory Manager" (https://ieeexplore.ieee.org/document/7397629),
proposing an adaptive buddy allocator that tries to keep complete banks
free in the buddy where possible. That approach sounded quite
interesting while skimming over the paper.
> 
> I also dislike the manual knob, but is there a way for the OS to
> detect this by itself? My (perhaps outdated) understanding was that
> the DRAM address mapping scheme, for example, is not exposed to the
> OS.

I guess one universal approach is by measuring access times ... not what
we might be looking for :)

> 
> I think having more knowledge of DRAM controller details in the OS
> would be potentially beneficial for better page allocation policy, so
> maybe try come up with something more generic, even if the fallback to
> providing this information is a kernel command line option.
> 
> [1] http://cs-people.bu.edu/rmancuso/files/papers/palloc-rtas2014.pdf
> 
> - Pekka
>
Cho KyongHo Aug. 18, 2020, 8:54 a.m. UTC | #8
On Mon, Aug 10, 2020 at 09:32:18AM +0200, David Hildenbrand wrote:
> On 07.08.20 09:08, Pekka Enberg wrote:
> > Hi Cho and David,
> > 
> > On Mon, Aug 3, 2020 at 10:57 AM David Hildenbrand <david@redhat.com> wrote:
> >>
> >> On 03.08.20 08:10, pullip.cho@samsung.com wrote:
> >>> From: Cho KyongHo <pullip.cho@samsung.com>
> >>>
> >>> LPDDR5 introduces rank switch delay. If three successive DRAM accesses
> >>> happens and the first and the second ones access one rank and the last
> >>> access happens on the other rank, the latency of the last access will
> >>> be longer than the second one.
> >>> To address this panelty, we can sort the freelist so that a specific
> >>> rank is allocated prior to another rank. We expect the page allocator
> >>> can allocate the pages from the same rank successively with this
> >>> change. It will hopefully improves the proportion of the consecutive
> >>> memory accesses to the same rank.
> >>
> >> This certainly needs performance numbers to justify ... and I am sorry,
> >> "hopefully improves" is not a valid justification :)
> >>
> >> I can imagine that this works well initially, when there hasn't been a
> >> lot of memory fragmentation going on. But quickly after your system is
> >> under stress, I doubt this will be very useful. Proof me wrong. ;)
> >>
> >> ... I dislike this manual setting of "dram_rank_granule". Yet another mm
> >> feature that can only be enabled by a magic command line parameter where
> >> users have to guess the right values.
> >>
> >> (side note, there have been similar research approaches to improve
> >> energy consumption by switching off ranks when not needed).
> > 
> > I was thinking of the exact same thing. PALLOC [1] comes to mind, but
> > perhaps there are more recent ones?
> 
> A more recent one is "Footprint-Based DIMM Hotplug"
> (https://protect2.fireeye.com/v1/url?k=adc28c8b-f0128447-adc307c4-000babff3793-131bb23ec7a60bc9&q=1&e=4c1c9d3c-07c1-4d9a-bb4a-510a0304194a&u=https%3A%2F%2Fdl.acm.org%2Fdoi%2Fabs%2F10.1109%2FTC.2019.2945562), which triggers
> memory onlinng/offlining from the kernel to disable banks where possible
> (I don't think the approach is upstream material in that form).
> 
> Also, I stumbled over "Towards Practical Page Placement for a Green
> Memory Manager" (https://ieeexplore.ieee.org/document/7397629),
> proposing an adaptive buddy allocator that tries to keep complete banks
> free in the buddy where possible. That approach sounded quite
> interesting while skimming over the paper.

The researches look like a linux support for partial array self-refresh.
Instead of choosing predefined memory array (bank, rank or segment) it
hot-removes in a channel(DIMM) granule.

Thank you for addressing the research. I need to look into the paper. I
also intersted in that area.

> > 
> > I also dislike the manual knob, but is there a way for the OS to
> > detect this by itself? My (perhaps outdated) understanding was that
> > the DRAM address mapping scheme, for example, is not exposed to the
> > OS.
> 
> I guess one universal approach is by measuring access times ... not what
> we might be looking for :)
> 
> > 
> > I think having more knowledge of DRAM controller details in the OS
> > would be potentially beneficial for better page allocation policy, so
> > maybe try come up with something more generic, even if the fallback to
> > providing this information is a kernel command line option.
> > 

I don't find if there is a way to deliver detailed DRAM information
through ACPI, ATAG or something similar. But I didn't find.
As you mentiond above, if the kernel has knowledge of DRAM controllers,
it would be also beneficial for power management as well as page allocations.
PASR has not come to Linux due to the complexity, for example. If kernel
knows the granule of hot-add/remove for PASR, we can start discussions
how to support PASR in Linux.

Thank you for the opinions.

KyongHo
Pekka Enberg Aug. 19, 2020, 1:12 p.m. UTC | #9
Hi KyongHo and David,

On 07.08.20 09:08, Pekka Enberg wrote:
> > > I think having more knowledge of DRAM controller details in the OS
> > > would be potentially beneficial for better page allocation policy, so
> > > maybe try come up with something more generic, even if the fallback to
> > > providing this information is a kernel command line option.

On Tue, Aug 18, 2020 at 12:02 PM Cho KyongHo <pullip.cho@samsung.com> wrote:
> I don't find if there is a way to deliver detailed DRAM information
> through ACPI, ATAG or something similar. But I didn't find.

Perhaps that's still the case today then.

In the context of this patch, what I am hoping is that we'd turn the
"dram_rank_granule" parameter -- even if it's still a manually
configured thing -- into an abstraction that we can later extend. IOW,
something similar to what the "energy model"
(kernel/power/energy_model.c) today is.

On Mon, Aug 10, 2020 at 09:32:18AM +0200, David Hildenbrand wrote:
> I guess one universal approach is by measuring access times ... not what
> we might be looking for :)

I don't think it's an unreasonable approach, but measurement accuracy
and speed will determine if it's actually practical. There are
examples of this kind of approach elsewhere too. For example, MCTOP
[1] which aims to provide topology-aware OS abstractions, uses cache
latency measurements to infer the topology.

1. https://timharris.uk/papers/2017-eurosys-mctop.pdf

Regards,

- Pekka
diff mbox series

Patch

diff --git a/mm/Kconfig b/mm/Kconfig
index 6c97488..789c02b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -868,4 +868,27 @@  config ARCH_HAS_HUGEPD
 config MAPPING_DIRTY_HELPERS
         bool
 
+config RANK_SORTED_FREELIST
+	bool "Prefer allocating free pages in a half of DRAM ranks"
+
+	help
+	  Rank switch delay in DRAM access become larger in LPDDR5 than before.
+	  If two successive memory accesses happen on different ranks in LPDDR5,
+	  the latency of the second access becomes longer due to the rank switch
+	  overhead. This is a power-performance tradeoff achieved in LPDDR5.
+	  By default, sorting freelist by rank number is disabled even though
+	  RANK_SORTED_FREELIST is set to y. To enable, set "dram_rank_granule"
+	  boot argument to a larger or an equal value to pageblock_nr_pages. The
+	  values should be the exact the rank interleaving granule that your
+	  system is using. The rank interleaving granule is 2^(the lowest CS bit
+	  number). CS stands for Chip Select and is also called SS which stands
+	  for Slave Select.
+	  This is not beneficial to single rank memory system. Also this is not
+	  necessary to quad rank and octal rank memory systems because they are
+	  not in LPDDR5 specifications.
+
+	  This is marked experimental because this disables freelist shuffling
+	  (SHUFFLE_PAGE_ALLOCATOR). Also you should set the correct rank
+	  interleaving granule.
+
 endmenu
diff --git a/mm/compaction.c b/mm/compaction.c
index 061dacf..bee9567 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1218,8 +1218,7 @@  move_freelist_head(struct list_head *freelist, struct page *freepage)
 
 	if (!list_is_last(freelist, &freepage->lru)) {
 		list_cut_before(&sublist, freelist, &freepage->lru);
-		if (!list_empty(&sublist))
-			list_splice_tail(&sublist, freelist);
+		freelist_splice_tail(&sublist, freelist);
 	}
 }
 
@@ -1236,8 +1235,7 @@  move_freelist_tail(struct list_head *freelist, struct page *freepage)
 
 	if (!list_is_first(freelist, &freepage->lru)) {
 		list_cut_position(&sublist, freelist, &freepage->lru);
-		if (!list_empty(&sublist))
-			list_splice_tail(&sublist, freelist);
+		freelist_splice_tail(&sublist, freelist);
 	}
 }
 
diff --git a/mm/internal.h b/mm/internal.h
index 10c6776..c945b3d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -266,6 +266,17 @@  int find_suitable_fallback(struct free_area *area, unsigned int order,
 
 #endif
 
+#ifdef CONFIG_RANK_SORTED_FREELIST
+void freelist_splice_tail(struct list_head *sublist, struct list_head *freelist);
+#else
+#include <linux/list.h>
+static inline
+void freelist_splice_tail(struct list_head *sublist, struct list_head *freelist)
+{
+	if (!list_empty(sublist))
+		list_splice_tail(sublist, freelist);
+}
+#endif
 /*
  * This function returns the order of a free page in the buddy system. In
  * general, page_zone(page)->lock must be held by the caller to prevent the
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2824e116..7823a3b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -854,6 +854,69 @@  compaction_capture(struct capture_control *capc, struct page *page,
 }
 #endif /* CONFIG_COMPACTION */
 
+#ifdef CONFIG_RANK_SORTED_FREELIST
+static unsigned long dram_rank_nr_pages __read_mostly;
+
+static inline bool preferred_rank_enabled(void)
+{
+	return dram_rank_nr_pages >= pageblock_nr_pages;
+}
+
+static int __init dram_rank_granule(char *buf)
+{
+	unsigned long val = (unsigned long)(memparse(buf, NULL) / PAGE_SIZE);
+
+	if (val < pageblock_nr_pages) {
+		pr_err("too small rank granule %lu\n", val);
+		return -EINVAL;
+	}
+
+	dram_rank_nr_pages = val;
+
+	return 0;
+}
+
+early_param("dram_rank_granule", dram_rank_granule);
+
+static inline bool __preferred_rank(struct page *page)
+{
+	return !(page_to_pfn(page) & dram_rank_nr_pages);
+}
+
+static inline bool preferred_rank(struct page *page)
+{
+	return !preferred_rank_enabled() || __preferred_rank(page);
+}
+
+void freelist_splice_tail(struct list_head *sublist, struct list_head *freelist)
+{
+	while (!list_empty(sublist)) {
+		struct page *page;
+
+		page = list_first_entry(sublist, struct page, lru);
+		if (!preferred_rank_enabled() || !__preferred_rank(page))
+			list_move_tail(&page->lru, freelist);
+		else
+			list_move(&page->lru, freelist);
+	}
+}
+#else
+static inline bool __preferred_rank(struct page *page)
+{
+	return true;
+}
+
+static inline bool preferred_rank(struct page *page)
+{
+	return true;
+}
+
+static inline bool preferred_rank_enabled(void)
+{
+	return false;
+}
+#endif
+
 /* Used for pages not on another list */
 static inline void add_to_free_list(struct page *page, struct zone *zone,
 				    unsigned int order, int migratetype)
@@ -880,7 +943,10 @@  static inline void move_to_free_list(struct page *page, struct zone *zone,
 {
 	struct free_area *area = &zone->free_area[order];
 
-	list_move(&page->lru, &area->free_list[migratetype]);
+	if (preferred_rank(page))
+		list_move(&page->lru, &area->free_list[migratetype]);
+	else
+		list_move_tail(&page->lru, &area->free_list[migratetype]);
 }
 
 static inline void del_page_from_free_list(struct page *page, struct zone *zone,
@@ -1029,7 +1095,9 @@  static inline void __free_one_page(struct page *page,
 done_merging:
 	set_page_order(page, order);
 
-	if (is_shuffle_order(order))
+	if (preferred_rank_enabled())
+		to_tail = !__preferred_rank(page);
+	else if (is_shuffle_order(order))
 		to_tail = shuffle_pick_tail();
 	else
 		to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
@@ -2257,20 +2325,29 @@  static __always_inline
 struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
 						int migratetype)
 {
+	int retry = preferred_rank_enabled() ? 2 : 1;
 	unsigned int current_order;
 	struct free_area *area;
 	struct page *page;
 
-	/* Find a page of the appropriate size in the preferred list */
-	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
-		area = &(zone->free_area[current_order]);
-		page = get_page_from_free_area(area, migratetype);
-		if (!page)
-			continue;
-		del_page_from_free_list(page, zone, current_order);
-		expand(zone, page, order, current_order, migratetype);
-		set_pcppage_migratetype(page, migratetype);
-		return page;
+	while (retry-- > 0) {
+		/* Find a page of the appropriate size in the preferred list */
+		for (current_order = order; current_order < MAX_ORDER; ++current_order) {
+			area = &zone->free_area[current_order];
+			page = get_page_from_free_area(area, migratetype);
+			if (!page)
+				continue;
+			/*
+			 * In the first try, search for a page in the preferred
+			 * rank upward even though a free page is found.
+			 */
+			if (retry > 0 && !preferred_rank(page))
+				continue;
+			del_page_from_free_list(page, zone, current_order);
+			expand(zone, page, order, current_order, migratetype);
+			set_pcppage_migratetype(page, migratetype);
+			return page;
+		}
 	}
 
 	return NULL;
@@ -2851,8 +2928,14 @@  static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		 * head, thus also in the physical page order. This is useful
 		 * for IO devices that can merge IO requests if the physical
 		 * pages are ordered properly.
+		 * However, preferred_rank_enabled() is true, we always sort
+		 * freelists in the buddy and the pcp in the order of rank
+		 * number for the performance reason.
 		 */
-		list_add_tail(&page->lru, list);
+		if (preferred_rank_enabled() && __preferred_rank(page))
+			list_add(&page->lru, list);
+		else
+			list_add_tail(&page->lru, list);
 		alloced++;
 		if (is_migrate_cma(get_pcppage_migratetype(page)))
 			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
@@ -3136,7 +3219,10 @@  static void free_unref_page_commit(struct page *page, unsigned long pfn)
 	}
 
 	pcp = &this_cpu_ptr(zone->pageset)->pcp;
-	list_add(&page->lru, &pcp->lists[migratetype]);
+	if (preferred_rank(page))
+		list_add(&page->lru, &pcp->lists[migratetype]);
+	else
+		list_add_tail(&page->lru, &pcp->lists[migratetype]);
 	pcp->count++;
 	if (pcp->count >= pcp->high) {
 		unsigned long batch = READ_ONCE(pcp->batch);
@@ -8813,7 +8899,10 @@  static void break_down_buddy_pages(struct zone *zone, struct page *page,
 			continue;
 
 		if (current_buddy != target) {
-			add_to_free_list(current_buddy, zone, high, migratetype);
+			if (preferred_rank(current_buddy))
+				add_to_free_list(current_buddy, zone, high, migratetype);
+			else
+				add_to_free_list_tail(current_buddy, zone, high, migratetype);
 			set_page_order(current_buddy, high);
 			page = next_page;
 		}
diff --git a/mm/shuffle.h b/mm/shuffle.h
index 71b784f..59cbfde 100644
--- a/mm/shuffle.h
+++ b/mm/shuffle.h
@@ -12,7 +12,8 @@  extern void __shuffle_free_memory(pg_data_t *pgdat);
 extern bool shuffle_pick_tail(void);
 static inline void shuffle_free_memory(pg_data_t *pgdat)
 {
-	if (!static_branch_unlikely(&page_alloc_shuffle_key))
+	if (!static_branch_unlikely(&page_alloc_shuffle_key) ||
+	    preferred_rank_enabled())
 		return;
 	__shuffle_free_memory(pgdat);
 }
@@ -20,7 +21,8 @@  static inline void shuffle_free_memory(pg_data_t *pgdat)
 extern void __shuffle_zone(struct zone *z);
 static inline void shuffle_zone(struct zone *z)
 {
-	if (!static_branch_unlikely(&page_alloc_shuffle_key))
+	if (!static_branch_unlikely(&page_alloc_shuffle_key) ||
+	    preferred_rank_enabled())
 		return;
 	__shuffle_zone(z);
 }