
[RFC,3/4] mm/page_alloc: introduce __GFP_PTE_MAPPED flag to allocate pte-mapped pages

Message ID 20210823132513.15836-4-rppt@kernel.org (mailing list archive)
State New
Series mm/page_alloc: cache pte-mapped allocations

Commit Message

Mike Rapoport Aug. 23, 2021, 1:25 p.m. UTC
From: Mike Rapoport <rppt@linux.ibm.com>

When the __GFP_PTE_MAPPED flag is passed to an allocation request of order 0,
the allocated page will be mapped at PTE level in the direct map.

To reduce direct map fragmentation, maintain a cache of 4K pages that
are already mapped at PTE level in the direct map. Whenever the cache
needs to be replenished, try to allocate a 2M page and split it into 4K pages
to localize the shattering of the direct map. If the allocation of a 2M page
fails, fall back to a single page allocation at the expense of direct map
fragmentation.

The cache registers a shrinker that releases free pages from the cache to
the page allocator.

The __GFP_PTE_MAPPED and caching of 4K pages are enabled only if an
architecture selects ARCH_WANTS_PTE_MAPPED_CACHE in its Kconfig.

[
cache management is mostly copied from
https://lore.kernel.org/lkml/20210505003032.489164-4-rick.p.edgecombe@intel.com/
]

Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
---
 arch/Kconfig                    |   8 +
 arch/x86/Kconfig                |   1 +
 include/linux/gfp.h             |  11 +-
 include/linux/mm.h              |   2 +
 include/linux/pageblock-flags.h |  26 ++++
 init/main.c                     |   1 +
 mm/internal.h                   |   3 +-
 mm/page_alloc.c                 | 261 +++++++++++++++++++++++++++++++-
 8 files changed, 309 insertions(+), 4 deletions(-)
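
For illustration, a minimal sketch of how a caller could use the proposed flag
(a hypothetical consumer; set_memory_ro()/set_memory_rw() are only an example
of a permission change that benefits from the page already being PTE-mapped,
and error handling is omitted):

	struct page *page = alloc_page(GFP_KERNEL | __GFP_PTE_MAPPED);

	if (page) {
		unsigned long addr = (unsigned long)page_address(page);

		set_memory_ro(addr, 1);
		/* ... use the read-only page ... */
		set_memory_rw(addr, 1);
		__free_page(page);	/* goes back to the pte-mapped cache */
	}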

Comments

Edgecombe, Rick P Aug. 23, 2021, 8:29 p.m. UTC | #1
On Mon, 2021-08-23 at 16:25 +0300, Mike Rapoport wrote:
> From: Mike Rapoport <rppt@linux.ibm.com>
> 
> When __GFP_PTE_MAPPED flag is passed to an allocation request of
> order 0,
> the allocated page will be mapped at PTE level in the direct map.
> 
> To reduce the direct map fragmentation, maintain a cache of 4K pages
> that
> are already mapped at PTE level in the direct map. Whenever the cache
> should be replenished, try to allocate 2M page and split it to 4K
> pages
> to localize shutter of the direct map. If the allocation of 2M page
> fails,
> fallback to a single page allocation at expense of the direct map
> fragmentation.
> 
> The cache registers a shrinker that releases free pages from the
> cache to
> the page allocator.
> 
> The __GFP_PTE_MAPPED and caching of 4K pages are enabled only if an
> architecture selects ARCH_WANTS_PTE_MAPPED_CACHE in its Kconfig.
> 
> [
> cache management are mostly copied from 
> 
https://lore.kernel.org/lkml/20210505003032.489164-4-rick.p.edgecombe@intel.com/
> ]
> 
> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> ---
>  arch/Kconfig                    |   8 +
>  arch/x86/Kconfig                |   1 +
>  include/linux/gfp.h             |  11 +-
>  include/linux/mm.h              |   2 +
>  include/linux/pageblock-flags.h |  26 ++++
>  init/main.c                     |   1 +
>  mm/internal.h                   |   3 +-
>  mm/page_alloc.c                 | 261
> +++++++++++++++++++++++++++++++-
>  8 files changed, 309 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/Kconfig b/arch/Kconfig
> index 129df498a8e1..2db95331201b 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -243,6 +243,14 @@ config ARCH_HAS_SET_MEMORY
>  config ARCH_HAS_SET_DIRECT_MAP
>  	bool
>  
> +#
> +# Select if the architecture wants to minimize fragmentation of its
> +# direct/linear map cauesd by set_memory and set_direct_map
> operations
> +#
> +config ARCH_WANTS_PTE_MAPPED_CACHE
> +	bool
> +	depends on ARCH_HAS_SET_MEMORY || ARCH_HAS_SET_DIRECT_MAP
> +
>  #
>  # Select if the architecture provides the arch_dma_set_uncached
> symbol to
>  # either provide an uncached segment alias for a DMA allocation, or
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 88fb922c23a0..9b4e6cf4a6aa 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -118,6 +118,7 @@ config X86
>  	select ARCH_WANTS_NO_INSTR
>  	select ARCH_WANT_HUGE_PMD_SHARE
>  	select ARCH_WANT_LD_ORPHAN_WARN
> +	select ARCH_WANTS_PTE_MAPPED_CACHE
>  	select ARCH_WANTS_THP_SWAP		if X86_64
>  	select BUILDTIME_TABLE_SORT
>  	select CLKEVT_I8253
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index 55b2ec1f965a..c9006e3c67ad 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -55,8 +55,9 @@ struct vm_area_struct;
>  #define ___GFP_ACCOUNT		0x400000u
>  #define ___GFP_ZEROTAGS		0x800000u
>  #define ___GFP_SKIP_KASAN_POISON	0x1000000u
> +#define ___GFP_PTE_MAPPED	0x2000000u
>  #ifdef CONFIG_LOCKDEP
> -#define ___GFP_NOLOCKDEP	0x2000000u
> +#define ___GFP_NOLOCKDEP	0x4000000u
>  #else
>  #define ___GFP_NOLOCKDEP	0
>  #endif
> @@ -101,12 +102,18 @@ struct vm_area_struct;
>   * node with no fallbacks or placement policy enforcements.
>   *
>   * %__GFP_ACCOUNT causes the allocation to be accounted to kmemcg.
> + *
> + * %__GFP_PTE_MAPPED returns a page that is mapped with PTE in the
> + * direct map. On architectures that use higher page table levels to
> map
> + * physical memory, this flag will casue split of large pages in the
> direct
> + * mapping. Has effect only if CONFIG_ARCH_WANTS_PTE_MAPPED_CACHE is
> set.
>   */
>  #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE)
>  #define __GFP_WRITE	((__force gfp_t)___GFP_WRITE)
>  #define __GFP_HARDWALL   ((__force gfp_t)___GFP_HARDWALL)
>  #define __GFP_THISNODE	((__force gfp_t)___GFP_THISNODE)
>  #define __GFP_ACCOUNT	((__force gfp_t)___GFP_ACCOUNT)
> +#define __GFP_PTE_MAPPED ((__force gfp_t)___GFP_PTE_MAPPED)
>  
>  /**
>   * DOC: Watermark modifiers
> @@ -249,7 +256,7 @@ struct vm_area_struct;
>  #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
>  
>  /* Room for N __GFP_FOO bits */
> -#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP))
> +#define __GFP_BITS_SHIFT (26 + IS_ENABLED(CONFIG_LOCKDEP))
>  #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) -
> 1))
>  
>  /**
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 7ca22e6e694a..350ec98b82d2 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3283,5 +3283,7 @@ static inline int seal_check_future_write(int
> seals, struct vm_area_struct *vma)
>  	return 0;
>  }
>  
> +void pte_mapped_cache_init(void);
> +
>  #endif /* __KERNEL__ */
>  #endif /* _LINUX_MM_H */
> diff --git a/include/linux/pageblock-flags.h
> b/include/linux/pageblock-flags.h
> index 973fd731a520..4faf8c542b00 100644
> --- a/include/linux/pageblock-flags.h
> +++ b/include/linux/pageblock-flags.h
> @@ -21,6 +21,8 @@ enum pageblock_bits {
>  			/* 3 bits required for migrate types */
>  	PB_migrate_skip,/* If set the block is skipped by compaction */
>  
> +	PB_pte_mapped, /* If set the block is mapped with PTEs in
> direct map */
> +
>  	/*
>  	 * Assume the bits will always align on a word. If this
> assumption
>  	 * changes then get/set pageblock needs updating.
> @@ -88,4 +90,28 @@ static inline void set_pageblock_skip(struct page
> *page)
>  }
>  #endif /* CONFIG_COMPACTION */
>  
> +#ifdef CONFIG_ARCH_WANTS_PTE_MAPPED_CACHE
> +#define get_pageblock_pte_mapped(page)				
> \
> +	get_pfnblock_flags_mask(page, page_to_pfn(page),	\
> +			(1 << (PB_pte_mapped)))
> +#define clear_pageblock_pte_mapped(page) \
> +	set_pfnblock_flags_mask(page, 0, page_to_pfn(page),	\
> +			(1 << PB_pte_mapped))
> +#define set_pageblock_pte_mapped(page) \
> +	set_pfnblock_flags_mask(page, (1 << PB_pte_mapped),	\
> +			page_to_pfn(page),			\
> +			(1 << PB_pte_mapped))
> +#else /* CONFIG_ARCH_WANTS_PTE_MAPPED_CACHE */
> +static inline bool get_pageblock_pte_mapped(struct page *page)
> +{
> +	return false;
> +}
> +static inline void clear_pageblock_pte_mapped(struct page *page)
> +{
> +}
> +static inline void set_pageblock_pte_mapped(struct page *page)
> +{
> +}
> +#endif /* CONFIG_ARCH_WANTS_PTE_MAPPED_CACHE */
> +
>  #endif	/* PAGEBLOCK_FLAGS_H */
> diff --git a/init/main.c b/init/main.c
> index f5b8246e8aa1..c0d59a183a39 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -828,6 +828,7 @@ static void __init mm_init(void)
>  	page_ext_init_flatmem_late();
>  	kmem_cache_init();
>  	kmemleak_init();
> +	pte_mapped_cache_init();
>  	pgtable_init();
>  	debug_objects_mem_init();
>  	vmalloc_init();
> diff --git a/mm/internal.h b/mm/internal.h
> index 31ff935b2547..0557ece6ebf4 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -24,7 +24,8 @@
>  			__GFP_ATOMIC)
>  
>  /* The GFP flags allowed during early boot */
> -#define GFP_BOOT_MASK (__GFP_BITS_MASK &
> ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))
> +#define GFP_BOOT_MASK (__GFP_BITS_MASK &
> ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS|\
> +					   __GFP_PTE_MAPPED))
>  
>  /* Control allocation cpuset and node placement constraints */
>  #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 856b175c15a4..7936d8dcb80b 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -72,6 +72,7 @@
>  #include <linux/padata.h>
>  #include <linux/khugepaged.h>
>  #include <linux/buffer_head.h>
> +#include <linux/set_memory.h>
>  #include <asm/sections.h>
>  #include <asm/tlbflush.h>
>  #include <asm/div64.h>
> @@ -107,6 +108,14 @@ typedef int __bitwise fpi_t;
>   */
>  #define FPI_TO_TAIL		((__force fpi_t)BIT(1))
>  
> +/*
> + * Free page directly to the page allocator rather than check if it
> should
> + * be placed into the cache of pte_mapped pages.
> + * Used by the pte_mapped cache shrinker.
> + * Has effect only when pte-mapped cache is enabled
> + */
> +#define FPI_NO_PTE_MAP		((__force fpi_t)BIT(2))
> +
>  /*
>   * Don't poison memory with KASAN (only for the tag-based modes).
>   * During boot, all non-reserved memblock memory is exposed to
> page_alloc.
> @@ -225,6 +234,19 @@ static inline void
> set_pcppage_migratetype(struct page *page, int migratetype)
>  	page->index = migratetype;
>  }
>  
> +#ifdef CONFIG_ARCH_WANTS_PTE_MAPPED_CACHE
> +static struct page *alloc_page_pte_mapped(gfp_t gfp);
> +static void free_page_pte_mapped(struct page *page);
> +#else
> +static inline struct page *alloc_page_pte_mapped(gfp_t gfp)
> +{
> +	return NULL;
> +}
> +static void free_page_pte_mapped(struct page *page)
> +{
> +}
> +#endif
> +
>  #ifdef CONFIG_PM_SLEEP
>  /*
>   * The following functions are used by the suspend/hibernate code to
> temporarily
> @@ -536,7 +558,7 @@ void set_pfnblock_flags_mask(struct page *page,
> unsigned long flags,
>  	unsigned long bitidx, word_bitidx;
>  	unsigned long old_word, word;
>  
> -	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
> +	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 5);
>  	BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
>  
>  	bitmap = get_pageblock_bitmap(page, pfn);
> @@ -1352,6 +1374,16 @@ static __always_inline bool
> free_pages_prepare(struct page *page,
>  					   PAGE_SIZE << order);
>  	}
>  
> +	/*
> +	 * Unless are we shrinking pte_mapped cache return a page from
> +	 * a pageblock mapped with PTEs to that cache.
> +	 */
> +	if (!order && !(fpi_flags & FPI_NO_PTE_MAP) &&
> +	    get_pageblock_pte_mapped(page)) {
> +		free_page_pte_mapped(page);
> +		return false;
> +	}
> +
>  	kernel_poison_pages(page, 1 << order);
>  
>  	/*
> @@ -3445,6 +3477,13 @@ void free_unref_page_list(struct list_head
> *list)
>  	/* Prepare pages for freeing */
>  	list_for_each_entry_safe(page, next, list, lru) {
>  		pfn = page_to_pfn(page);
> +
> +		if (get_pageblock_pte_mapped(page)) {
> +			list_del(&page->lru);
> +			free_page_pte_mapped(page);
> +			continue;
> +		}
> +
>  		if (!free_unref_page_prepare(page, pfn, 0))
>  			list_del(&page->lru);
>  
> @@ -5381,6 +5420,12 @@ struct page *__alloc_pages(gfp_t gfp, unsigned
> int order, int preferred_nid,
>  			&alloc_gfp, &alloc_flags))
>  		return NULL;
>  
> +	if ((alloc_gfp & __GFP_PTE_MAPPED) && order == 0) {
> +		page = alloc_page_pte_mapped(alloc_gfp);
> +		if (page)
> +			goto out;
> +	}
> +
>  	/*
>  	 * Forbid the first pass from falling back to types that
> fragment
>  	 * memory until all local zones are considered.
> @@ -5681,6 +5726,220 @@ void free_pages_exact(void *virt, size_t
> size)
>  }
>  EXPORT_SYMBOL(free_pages_exact);
>  
> +#ifdef CONFIG_ARCH_WANTS_PTE_MAPPED_CACHE
> +
> +struct pte_mapped_cache {
> +	struct shrinker shrinker;
> +	struct list_lru lru;
> +	atomic_t nid_round_robin;
> +	unsigned long free_cnt;
> +};
> +
> +static struct pte_mapped_cache pte_mapped_cache;
> +
> +static struct page *pte_mapped_cache_get(struct pte_mapped_cache
> *cache)
> +{
> +	unsigned long start_nid, i;
> +	struct list_head *head;
> +	struct page *page = NULL;
> +
> +	start_nid = atomic_fetch_inc(&cache->nid_round_robin) %
> nr_node_ids;
> +	for (i = 0; i < nr_node_ids; i++) {
> +		int cur_nid = (start_nid + i) % nr_node_ids;
> +
> +		head = list_lru_get_mru(&cache->lru, cur_nid);
> +		if (head) {
> +			page = list_entry(head, struct page, lru);
> +			break;
> +		}
> +	}
> +
> +	return page;
> +}
> +
> +static void pte_mapped_cache_add(struct pte_mapped_cache *cache,
> +				 struct page *page)
> +{
> +	INIT_LIST_HEAD(&page->lru);
> +	list_lru_add_node(&cache->lru, &page->lru, page_to_nid(page));
> +	set_page_count(page, 0);
> +}
> +
> +static void pte_mapped_cache_add_neighbour_pages(struct page *page)
> +{
> +#if 0
> +	/*
> +	 * TODO: if pte_mapped_cache_replenish() had to fallback to
> order-0
> +	 * allocation, the large page in the direct map will be split
> +	 * anyway and if there are free pages in the same pageblock
> they
> +	 * can be added to pte_mapped cache.
> +	 */
> +	unsigned int order = (1 << HUGETLB_PAGE_ORDER);
> +	unsigned int nr_pages = (1 << order);
> +	unsigned long pfn = page_to_pfn(page);
> +	struct page *page_head = page - (pfn & (order - 1));
> +
> +	for (i = 0; i < nr_pages; i++) {
> +		page = page_head + i;
> +		if (is_free_buddy_page(page)) {
> +			take_page_off_buddy(page);
> +			pte_mapped_cache_add(&pte_mapped_cache, page);
> +		}
> +	}
> +#endif
> +}
> 
This seems a nice benefit of doing this sort of stuff in the page
allocator if it can work.

> +static struct page *pte_mapped_cache_replenish(struct
> pte_mapped_cache *cache,
> +					       gfp_t gfp)
> +{
> +	unsigned int order = HUGETLB_PAGE_ORDER;
> +	unsigned int nr_pages;
> +	struct page *page;
> +	int i, err;
> +
> +	gfp &= ~__GFP_PTE_MAPPED;
> +
> +	page = alloc_pages(gfp, order);
> +	if (!page) {
> +		order = 0;
> +		page = alloc_pages(gfp, order);
> +		if (!page)
> +			return NULL;
> +	}
> +
> +	nr_pages = (1 << order);
> +	err = set_memory_4k((unsigned long)page_address(page),
> nr_pages);
> +	if (err)
> +		goto err_free_pages;
> +
> +	if (order)
> +		split_page(page, order);
> +	else
> +		pte_mapped_cache_add_neighbour_pages(page);
> +
> +	for (i = 1; i < nr_pages; i++)
> +		pte_mapped_cache_add(cache, page + i);
> +
> +	set_pageblock_pte_mapped(page);
> +
> +	return page;
> +
> +err_free_pages:
> +	__free_pages(page, order);
> +	return NULL;
> +}
> +
> +static struct page *alloc_page_pte_mapped(gfp_t gfp)
I'm a little disappointed that building into the page allocator didn't
automatically make higher order allocations easy. It seems this mostly
bolts the grouped pages code onto the page allocator and splits out of
the allocation/free paths to call into it?

I was thinking the main benefit of handling direct map permissions in
the page allocator would be re-using the buddy part to support high
order pages, etc. Did you try to build it in like that? If we can't get
that, what is the benefit of doing permission stuff in the page allocator?

> +{
> +	struct pte_mapped_cache *cache = &pte_mapped_cache;
> +	struct page *page;
> +
> +	page = pte_mapped_cache_get(cache);
> +	if (page) {
> +		prep_new_page(page, 0, gfp, 0);
> +		goto out;
> +	}
> +
> +	page = pte_mapped_cache_replenish(cache, gfp);
> +
> +out:
> +	return page;
> +}
> +
We probably want to exclude GFP_ATOMIC before calling into CPA unless
debug page alloc is on, because it may need to split and sleep for the
allocation. There is a page table allocation with GFP_ATOMIC passed
actually.

In my next series of this I added support for GFP_ATOMIC to this code,
but that solution should only work for permission changing grouped page
allocators in the protected page tables case where the direct map
tables are handled differently. As a general solution though (that's
the long term intention right?), GFP_ATOMIC might deserve some
consideration.

The other thing is we probably don't want to clean out the atomic
reserves and add them to a cache just for one page. I opted to just
convert one page in the GFP_ATOMIC case.
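
One possible shape of the exclusion described above, as a sketch only
(gfpflags_allow_blocking() is the existing helper; the placement of the
bail-out and the silent fallback to the regular order-0 path are assumptions,
not part of the posted patch):

static struct page *alloc_page_pte_mapped(gfp_t gfp)
{
	struct pte_mapped_cache *cache = &pte_mapped_cache;
	struct page *page;

	page = pte_mapped_cache_get(cache);
	if (page) {
		prep_new_page(page, 0, gfp, 0);
		return page;
	}

	/*
	 * Replenishing calls set_memory_4k(), which may have to split a
	 * large mapping and sleep for a page table allocation, so punt
	 * atomic requests back to the normal allocation path.
	 */
	if (!gfpflags_allow_blocking(gfp))
		return NULL;

	return pte_mapped_cache_replenish(cache, gfp);
}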

> +static void free_page_pte_mapped(struct page *page)
> +{
> +	pte_mapped_cache_add(&pte_mapped_cache, page);
> +}
> +
> +static struct pte_mapped_cache *pte_mapped_cache_from_sc(struct
> shrinker *sh)
> +{
> +	return container_of(sh, struct pte_mapped_cache, shrinker);
> +}
> +
> +static unsigned long pte_mapped_cache_shrink_count(struct shrinker
> *shrinker,
> +						   struct
> shrink_control *sc)
> +{
> +	struct pte_mapped_cache *cache =
> pte_mapped_cache_from_sc(shrinker);
> +	unsigned long count = list_lru_shrink_count(&cache->lru, sc);
> +
> +	return count ? count : SHRINK_EMPTY;
> +}
> +
> +static enum lru_status pte_mapped_cache_shrink_isolate(struct
> list_head *item,
> +						       struct
> list_lru_one *lst,
> +						       spinlock_t
> *lock,
> +						       void *cb_arg)
> +{
> +	struct list_head *freeable = cb_arg;
> +
> +	list_lru_isolate_move(lst, item, freeable);
> +
> +	return LRU_REMOVED;
> +}
> +
> +static unsigned long pte_mapped_cache_shrink_scan(struct shrinker
> *shrinker,
> +						  struct shrink_control
> *sc)
> +{
> +	struct pte_mapped_cache *cache =
> pte_mapped_cache_from_sc(shrinker);
> +	struct list_head *cur, *next;
> +	unsigned long isolated;
> +	LIST_HEAD(freeable);
> +
> +	isolated = list_lru_shrink_walk(&cache->lru, sc,
> +					pte_mapped_cache_shrink_isolate
> ,
> +					&freeable);
> +
> +	list_for_each_safe(cur, next, &freeable) {
> +		struct page *page = list_entry(cur, struct page, lru);
> +
> +		list_del(cur);
> +		__free_pages_ok(page, 0, FPI_NO_PTE_MAP);
> +	}
> +
> +	/* Every item walked gets isolated */
> +	sc->nr_scanned += isolated;
> +
> +	return isolated;
> +}
> +
> +static int __pte_mapped_cache_init(struct pte_mapped_cache *cache)
> +{
> +	int err;
> +
> +	err = list_lru_init(&cache->lru);
> +	if (err)
> +		return err;
> +
> +	cache->shrinker.count_objects = pte_mapped_cache_shrink_count;
> +	cache->shrinker.scan_objects = pte_mapped_cache_shrink_scan;
> +	cache->shrinker.seeks = DEFAULT_SEEKS;
> +	cache->shrinker.flags = SHRINKER_NUMA_AWARE;
> +
> +	err = register_shrinker(&cache->shrinker);
> +	if (err)
> +		goto err_list_lru_destroy;
> +
> +	return 0;
> +
> +err_list_lru_destroy:
> +	list_lru_destroy(&cache->lru);
> +	return err;
> +}
> +
> +void __init pte_mapped_cache_init(void)
> +{
> +	if (gfp_allowed_mask & __GFP_PTE_MAPPED)
> +		return;
> +
> +	if (!__pte_mapped_cache_init(&pte_mapped_cache))
> +		gfp_allowed_mask |= __GFP_PTE_MAPPED;
> +}
> +#else
> +void __init pte_mapped_cache_init(void)
> +{
> +}
> +#endif /* CONFIG_ARCH_WANTS_PTE_MAPPED_CACHE */
> +
>  /**
>   * nr_free_zone_pages - count number of pages beyond high watermark
>   * @offset: The zone index of the highest zone
Mike Rapoport Aug. 24, 2021, 1:02 p.m. UTC | #2
On Mon, Aug 23, 2021 at 08:29:49PM +0000, Edgecombe, Rick P wrote:
> On Mon, 2021-08-23 at 16:25 +0300, Mike Rapoport wrote:
> > From: Mike Rapoport <rppt@linux.ibm.com>
> > 
> > When __GFP_PTE_MAPPED flag is passed to an allocation request of
> > order 0,
> > the allocated page will be mapped at PTE level in the direct map.
> > 
> > To reduce the direct map fragmentation, maintain a cache of 4K pages
> > that
> > are already mapped at PTE level in the direct map. Whenever the cache
> > should be replenished, try to allocate 2M page and split it to 4K
> > pages
> > to localize shutter of the direct map. If the allocation of 2M page
> > fails,
> > fallback to a single page allocation at expense of the direct map
> > fragmentation.
> > 
> > The cache registers a shrinker that releases free pages from the
> > cache to
> > the page allocator.
> > 
> > The __GFP_PTE_MAPPED and caching of 4K pages are enabled only if an
> > architecture selects ARCH_WANTS_PTE_MAPPED_CACHE in its Kconfig.
> > 
> > [
> > cache management are mostly copied from 
> > 
> https://lore.kernel.org/lkml/20210505003032.489164-4-rick.p.edgecombe@intel.com/
> > ]
> > 
> > Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> > ---
> >  arch/Kconfig                    |   8 +
> >  arch/x86/Kconfig                |   1 +
> >  include/linux/gfp.h             |  11 +-
> >  include/linux/mm.h              |   2 +
> >  include/linux/pageblock-flags.h |  26 ++++
> >  init/main.c                     |   1 +
> >  mm/internal.h                   |   3 +-
> >  mm/page_alloc.c                 | 261 +++++++++++++++++++++++++++++++-
> >  8 files changed, 309 insertions(+), 4 deletions(-)
 
...

> > +static void pte_mapped_cache_add_neighbour_pages(struct page *page)
> > +{
> > +#if 0
> > +	/*
> > +	 * TODO: if pte_mapped_cache_replenish() had to fallback to
> > order-0
> > +	 * allocation, the large page in the direct map will be split
> > +	 * anyway and if there are free pages in the same pageblock
> > they
> > +	 * can be added to pte_mapped cache.
> > +	 */
> > +	unsigned int order = (1 << HUGETLB_PAGE_ORDER);
> > +	unsigned int nr_pages = (1 << order);
> > +	unsigned long pfn = page_to_pfn(page);
> > +	struct page *page_head = page - (pfn & (order - 1));
> > +
> > +	for (i = 0; i < nr_pages; i++) {
> > +		page = page_head + i;
> > +		if (is_free_buddy_page(page)) {
> > +			take_page_off_buddy(page);
> > +			pte_mapped_cache_add(&pte_mapped_cache, page);
> > +		}
> > +	}
> > +#endif
> > +}
> > 
> This seems a nice benefit of doing this sort of stuff in the page
> allocator if it can work.

I didn't try to enable it yet, but I don't see a fundamental reason why this
won't work.
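
For reference, a sketch of what the enabled helper might look like once the
obvious issues in the #if 0 block are sorted out (the order/page-count mix-up
in the alignment mask and the undeclared loop counter). take_page_off_buddy()
is assumed to be callable from here; today it is only built for memory failure
handling, so it may need to be made generally available:

static void pte_mapped_cache_add_neighbour_pages(struct page *page)
{
	unsigned int nr_pages = 1 << HUGETLB_PAGE_ORDER;
	unsigned long pfn = page_to_pfn(page);
	struct page *page_head = page - (pfn & (nr_pages - 1));
	unsigned int i;

	for (i = 0; i < nr_pages; i++) {
		struct page *p = page_head + i;

		if (is_free_buddy_page(p)) {
			take_page_off_buddy(p);
			pte_mapped_cache_add(&pte_mapped_cache, p);
		}
	}
}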
 
> > +static struct page *alloc_page_pte_mapped(gfp_t gfp)
> >
> I'm a little disappointed building into the page allocator didn't
> automatically make higher order allocations easy. It seems this mostly
> bolts the grouped pages code on to the page allocator and splits out of
> the allocation/free paths to call into it?
>
> I was thinking the main benefit of handling direct map permissions in
> the page allocator would be re-using the buddy part to support high
> order pages, etc. Did you try to build it in like that? If we can't get
> that, what is the benefit to doing permission stuff in the pageallocator? 

The addition of grouped pages to the page allocator the way I did it is a
somewhat intermediate solution between keeping such a cache entirely separate
from the page allocator and making it really tightly integrated, e.g. using a
new migratetype or doing more intrusive changes to the page allocator. One of
the reasons I did it this way is to present the various trade-offs because,
tbh, I'm not yet sure what's the best way to move forward. [The other reason
being my laziness; dropping your grouped pages code into the page allocator
was the simplest thing to do ;-)].

The immediate benefit of having this code close to the page allocator is
the simplification of the free path. Otherwise we'd need a cache-specific
free method or some information in struct page about how to free a grouped
page. Besides, it is possible to put pages mapped as 4k into such a cache at
boot time when the page allocator is initialized.

Also, keeping a central cache for multiple users will improve memory
utilization and I believe it would require fewer splits of the direct map.
OTOH, keeping such caches per-user allows managing access policy per cache
which could be better from the security POV.

I'm also going to explore the possibilities of using a new migratetype or
SL*B as Dave suggested.
 
> > +{
> > +	struct pte_mapped_cache *cache = &pte_mapped_cache;
> > +	struct page *page;
> > +
> > +	page = pte_mapped_cache_get(cache);
> > +	if (page) {
> > +		prep_new_page(page, 0, gfp, 0);
> > +		goto out;
> > +	}
> > +
> > +	page = pte_mapped_cache_replenish(cache, gfp);
> > +
> > +out:
> > +	return page;
> > +}
> > +
> We probably want to exclude GFP_ATOMIC before calling into CPA unless
> debug page alloc is on, because it may need to split and sleep for the
> allocation. There is a page table allocation with GFP_ATOMIC passed actually.

Looking at the callers of alloc_low_pages() it seems that GFP_ATOMIC there
is stale...
 
> In my next series of this I added support for GFP_ATOMIC to this code,
> but that solution should only work for permission changing grouped page
> allocators in the protected page tables case where the direct map
> tables are handled differently. As a general solution though (that's
> the long term intention right?), GFP_ATOMIC might deserve some
> consideration.

... but for the general solution GFP_ATOMIC indeed deserves some
consideration.
 
> The other thing is we probably don't want to clean out the atomic
> reserves and add them to a cache just for one page. I opted to just
> convert one page in the GFP_ATOMIC case.
 
Do you mean to allocate one page in the GFP_ATOMIC case and bypass the high
order allocation?
But the CPA split is still necessary here, isn't it?
Vlastimil Babka Aug. 24, 2021, 4:12 p.m. UTC | #3
On 8/23/21 15:25, Mike Rapoport wrote:
> @@ -3283,5 +3283,7 @@ static inline int seal_check_future_write(int seals, struct vm_area_struct *vma)
>  	return 0;
>  }
>  
> +void pte_mapped_cache_init(void);
> +
>  #endif /* __KERNEL__ */
>  #endif /* _LINUX_MM_H */
> diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
> index 973fd731a520..4faf8c542b00 100644
> --- a/include/linux/pageblock-flags.h
> +++ b/include/linux/pageblock-flags.h
> @@ -21,6 +21,8 @@ enum pageblock_bits {
>  			/* 3 bits required for migrate types */
>  	PB_migrate_skip,/* If set the block is skipped by compaction */
>  
> +	PB_pte_mapped, /* If set the block is mapped with PTEs in direct map */
> +
>  	/*
>  	 * Assume the bits will always align on a word. If this assumption
>  	 * changes then get/set pageblock needs updating.

You have broken this assumption :)
> @@ -536,7 +558,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
>  	unsigned long bitidx, word_bitidx;
>  	unsigned long old_word, word;
>  
> -	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
> +	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 5);

This is not sufficient to satisfy the "needs updating" part. We would now need
NR_PAGEBLOCK_BITS == 8.
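
One way to keep the word-alignment assumption (and both BUILD_BUG_ONs) happy,
as a sketch only: pad the per-pageblock field back to a power of two instead
of teaching get/set_pfnblock_flags_mask() about fields that straddle words.
This doubles the pageblock bitmap, so it is a trade-off rather than a fix:

enum pageblock_bits {
	PB_migrate,
	PB_migrate_end = PB_migrate + 3 - 1,
			/* 3 bits required for migrate types */
	PB_migrate_skip,/* If set the block is skipped by compaction */
	PB_pte_mapped,	/* If set the block is mapped with PTEs in direct map */

	/*
	 * Pad to 8 bits per pageblock so a block's bits never straddle a
	 * word boundary; the BUILD_BUG_ON would then check for 8, not 5.
	 */
	NR_PAGEBLOCK_BITS = 8
};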
Edgecombe, Rick P Aug. 24, 2021, 4:38 p.m. UTC | #4
On Tue, 2021-08-24 at 16:02 +0300, Mike Rapoport wrote:
> > We probably want to exclude GFP_ATOMIC before calling into CPA
> > unless
> > debug page alloc is on, because it may need to split and sleep for
> > the
> > allocation. There is a page table allocation with GFP_ATOMIC passed
> > actually.
> 
> Looking at the callers of alloc_low_pages() it seems that GFP_ATOMIC
> there
> is stale...
Well two actually, there is also spp_getpage(). I tried to determine if
that was also stale but wasn't confident. There were a lot of paths in.
>  
> > In my next series of this I added support for GFP_ATOMIC to this
> > code,
> > but that solution should only work for permission changing grouped
> > page
> > allocators in the protected page tables case where the direct map
> > tables are handled differently. As a general solution though
> > (that's
> > the long term intention right?), GFP_ATOMIC might deserve some
> > consideration.
> 
> ... but for the general solution GFP_ATOMIC indeed deserves some
> consideration.
>  
> > The other thing is we probably don't want to clean out the atomic
> > reserves and add them to a cache just for one page. I opted to just
> > convert one page in the GFP_ATOMIC case.
> 
>  
> Do you mean to allocate one page in GFP_ATOMIC case and bypass high
> order
> allocation?
> But the CPA split is still necessary here, isn't it?
Yes, it grabs one atomic page and fragments it when there are no pages in
the grouped page cache. The CPA split is still necessary, but it should
be ok because of the special way direct map page table allocations are
handled for PKS tables. It has not been reviewed by anyone yet, and
wouldn't work as a general solution anyway.
Mike Rapoport Aug. 24, 2021, 4:54 p.m. UTC | #5
On Tue, Aug 24, 2021 at 04:38:03PM +0000, Edgecombe, Rick P wrote:
> On Tue, 2021-08-24 at 16:02 +0300, Mike Rapoport wrote:
> > > We probably want to exclude GFP_ATOMIC before calling into CPA
> > > unless
> > > debug page alloc is on, because it may need to split and sleep for
> > > the
> > > allocation. There is a page table allocation with GFP_ATOMIC passed
> > > actually.
> > 
> > Looking at the callers of alloc_low_pages() it seems that GFP_ATOMIC
> > there
> > is stale...
>
> Well two actually, there is also spp_getpage(). I tried to determine if
> that was also stale but wasn't confident. There were a lot of paths in.
  
It's also used at init and during memory hotplug, so I really doubt it
needs GFP_ATOMIC.

> > > In my next series of this I added support for GFP_ATOMIC to this
> > > code,
> > > but that solution should only work for permission changing grouped
> > > page
> > > allocators in the protected page tables case where the direct map
> > > tables are handled differently. As a general solution though
> > > (that's
> > > the long term intention right?), GFP_ATOMIC might deserve some
> > > consideration.
> > 
> > ... but for the general solution GFP_ATOMIC indeed deserves some
> > consideration.
> >  
> > > The other thing is we probably don't want to clean out the atomic
> > > reserves and add them to a cache just for one page. I opted to just
> > > convert one page in the GFP_ATOMIC case.
> >  
> > Do you mean to allocate one page in GFP_ATOMIC case and bypass high
> > order
> > allocation?
> > But the CPA split is still necessary here, isn't it?
>
> Yes, grabs one atomic page and fragments it in the case of no pages in
> the grouped page cache. The CPA split is necessary still, but it should
> be ok because of the special way direct map page table allocations are
> handled for pks tables. Has not been reviewed by anyone yet, and
> wouldn't work as a general solution anyway.
Edgecombe, Rick P Aug. 24, 2021, 5:23 p.m. UTC | #6
On Tue, 2021-08-24 at 19:54 +0300, Mike Rapoport wrote:
> On Tue, Aug 24, 2021 at 04:38:03PM +0000, Edgecombe, Rick P wrote:
> > On Tue, 2021-08-24 at 16:02 +0300, Mike Rapoport wrote:
> > > > We probably want to exclude GFP_ATOMIC before calling into CPA
> > > > unless
> > > > debug page alloc is on, because it may need to split and sleep
> > > > for
> > > > the
> > > > allocation. There is a page table allocation with GFP_ATOMIC
> > > > passed
> > > > actually.
> > > 
> > > Looking at the callers of alloc_low_pages() it seems that
> > > GFP_ATOMIC
> > > there
> > > is stale...
> > 
> > Well two actually, there is also spp_getpage(). I tried to
> > determine if
> > that was also stale but wasn't confident. There were a lot of paths
> > in.
> 
>   
> It's also used at init and during memory hotplug, so I really doubt
> it
> needs GFP_ATOMIC.

Pretty sure it gets called after init by at least something besides
hotplug. I saw it during debugging with a little sanitizer I built to
find any unprotected page tables missed. Something tweaking the fixmap
IIRC. Did you look at the set_fixmap_() and set_pte_vaddr() family of
functions? Now whether any of them actually need GFP_ATOMIC, I am less
sure. There were a fair amount of drivers to analyze.
Mike Rapoport Aug. 24, 2021, 5:37 p.m. UTC | #7
On Tue, Aug 24, 2021 at 05:23:04PM +0000, Edgecombe, Rick P wrote:
> On Tue, 2021-08-24 at 19:54 +0300, Mike Rapoport wrote:
> > On Tue, Aug 24, 2021 at 04:38:03PM +0000, Edgecombe, Rick P wrote:
> > > On Tue, 2021-08-24 at 16:02 +0300, Mike Rapoport wrote:
> > > > > We probably want to exclude GFP_ATOMIC before calling into CPA
> > > > > unless
> > > > > debug page alloc is on, because it may need to split and sleep
> > > > > for
> > > > > the
> > > > > allocation. There is a page table allocation with GFP_ATOMIC
> > > > > passed
> > > > > actually.
> > > > 
> > > > Looking at the callers of alloc_low_pages() it seems that
> > > > GFP_ATOMIC
> > > > there
> > > > is stale...
> > > 
> > > Well two actually, there is also spp_getpage(). I tried to
> > > determine if
> > > that was also stale but wasn't confident. There were a lot of paths
> > > in.
> > 
> >   
> > It's also used at init and during memory hotplug, so I really doubt
> > it
> > needs GFP_ATOMIC.
> 
> Pretty sure it gets called after init by at least something besides
> hotplug. I saw it during debugging with a little sanitizer I built to
> find any unprotected page tables missed. Something tweaking the fixmap
> IIRC. Did you look at the set_fixmap_() and set_pte_vaddr() family of
> functions? Now whether any of them actually need GFP_ATOMIC, I am less
> sure. There were a fair amount of drivers to analyze.

Oh, I've missed set_pte_vaddr(). I still doubt anything that uses those two
would need GFP_ATOMIC, but it's surely way harder to analyze.
David Hildenbrand Aug. 25, 2021, 8:43 a.m. UTC | #8
On 23.08.21 15:25, Mike Rapoport wrote:
> From: Mike Rapoport <rppt@linux.ibm.com>
> 
> When __GFP_PTE_MAPPED flag is passed to an allocation request of order 0,
> the allocated page will be mapped at PTE level in the direct map.
> 
> To reduce the direct map fragmentation, maintain a cache of 4K pages that
> are already mapped at PTE level in the direct map. Whenever the cache
> should be replenished, try to allocate 2M page and split it to 4K pages
> to localize shutter of the direct map. If the allocation of 2M page fails,
> fallback to a single page allocation at expense of the direct map
> fragmentation.
> 
> The cache registers a shrinker that releases free pages from the cache to
> the page allocator.
> 
> The __GFP_PTE_MAPPED and caching of 4K pages are enabled only if an
> architecture selects ARCH_WANTS_PTE_MAPPED_CACHE in its Kconfig.
> 
> [
> cache management are mostly copied from
> https://lore.kernel.org/lkml/20210505003032.489164-4-rick.p.edgecombe@intel.com/
> ]
> 
> Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
> ---
>   arch/Kconfig                    |   8 +
>   arch/x86/Kconfig                |   1 +
>   include/linux/gfp.h             |  11 +-
>   include/linux/mm.h              |   2 +
>   include/linux/pageblock-flags.h |  26 ++++
>   init/main.c                     |   1 +
>   mm/internal.h                   |   3 +-
>   mm/page_alloc.c                 | 261 +++++++++++++++++++++++++++++++-
>   8 files changed, 309 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/Kconfig b/arch/Kconfig
> index 129df498a8e1..2db95331201b 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -243,6 +243,14 @@ config ARCH_HAS_SET_MEMORY
>   config ARCH_HAS_SET_DIRECT_MAP
>   	bool

[...]

> +static int __pte_mapped_cache_init(struct pte_mapped_cache *cache)
> +{
> +	int err;
> +
> +	err = list_lru_init(&cache->lru);
> +	if (err)
> +		return err;
> +
> +	cache->shrinker.count_objects = pte_mapped_cache_shrink_count;
> +	cache->shrinker.scan_objects = pte_mapped_cache_shrink_scan;
> +	cache->shrinker.seeks = DEFAULT_SEEKS;
> +	cache->shrinker.flags = SHRINKER_NUMA_AWARE;
> +
> +	err = register_shrinker(&cache->shrinker);
> +	if (err)
> +		goto err_list_lru_destroy;

With a shrinker in place, it really does somewhat feel like this should 
be a cache outside of the buddy. Or at least moved outside of 
page_alloc.c with a clean interface to work with the buddy.

But I only had a quick glimpse over this patch.
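
A rough sketch of what such a boundary could look like, with hypothetical
names (the cache would keep its list_lru/shrinker internals private and the
buddy would only see a narrow alloc/free/init interface):

/* hypothetical mm/pte_mapped_cache.h, names are placeholders */
struct page *pte_mapped_cache_alloc(gfp_t gfp);
bool pte_mapped_cache_free(struct page *page);	/* true if the cache took it */
void pte_mapped_cache_init(void);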

Patch

diff --git a/arch/Kconfig b/arch/Kconfig
index 129df498a8e1..2db95331201b 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -243,6 +243,14 @@  config ARCH_HAS_SET_MEMORY
 config ARCH_HAS_SET_DIRECT_MAP
 	bool
 
+#
+# Select if the architecture wants to minimize fragmentation of its
+# direct/linear map caused by set_memory and set_direct_map operations
+#
+config ARCH_WANTS_PTE_MAPPED_CACHE
+	bool
+	depends on ARCH_HAS_SET_MEMORY || ARCH_HAS_SET_DIRECT_MAP
+
 #
 # Select if the architecture provides the arch_dma_set_uncached symbol to
 # either provide an uncached segment alias for a DMA allocation, or
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 88fb922c23a0..9b4e6cf4a6aa 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -118,6 +118,7 @@  config X86
 	select ARCH_WANTS_NO_INSTR
 	select ARCH_WANT_HUGE_PMD_SHARE
 	select ARCH_WANT_LD_ORPHAN_WARN
+	select ARCH_WANTS_PTE_MAPPED_CACHE
 	select ARCH_WANTS_THP_SWAP		if X86_64
 	select BUILDTIME_TABLE_SORT
 	select CLKEVT_I8253
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 55b2ec1f965a..c9006e3c67ad 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -55,8 +55,9 @@  struct vm_area_struct;
 #define ___GFP_ACCOUNT		0x400000u
 #define ___GFP_ZEROTAGS		0x800000u
 #define ___GFP_SKIP_KASAN_POISON	0x1000000u
+#define ___GFP_PTE_MAPPED	0x2000000u
 #ifdef CONFIG_LOCKDEP
-#define ___GFP_NOLOCKDEP	0x2000000u
+#define ___GFP_NOLOCKDEP	0x4000000u
 #else
 #define ___GFP_NOLOCKDEP	0
 #endif
@@ -101,12 +102,18 @@  struct vm_area_struct;
  * node with no fallbacks or placement policy enforcements.
  *
  * %__GFP_ACCOUNT causes the allocation to be accounted to kmemcg.
+ *
+ * %__GFP_PTE_MAPPED returns a page that is mapped with PTE in the
+ * direct map. On architectures that use higher page table levels to map
+ * physical memory, this flag will cause splitting of large pages in the direct
+ * mapping. Has effect only if CONFIG_ARCH_WANTS_PTE_MAPPED_CACHE is set.
  */
 #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE)
 #define __GFP_WRITE	((__force gfp_t)___GFP_WRITE)
 #define __GFP_HARDWALL   ((__force gfp_t)___GFP_HARDWALL)
 #define __GFP_THISNODE	((__force gfp_t)___GFP_THISNODE)
 #define __GFP_ACCOUNT	((__force gfp_t)___GFP_ACCOUNT)
+#define __GFP_PTE_MAPPED ((__force gfp_t)___GFP_PTE_MAPPED)
 
 /**
  * DOC: Watermark modifiers
@@ -249,7 +256,7 @@  struct vm_area_struct;
 #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
 
 /* Room for N __GFP_FOO bits */
-#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP))
+#define __GFP_BITS_SHIFT (26 + IS_ENABLED(CONFIG_LOCKDEP))
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
 
 /**
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7ca22e6e694a..350ec98b82d2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3283,5 +3283,7 @@  static inline int seal_check_future_write(int seals, struct vm_area_struct *vma)
 	return 0;
 }
 
+void pte_mapped_cache_init(void);
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index 973fd731a520..4faf8c542b00 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -21,6 +21,8 @@  enum pageblock_bits {
 			/* 3 bits required for migrate types */
 	PB_migrate_skip,/* If set the block is skipped by compaction */
 
+	PB_pte_mapped, /* If set the block is mapped with PTEs in direct map */
+
 	/*
 	 * Assume the bits will always align on a word. If this assumption
 	 * changes then get/set pageblock needs updating.
@@ -88,4 +90,28 @@  static inline void set_pageblock_skip(struct page *page)
 }
 #endif /* CONFIG_COMPACTION */
 
+#ifdef CONFIG_ARCH_WANTS_PTE_MAPPED_CACHE
+#define get_pageblock_pte_mapped(page)				\
+	get_pfnblock_flags_mask(page, page_to_pfn(page),	\
+			(1 << (PB_pte_mapped)))
+#define clear_pageblock_pte_mapped(page) \
+	set_pfnblock_flags_mask(page, 0, page_to_pfn(page),	\
+			(1 << PB_pte_mapped))
+#define set_pageblock_pte_mapped(page) \
+	set_pfnblock_flags_mask(page, (1 << PB_pte_mapped),	\
+			page_to_pfn(page),			\
+			(1 << PB_pte_mapped))
+#else /* CONFIG_ARCH_WANTS_PTE_MAPPED_CACHE */
+static inline bool get_pageblock_pte_mapped(struct page *page)
+{
+	return false;
+}
+static inline void clear_pageblock_pte_mapped(struct page *page)
+{
+}
+static inline void set_pageblock_pte_mapped(struct page *page)
+{
+}
+#endif /* CONFIG_ARCH_WANTS_PTE_MAPPED_CACHE */
+
 #endif	/* PAGEBLOCK_FLAGS_H */
diff --git a/init/main.c b/init/main.c
index f5b8246e8aa1..c0d59a183a39 100644
--- a/init/main.c
+++ b/init/main.c
@@ -828,6 +828,7 @@  static void __init mm_init(void)
 	page_ext_init_flatmem_late();
 	kmem_cache_init();
 	kmemleak_init();
+	pte_mapped_cache_init();
 	pgtable_init();
 	debug_objects_mem_init();
 	vmalloc_init();
diff --git a/mm/internal.h b/mm/internal.h
index 31ff935b2547..0557ece6ebf4 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -24,7 +24,8 @@ 
 			__GFP_ATOMIC)
 
 /* The GFP flags allowed during early boot */
-#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))
+#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS|\
+					   __GFP_PTE_MAPPED))
 
 /* Control allocation cpuset and node placement constraints */
 #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 856b175c15a4..7936d8dcb80b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -72,6 +72,7 @@ 
 #include <linux/padata.h>
 #include <linux/khugepaged.h>
 #include <linux/buffer_head.h>
+#include <linux/set_memory.h>
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -107,6 +108,14 @@  typedef int __bitwise fpi_t;
  */
 #define FPI_TO_TAIL		((__force fpi_t)BIT(1))
 
+/*
+ * Free the page directly to the page allocator rather than checking if it
+ * should be placed into the cache of pte-mapped pages.
+ * Used by the pte_mapped cache shrinker.
+ * Has effect only when the pte-mapped cache is enabled.
+ */
+#define FPI_NO_PTE_MAP		((__force fpi_t)BIT(2))
+
 /*
  * Don't poison memory with KASAN (only for the tag-based modes).
  * During boot, all non-reserved memblock memory is exposed to page_alloc.
@@ -225,6 +234,19 @@  static inline void set_pcppage_migratetype(struct page *page, int migratetype)
 	page->index = migratetype;
 }
 
+#ifdef CONFIG_ARCH_WANTS_PTE_MAPPED_CACHE
+static struct page *alloc_page_pte_mapped(gfp_t gfp);
+static void free_page_pte_mapped(struct page *page);
+#else
+static inline struct page *alloc_page_pte_mapped(gfp_t gfp)
+{
+	return NULL;
+}
+static void free_page_pte_mapped(struct page *page)
+{
+}
+#endif
+
 #ifdef CONFIG_PM_SLEEP
 /*
  * The following functions are used by the suspend/hibernate code to temporarily
@@ -536,7 +558,7 @@  void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
 	unsigned long bitidx, word_bitidx;
 	unsigned long old_word, word;
 
-	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
+	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 5);
 	BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
 
 	bitmap = get_pageblock_bitmap(page, pfn);
@@ -1352,6 +1374,16 @@  static __always_inline bool free_pages_prepare(struct page *page,
 					   PAGE_SIZE << order);
 	}
 
+	/*
+	 * Unless we are shrinking the pte_mapped cache, return a page from
+	 * a pageblock mapped with PTEs to that cache.
+	 */
+	if (!order && !(fpi_flags & FPI_NO_PTE_MAP) &&
+	    get_pageblock_pte_mapped(page)) {
+		free_page_pte_mapped(page);
+		return false;
+	}
+
 	kernel_poison_pages(page, 1 << order);
 
 	/*
@@ -3445,6 +3477,13 @@  void free_unref_page_list(struct list_head *list)
 	/* Prepare pages for freeing */
 	list_for_each_entry_safe(page, next, list, lru) {
 		pfn = page_to_pfn(page);
+
+		if (get_pageblock_pte_mapped(page)) {
+			list_del(&page->lru);
+			free_page_pte_mapped(page);
+			continue;
+		}
+
 		if (!free_unref_page_prepare(page, pfn, 0))
 			list_del(&page->lru);
 
@@ -5381,6 +5420,12 @@  struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
 			&alloc_gfp, &alloc_flags))
 		return NULL;
 
+	if ((alloc_gfp & __GFP_PTE_MAPPED) && order == 0) {
+		page = alloc_page_pte_mapped(alloc_gfp);
+		if (page)
+			goto out;
+	}
+
 	/*
 	 * Forbid the first pass from falling back to types that fragment
 	 * memory until all local zones are considered.
@@ -5681,6 +5726,220 @@  void free_pages_exact(void *virt, size_t size)
 }
 EXPORT_SYMBOL(free_pages_exact);
 
+#ifdef CONFIG_ARCH_WANTS_PTE_MAPPED_CACHE
+
+struct pte_mapped_cache {
+	struct shrinker shrinker;
+	struct list_lru lru;
+	atomic_t nid_round_robin;
+	unsigned long free_cnt;
+};
+
+static struct pte_mapped_cache pte_mapped_cache;
+
+static struct page *pte_mapped_cache_get(struct pte_mapped_cache *cache)
+{
+	unsigned long start_nid, i;
+	struct list_head *head;
+	struct page *page = NULL;
+
+	start_nid = atomic_fetch_inc(&cache->nid_round_robin) % nr_node_ids;
+	for (i = 0; i < nr_node_ids; i++) {
+		int cur_nid = (start_nid + i) % nr_node_ids;
+
+		head = list_lru_get_mru(&cache->lru, cur_nid);
+		if (head) {
+			page = list_entry(head, struct page, lru);
+			break;
+		}
+	}
+
+	return page;
+}
+
+static void pte_mapped_cache_add(struct pte_mapped_cache *cache,
+				 struct page *page)
+{
+	INIT_LIST_HEAD(&page->lru);
+	list_lru_add_node(&cache->lru, &page->lru, page_to_nid(page));
+	set_page_count(page, 0);
+}
+
+static void pte_mapped_cache_add_neighbour_pages(struct page *page)
+{
+#if 0
+	/*
+	 * TODO: if pte_mapped_cache_replenish() had to fallback to order-0
+	 * allocation, the large page in the direct map will be split
+	 * anyway and if there are free pages in the same pageblock they
+	 * can be added to pte_mapped cache.
+	 */
+	unsigned int order = (1 << HUGETLB_PAGE_ORDER);
+	unsigned int nr_pages = (1 << order);
+	unsigned long pfn = page_to_pfn(page);
+	struct page *page_head = page - (pfn & (order - 1));
+
+	for (i = 0; i < nr_pages; i++) {
+		page = page_head + i;
+		if (is_free_buddy_page(page)) {
+			take_page_off_buddy(page);
+			pte_mapped_cache_add(&pte_mapped_cache, page);
+		}
+	}
+#endif
+}
+
+static struct page *pte_mapped_cache_replenish(struct pte_mapped_cache *cache,
+					       gfp_t gfp)
+{
+	unsigned int order = HUGETLB_PAGE_ORDER;
+	unsigned int nr_pages;
+	struct page *page;
+	int i, err;
+
+	gfp &= ~__GFP_PTE_MAPPED;
+
+	page = alloc_pages(gfp, order);
+	if (!page) {
+		order = 0;
+		page = alloc_pages(gfp, order);
+		if (!page)
+			return NULL;
+	}
+
+	nr_pages = (1 << order);
+	err = set_memory_4k((unsigned long)page_address(page), nr_pages);
+	if (err)
+		goto err_free_pages;
+
+	if (order)
+		split_page(page, order);
+	else
+		pte_mapped_cache_add_neighbour_pages(page);
+
+	for (i = 1; i < nr_pages; i++)
+		pte_mapped_cache_add(cache, page + i);
+
+	set_pageblock_pte_mapped(page);
+
+	return page;
+
+err_free_pages:
+	__free_pages(page, order);
+	return NULL;
+}
+
+static struct page *alloc_page_pte_mapped(gfp_t gfp)
+{
+	struct pte_mapped_cache *cache = &pte_mapped_cache;
+	struct page *page;
+
+	page = pte_mapped_cache_get(cache);
+	if (page) {
+		prep_new_page(page, 0, gfp, 0);
+		goto out;
+	}
+
+	page = pte_mapped_cache_replenish(cache, gfp);
+
+out:
+	return page;
+}
+
+static void free_page_pte_mapped(struct page *page)
+{
+	pte_mapped_cache_add(&pte_mapped_cache, page);
+}
+
+static struct pte_mapped_cache *pte_mapped_cache_from_sc(struct shrinker *sh)
+{
+	return container_of(sh, struct pte_mapped_cache, shrinker);
+}
+
+static unsigned long pte_mapped_cache_shrink_count(struct shrinker *shrinker,
+						   struct shrink_control *sc)
+{
+	struct pte_mapped_cache *cache = pte_mapped_cache_from_sc(shrinker);
+	unsigned long count = list_lru_shrink_count(&cache->lru, sc);
+
+	return count ? count : SHRINK_EMPTY;
+}
+
+static enum lru_status pte_mapped_cache_shrink_isolate(struct list_head *item,
+						       struct list_lru_one *lst,
+						       spinlock_t *lock,
+						       void *cb_arg)
+{
+	struct list_head *freeable = cb_arg;
+
+	list_lru_isolate_move(lst, item, freeable);
+
+	return LRU_REMOVED;
+}
+
+static unsigned long pte_mapped_cache_shrink_scan(struct shrinker *shrinker,
+						  struct shrink_control *sc)
+{
+	struct pte_mapped_cache *cache = pte_mapped_cache_from_sc(shrinker);
+	struct list_head *cur, *next;
+	unsigned long isolated;
+	LIST_HEAD(freeable);
+
+	isolated = list_lru_shrink_walk(&cache->lru, sc,
+					pte_mapped_cache_shrink_isolate,
+					&freeable);
+
+	list_for_each_safe(cur, next, &freeable) {
+		struct page *page = list_entry(cur, struct page, lru);
+
+		list_del(cur);
+		__free_pages_ok(page, 0, FPI_NO_PTE_MAP);
+	}
+
+	/* Every item walked gets isolated */
+	sc->nr_scanned += isolated;
+
+	return isolated;
+}
+
+static int __pte_mapped_cache_init(struct pte_mapped_cache *cache)
+{
+	int err;
+
+	err = list_lru_init(&cache->lru);
+	if (err)
+		return err;
+
+	cache->shrinker.count_objects = pte_mapped_cache_shrink_count;
+	cache->shrinker.scan_objects = pte_mapped_cache_shrink_scan;
+	cache->shrinker.seeks = DEFAULT_SEEKS;
+	cache->shrinker.flags = SHRINKER_NUMA_AWARE;
+
+	err = register_shrinker(&cache->shrinker);
+	if (err)
+		goto err_list_lru_destroy;
+
+	return 0;
+
+err_list_lru_destroy:
+	list_lru_destroy(&cache->lru);
+	return err;
+}
+
+void __init pte_mapped_cache_init(void)
+{
+	if (gfp_allowed_mask & __GFP_PTE_MAPPED)
+		return;
+
+	if (!__pte_mapped_cache_init(&pte_mapped_cache))
+		gfp_allowed_mask |= __GFP_PTE_MAPPED;
+}
+#else
+void __init pte_mapped_cache_init(void)
+{
+}
+#endif /* CONFIG_ARCH_WANTS_PTE_MAPPED_CACHE */
+
 /**
  * nr_free_zone_pages - count number of pages beyond high watermark
  * @offset: The zone index of the highest zone