
[v6,5/5] mm: THP low utilization shrinker

Message ID f4bd941ff7406ebad57ad7f47e7788ef12c6eb46.1667454613.git.alexlzhu@fb.com (mailing list archive)
State New
Series THP Shrinker

Commit Message

Alex Zhu (Kernel) Nov. 3, 2022, 6:01 a.m. UTC
From: Alexander Zhu <alexlzhu@fb.com>

This patch introduces a shrinker that will remove THPs in the lowest
utilization bucket. As previously mentioned, we have observed that
almost all of the memory waste when THPs are always enabled
is contained in the lowest utilization bucket. The shrinker will
add these THPs to a list_lru and split anonymous THPs based on
information from kswapd. It requires the changes from
thp_utilization to identify the least utilized THPs, and the
changes to split_huge_page to identify and free zero pages
within THPs.

Signed-off-by: Alexander Zhu <alexlzhu@fb.com>
---
 include/linux/huge_mm.h  |   9 ++++
 include/linux/list_lru.h |  24 +++++++++
 include/linux/mm_types.h |   5 ++
 mm/huge_memory.c         | 111 ++++++++++++++++++++++++++++++++++++++-
 mm/list_lru.c            |  49 +++++++++++++++++
 mm/page_alloc.c          |   6 +++
 mm/thp_utilization.c     |  16 ++++++
 7 files changed, 218 insertions(+), 2 deletions(-)
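
For orientation, the core of the diff below condenses to roughly the following
(a restatement of the patch for readability, not additional functionality: the
thp_utilization scan queues underutilized THPs on a memcg-aware list_lru, and
the shrinker splits them under memory pressure):

static struct list_lru huge_low_util_page_lru;

static struct shrinker huge_low_util_page_shrinker = {
	.count_objects = shrink_huge_low_util_page_count, /* HPAGE_PMD_NR * lru count */
	.scan_objects = shrink_huge_low_util_page_scan,   /* walks the lru via low_util_free_page() */
	.seeks = DEFAULT_SEEKS,
	.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE | SHRINKER_NONSLAB,
};

/* thp_util_scan() queues every scanned THP below the top utilization bucket: */
	if (bucket < THP_UTIL_BUCKET_NR - 1)
		add_underutilized_thp(page);

/*
 * low_util_free_page() then re-checks can_shrink_thp() under the page lock
 * and calls split_huge_page(), which also frees the THP's zero-filled
 * subpages (per the earlier patches in this series).
 */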

Comments

kernel test robot Nov. 3, 2022, 1:19 p.m. UTC | #1
Hi,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on linus/master]
[also build test WARNING on v6.1-rc3]
[cannot apply to shuah-kselftest/next next-20221103]
[If your patch is applied to the wrong git tree, kindly drop us a note.
When submitting a patch, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/alexlzhu-fb-com/THP-Shrinker/20221103-140549
patch link:    https://lore.kernel.org/r/f4bd941ff7406ebad57ad7f47e7788ef12c6eb46.1667454613.git.alexlzhu%40fb.com
patch subject: [PATCH v6 5/5] mm: THP low utilization shrinker
config: s390-randconfig-r001-20221102
compiler: clang version 15.0.4 (https://github.com/llvm/llvm-project 5c68a1cb123161b54b72ce90e7975d95a8eaf2a4)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install s390 cross compiling tool for clang build
        # apt-get install binutils-s390x-linux-gnu
        # https://github.com/intel-lab-lkp/linux/commit/f74fc6ce0f2b2de5c0e302f7c3a20774b117ea68
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review alexlzhu-fb-com/THP-Shrinker/20221103-140549
        git checkout f74fc6ce0f2b2de5c0e302f7c3a20774b117ea68
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=s390 SHELL=/bin/bash

If you fix the issue, kindly add the following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> mm/huge_memory.c:2970:1: warning: unused label 'out_unlock' [-Wunused-label]
   out_unlock:
   ^~~~~~~~~~~
   1 warning generated.


vim +/out_unlock +2970 mm/huge_memory.c

  2967	
  2968		list_lru_add_page(&huge_low_util_page_lru, page, page_underutilized_thp_list(page));
  2969	
> 2970	out_unlock:
  2971		unlock_page(page);
  2972	out_put:
  2973		put_page(page);
  2974	}
  2975
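
The warning arises because the only goto to out_unlock in add_underutilized_thp()
sits inside the #ifdef CONFIG_MEMCG_KMEM block, so configs without that option
leave the label unused. A minimal sketch of one possible fix for the tail of the
function, assuming memcg_list_lru_alloc() is declared (or has a no-op stub) even
when CONFIG_MEMCG_KMEM is disabled, is to switch to IS_ENABLED() so the compiler
always sees the branch:

	lock_page(page);

	/*
	 * The branch is compiled unconditionally and optimized away when
	 * CONFIG_MEMCG_KMEM is off, so the label is never considered unused.
	 */
	if (IS_ENABLED(CONFIG_MEMCG_KMEM) &&
	    memcg_list_lru_alloc(page_memcg(page), &huge_low_util_page_lru, GFP_KERNEL))
		goto out_unlock;

	list_lru_add_page(&huge_low_util_page_lru, page, page_underutilized_thp_list(page));

out_unlock:
	unlock_page(page);
out_put:	/* reached via the is_huge_zero_page() check earlier in the function */
	put_page(page);
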
Tarun Sahu Jan. 6, 2023, 1:41 p.m. UTC | #2
Hi Alex,

I think it would be better to have the 90% underutilization threshold for
the shrinker defined as a macro rather than hard coded.

like,

/*
 * The threshold bucket: THPs below this bucket are considered
 * underutilized and can be shrunk.
 */
#define THP_NRML_UTILIZED_BUCKET (THP_UTIL_BUCKET_NR - 1)

This way, it is clearer.
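
With that definition, the two call sites quoted below would read roughly as
follows (a sketch only; THP_NRML_UTILIZED_BUCKET is the name proposed above
and is not part of the posted patch):

/* in can_shrink_thp() */
	return bucket >= 0 && bucket < THP_NRML_UTILIZED_BUCKET;

/* in thp_util_scan() */
	if (bucket < THP_NRML_UTILIZED_BUCKET)
		add_underutilized_thp(page);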

--skip
> +bool can_shrink_thp(struct folio *folio)
> +{
> +	int bucket, num_utilized_pages;
> +
> +	if (!folio || !folio_test_anon(folio) || !folio_test_transhuge(folio))
> +		return false;
> +
> +	num_utilized_pages = thp_number_utilized_pages(folio);
> +	bucket = thp_utilization_bucket(num_utilized_pages);
> +
> +	return bucket >= 0 && bucket < THP_UTIL_BUCKET_NR - 1;
This can be

         return bucket >= 0 && bucket < THP_NRML_UTILIZED_BUCKET;



> +}
> +
>  static void thp_scan_next_zone(void)
>  {
>  	struct timespec64 current_time;
> @@ -170,6 +183,9 @@ static void thp_util_scan(unsigned long pfn_end)
>  		if (bucket < 0)
>  			continue;
>  
> +		if (bucket < THP_UTIL_BUCKET_NR - 1)
Similarly, this one.

There might be other instances too.
> +			add_underutilized_thp(page);
> +
>  		thp_scan.buckets[bucket].nr_thps++;
>  		thp_scan.buckets[bucket].nr_zero_pages += (HPAGE_PMD_NR - num_utilized_pages);
>  	}
> -- 
> 2.30.2

Patch

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index a1341fdcf666..1745c94eb103 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -178,6 +178,8 @@  bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
 unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
 		unsigned long len, unsigned long pgoff, unsigned long flags);
 
+bool can_shrink_thp(struct folio *folio);
+
 void prep_transhuge_page(struct page *page);
 void free_transhuge_page(struct page *page);
 
@@ -189,6 +191,8 @@  static inline int split_huge_page(struct page *page)
 }
 void deferred_split_huge_page(struct page *page);
 
+void add_underutilized_thp(struct page *page);
+
 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long address, bool freeze, struct folio *folio);
 
@@ -302,6 +306,11 @@  static inline struct list_head *page_deferred_list(struct page *page)
 	return &page[2].deferred_list;
 }
 
+static inline struct list_head *page_underutilized_thp_list(struct page *page)
+{
+	return &page[3].underutilized_thp_list;
+}
+
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
 #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index b35968ee9fb5..c2cf146ea880 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -89,6 +89,18 @@  void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren
  */
 bool list_lru_add(struct list_lru *lru, struct list_head *item);
 
+/**
+ * list_lru_add_page: add an element to the lru list's tail
+ * @list_lru: the lru pointer
+ * @page: the page containing the item
+ * @item: the item to be added.
+ *
+ * This function works the same as list_lru_add in terms of list
+ * manipulation. Used for non slab objects contained in the page.
+ *
+ * Return value: true if the list was updated, false otherwise
+ */
+bool list_lru_add_page(struct list_lru *lru, struct page *page, struct list_head *item);
 /**
  * list_lru_del: delete an element to the lru list
  * @list_lru: the lru pointer
@@ -102,6 +114,18 @@  bool list_lru_add(struct list_lru *lru, struct list_head *item);
  */
 bool list_lru_del(struct list_lru *lru, struct list_head *item);
 
+/**
+ * list_lru_del_page: delete an element from the lru list
+ * @list_lru: the lru pointer
+ * @page: the page containing the item
+ * @item: the item to be deleted.
+ *
+ * This function works the same as list_lru_del in terms of list
+ * manipulation. Used for non slab objects contained in the page.
+ *
+ * Return value: true if the list was updated, false otherwise
+ */
+bool list_lru_del_page(struct list_lru *lru, struct page *page, struct list_head *item);
 /**
  * list_lru_count_one: return the number of objects currently held by @lru
  * @lru: the lru pointer.
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 500e536796ca..da1d1cf42158 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -152,6 +152,11 @@  struct page {
 			/* For both global and memcg */
 			struct list_head deferred_list;
 		};
+		struct { /* Third tail page of compound page */
+			unsigned long _compound_pad_3; /* compound_head */
+			unsigned long _compound_pad_4;
+			struct list_head underutilized_thp_list;
+		};
 		struct {	/* Page table pages */
 			unsigned long _pt_pad_1;	/* compound_head */
 			pgtable_t pmd_huge_pte; /* protected by page->ptl */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index cba0bbbb2a93..012af3c3f1c7 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -71,6 +71,8 @@  static atomic_t huge_zero_refcount;
 struct page *huge_zero_page __read_mostly;
 unsigned long huge_zero_pfn __read_mostly = ~0UL;
 
+static struct list_lru huge_low_util_page_lru;
+
 bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
 			bool smaps, bool in_pf, bool enforce_sysfs)
 {
@@ -234,6 +236,53 @@  static struct shrinker huge_zero_page_shrinker = {
 	.seeks = DEFAULT_SEEKS,
 };
 
+static enum lru_status low_util_free_page(struct list_head *item,
+					  struct list_lru_one *lru,
+					  spinlock_t *lru_lock,
+					  void *cb_arg)
+{
+	struct folio *folio = page_folio(list_entry(item, struct page, underutilized_thp_list));
+	struct page *head = &folio->page;
+
+	if (get_page_unless_zero(head)) {
+		/* Inverse lock order from add_underutilized_thp() */
+		if (!trylock_page(head)) {
+			put_page(head);
+			return LRU_SKIP;
+		}
+		list_lru_isolate(lru, item);
+		spin_unlock_irq(lru_lock);
+		if (can_shrink_thp(folio))
+			split_huge_page(head);
+		spin_lock_irq(lru_lock);
+		unlock_page(head);
+		put_page(head);
+	}
+
+	return LRU_REMOVED_RETRY;
+}
+
+static unsigned long shrink_huge_low_util_page_count(struct shrinker *shrink,
+						     struct shrink_control *sc)
+{
+	return HPAGE_PMD_NR * list_lru_shrink_count(&huge_low_util_page_lru, sc);
+}
+
+static unsigned long shrink_huge_low_util_page_scan(struct shrinker *shrink,
+						    struct shrink_control *sc)
+{
+	return HPAGE_PMD_NR * list_lru_shrink_walk_irq(&huge_low_util_page_lru,
+							sc, low_util_free_page, NULL);
+}
+
+static struct shrinker huge_low_util_page_shrinker = {
+	.count_objects = shrink_huge_low_util_page_count,
+	.scan_objects = shrink_huge_low_util_page_scan,
+	.seeks = DEFAULT_SEEKS,
+	.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
+		SHRINKER_NONSLAB,
+};
+
 #ifdef CONFIG_SYSFS
 static ssize_t enabled_show(struct kobject *kobj,
 			    struct kobj_attribute *attr, char *buf)
@@ -485,6 +534,9 @@  static int __init hugepage_init(void)
 	if (err)
 		goto err_slab;
 
+	err = register_shrinker(&huge_low_util_page_shrinker, "thp-low-util");
+	if (err)
+		goto err_low_util_shrinker;
 	err = register_shrinker(&huge_zero_page_shrinker, "thp-zero");
 	if (err)
 		goto err_hzp_shrinker;
@@ -492,6 +544,9 @@  static int __init hugepage_init(void)
 	if (err)
 		goto err_split_shrinker;
 
+	err = list_lru_init_memcg(&huge_low_util_page_lru, &huge_low_util_page_shrinker);
+	if (err)
+		goto err_low_util_list_lru;
 	/*
 	 * By default disable transparent hugepages on smaller systems,
 	 * where the extra memory used could hurt more than TLB overhead
@@ -508,10 +563,14 @@  static int __init hugepage_init(void)
 
 	return 0;
 err_khugepaged:
+	list_lru_destroy(&huge_low_util_page_lru);
+err_low_util_list_lru:
 	unregister_shrinker(&deferred_split_shrinker);
 err_split_shrinker:
 	unregister_shrinker(&huge_zero_page_shrinker);
 err_hzp_shrinker:
+	unregister_shrinker(&huge_low_util_page_shrinker);
+err_low_util_shrinker:
 	khugepaged_destroy();
 err_slab:
 	hugepage_exit_sysfs(hugepage_kobj);
@@ -586,6 +645,7 @@  void prep_transhuge_page(struct page *page)
 	 */
 
 	INIT_LIST_HEAD(page_deferred_list(page));
+	INIT_LIST_HEAD(page_underutilized_thp_list(page));
 	set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
 }
 
@@ -2451,8 +2511,7 @@  static void __split_huge_page_tail(struct page *head, int tail,
 			 LRU_GEN_MASK | LRU_REFS_MASK));
 
 	/* ->mapping in first tail page is compound_mapcount */
-	VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
-			page_tail);
+	VM_BUG_ON_PAGE(tail > 3 && page_tail->mapping != TAIL_MAPPING, page_tail);
 	page_tail->mapping = head->mapping;
 	page_tail->index = head->index + tail;
 
@@ -2669,6 +2728,7 @@  int split_huge_page_to_list(struct page *page, struct list_head *list)
 	struct folio *folio = page_folio(page);
 	struct deferred_split *ds_queue = get_deferred_split_queue(&folio->page);
 	XA_STATE(xas, &folio->mapping->i_pages, folio->index);
+	struct list_head *underutilized_thp_list = page_underutilized_thp_list(&folio->page);
 	struct anon_vma *anon_vma = NULL;
 	struct address_space *mapping = NULL;
 	int extra_pins, ret;
@@ -2776,6 +2836,10 @@  int split_huge_page_to_list(struct page *page, struct list_head *list)
 			list_del(page_deferred_list(&folio->page));
 		}
 		spin_unlock(&ds_queue->split_queue_lock);
+		/* Frozen refs lock out additions, test can be lockless */
+		if (!list_empty(underutilized_thp_list))
+			list_lru_del_page(&huge_low_util_page_lru, &folio->page,
+					  underutilized_thp_list);
 		if (mapping) {
 			int nr = folio_nr_pages(folio);
 
@@ -2818,6 +2882,7 @@  int split_huge_page_to_list(struct page *page, struct list_head *list)
 void free_transhuge_page(struct page *page)
 {
 	struct deferred_split *ds_queue = get_deferred_split_queue(page);
+	struct list_head *underutilized_thp_list = page_underutilized_thp_list(page);
 	unsigned long flags;
 
 	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
@@ -2826,6 +2891,13 @@  void free_transhuge_page(struct page *page)
 		list_del(page_deferred_list(page));
 	}
 	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+	/* A dead page cannot be re-added to the THP shrinker, test can be lockless */
+	if (!list_empty(underutilized_thp_list))
+		list_lru_del_page(&huge_low_util_page_lru, page, underutilized_thp_list);
+
+	if (PageLRU(page))
+		__folio_clear_lru_flags(page_folio(page));
+
 	free_compound_page(page);
 }
 
@@ -2866,6 +2938,41 @@  void deferred_split_huge_page(struct page *page)
 	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
 }
 
+void add_underutilized_thp(struct page *page)
+{
+	VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+	VM_BUG_ON_PAGE(!PageAnon(page), page);
+
+	/* hugetlbfs pages do not have an associated memcgroup */
+	if (PageHuge(page))
+		return;
+
+	/*
+	 * Need to take a reference on the page to prevent the page from getting free'd from
+	 * under us while we are adding the THP to the shrinker.
+	 */
+	if (!get_page_unless_zero(page))
+		return;
+
+	if (is_huge_zero_page(page))
+		goto out_put;
+
+	/* Stabilize page->memcg to allocate and add to the same list */
+	lock_page(page);
+
+#ifdef CONFIG_MEMCG_KMEM
+	if (memcg_list_lru_alloc(page_memcg(page), &huge_low_util_page_lru, GFP_KERNEL))
+		goto out_unlock;
+#endif
+
+	list_lru_add_page(&huge_low_util_page_lru, page, page_underutilized_thp_list(page));
+
+out_unlock:
+	unlock_page(page);
+out_put:
+	put_page(page);
+}
+
 static unsigned long deferred_split_count(struct shrinker *shrink,
 		struct shrink_control *sc)
 {
diff --git a/mm/list_lru.c b/mm/list_lru.c
index a05e5bef3b40..8cc56a84b554 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -140,6 +140,32 @@  bool list_lru_add(struct list_lru *lru, struct list_head *item)
 }
 EXPORT_SYMBOL_GPL(list_lru_add);
 
+bool list_lru_add_page(struct list_lru *lru, struct page *page, struct list_head *item)
+{
+	int nid = page_to_nid(page);
+	struct list_lru_node *nlru = &lru->node[nid];
+	struct list_lru_one *l;
+	struct mem_cgroup *memcg;
+	unsigned long flags;
+
+	spin_lock_irqsave(&nlru->lock, flags);
+	if (list_empty(item)) {
+		memcg = page_memcg(page);
+		l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
+		list_add_tail(item, &l->list);
+		/* Set shrinker bit if the first element was added */
+		if (!l->nr_items++)
+			set_shrinker_bit(memcg, nid,
+					 lru_shrinker_id(lru));
+		nlru->nr_items++;
+		spin_unlock_irqrestore(&nlru->lock, flags);
+		return true;
+	}
+	spin_unlock_irqrestore(&nlru->lock, flags);
+	return false;
+}
+EXPORT_SYMBOL_GPL(list_lru_add_page);
+
 bool list_lru_del(struct list_lru *lru, struct list_head *item)
 {
 	int nid = page_to_nid(virt_to_page(item));
@@ -160,6 +186,29 @@  bool list_lru_del(struct list_lru *lru, struct list_head *item)
 }
 EXPORT_SYMBOL_GPL(list_lru_del);
 
+bool list_lru_del_page(struct list_lru *lru, struct page *page, struct list_head *item)
+{
+	int nid = page_to_nid(page);
+	struct list_lru_node *nlru = &lru->node[nid];
+	struct list_lru_one *l;
+	struct mem_cgroup *memcg;
+	unsigned long flags;
+
+	spin_lock_irqsave(&nlru->lock, flags);
+	if (!list_empty(item)) {
+		memcg = page_memcg(page);
+		l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
+		list_del_init(item);
+		l->nr_items--;
+		nlru->nr_items--;
+		spin_unlock_irqrestore(&nlru->lock, flags);
+		return true;
+	}
+	spin_unlock_irqrestore(&nlru->lock, flags);
+	return false;
+}
+EXPORT_SYMBOL_GPL(list_lru_del_page);
+
 void list_lru_isolate(struct list_lru_one *list, struct list_head *item)
 {
 	list_del_init(item);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 218b28ee49ed..dbb849e6e0e2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1336,6 +1336,12 @@  static int free_tail_pages_check(struct page *head_page, struct page *page)
 		 * deferred_list.next -- ignore value.
 		 */
 		break;
+	case 3:
+		/*
+		 * the third tail page: ->mapping is
+		 * underutilized_thp_list.next -- ignore value.
+		 */
+		break;
 	default:
 		if (page->mapping != TAIL_MAPPING) {
 			bad_page(page, "corrupted mapping in tail page");
diff --git a/mm/thp_utilization.c b/mm/thp_utilization.c
index cdbb7d5c9f39..0cb18f122c57 100644
--- a/mm/thp_utilization.c
+++ b/mm/thp_utilization.c
@@ -116,6 +116,19 @@  static int thp_number_utilized_pages(struct folio *folio)
 	return thp_nr_utilized_pages;
 }
 
+bool can_shrink_thp(struct folio *folio)
+{
+	int bucket, num_utilized_pages;
+
+	if (!folio || !folio_test_anon(folio) || !folio_test_transhuge(folio))
+		return false;
+
+	num_utilized_pages = thp_number_utilized_pages(folio);
+	bucket = thp_utilization_bucket(num_utilized_pages);
+
+	return bucket >= 0 && bucket < THP_UTIL_BUCKET_NR - 1;
+}
+
 static void thp_scan_next_zone(void)
 {
 	struct timespec64 current_time;
@@ -170,6 +183,9 @@  static void thp_util_scan(unsigned long pfn_end)
 		if (bucket < 0)
 			continue;
 
+		if (bucket < THP_UTIL_BUCKET_NR - 1)
+			add_underutilized_thp(page);
+
 		thp_scan.buckets[bucket].nr_thps++;
 		thp_scan.buckets[bucket].nr_zero_pages += (HPAGE_PMD_NR - num_utilized_pages);
 	}