@@ -178,6 +178,8 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags);
+bool can_shrink_thp(struct folio *folio);
+
void prep_transhuge_page(struct page *page);
void free_transhuge_page(struct page *page);
@@ -189,6 +191,8 @@ static inline int split_huge_page(struct page *page)
}
void deferred_split_huge_page(struct page *page);
+void add_underutilized_thp(struct page *page);
+
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long address, bool freeze, struct folio *folio);
@@ -302,6 +306,11 @@ static inline struct list_head *page_deferred_list(struct page *page)
return &page[2].deferred_list;
}
+static inline struct list_head *page_underutilized_thp_list(struct page *page)
+{
+ return &page[3].underutilized_thp_list;
+}
+
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
@@ -89,6 +89,18 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren
*/
bool list_lru_add(struct list_lru *lru, struct list_head *item);
+/**
+ * list_lru_add_page: add an element to the lru list's tail
+ * @lru: the lru pointer
+ * @page: the page containing the item
+ * @item: the item to be added.
+ *
+ * Works like list_lru_add(), but takes the node and memcg from @page
+ * rather than deriving them from @item; use it for non-slab objects
+ * embedded in a page, e.g. a list_head stored in a tail struct page.
+ * Return value: true if the list was updated, false otherwise
+ */
+bool list_lru_add_page(struct list_lru *lru, struct page *page, struct list_head *item);
/**
* list_lru_del: delete an element to the lru list
* @list_lru: the lru pointer
@@ -102,6 +114,18 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item);
*/
bool list_lru_del(struct list_lru *lru, struct list_head *item);
+/**
+ * list_lru_del_page: delete an element from the lru list
+ * @lru: the lru pointer
+ * @page: the page containing the item
+ * @item: the item to be deleted.
+ *
+ * Works like list_lru_del(), but takes the node and memcg from @page
+ * rather than deriving them from @item; use it for non-slab objects
+ * embedded in a page, e.g. a list_head stored in a tail struct page.
+ * Return value: true if the list was updated, false otherwise
+ */
+bool list_lru_del_page(struct list_lru *lru, struct page *page, struct list_head *item);
/**
* list_lru_count_one: return the number of objects currently held by @lru
* @lru: the lru pointer.
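A minimal usage sketch of the two new helpers, under the assumption that @item is a list_head embedded in a tail struct page as in the THP shrinker later in this patch (the lru and wrapper names here are illustrative, not part of the patch):

static struct list_lru thp_lru;	/* illustrative; initialised with list_lru_init_memcg() */

static void thp_lru_track(struct page *head)
{
	/* Adds only if the item is currently unlinked; nid/memcg come from @head */
	list_lru_add_page(&thp_lru, head, page_underutilized_thp_list(head));
}

static void thp_lru_untrack(struct page *head)
{
	struct list_head *item = page_underutilized_thp_list(head);

	/* Deletes only if the item is linked; the lockless emptiness test is
	 * fine once no further additions can race. */
	if (!list_empty(item))
		list_lru_del_page(&thp_lru, head, item);
}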
@@ -152,6 +152,11 @@ struct page {
/* For both global and memcg */
struct list_head deferred_list;
};
+ struct { /* Third tail page of compound page */
+ unsigned long _compound_pad_3; /* compound_head */
+ unsigned long _compound_pad_4;
+ struct list_head underutilized_thp_list;
+ };
struct { /* Page table pages */
unsigned long _pt_pad_1; /* compound_head */
pgtable_t pmd_huge_pte; /* protected by page->ptl */
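Because the new list_head occupies the third tail page, its ->next/->prev overlay the slots normally holding ->mapping and ->index of that tail page; the free_tail_pages_check() and __split_huge_page_tail() changes further down exist to tolerate that. A compile-time sketch of this layout assumption (not part of the patch) would be:

#include <linux/build_bug.h>
#include <linux/mm.h>

/* underutilized_thp_list.next aliases page->mapping in the third tail page */
static_assert(offsetof(struct page, underutilized_thp_list) ==
	      offsetof(struct page, mapping));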
@@ -71,6 +71,8 @@ static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;
+static struct list_lru huge_low_util_page_lru;
+
bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
bool smaps, bool in_pf, bool enforce_sysfs)
{
@@ -234,6 +236,53 @@ static struct shrinker huge_zero_page_shrinker = {
.seeks = DEFAULT_SEEKS,
};
+static enum lru_status low_util_free_page(struct list_head *item,
+ struct list_lru_one *lru,
+ spinlock_t *lru_lock,
+ void *cb_arg)
+{
+ struct folio *folio = lru_to_folio(item);
+ struct page *head = &folio->page;
+
+ if (get_page_unless_zero(head)) {
+ /* Inverse lock order from add_underutilized_thp() */
+ if (!trylock_page(head)) {
+ put_page(head);
+ return LRU_SKIP;
+ }
+ list_lru_isolate(lru, item);
+ spin_unlock_irq(lru_lock);
+ if (can_shrink_thp(folio))
+ split_huge_page(head);
+ spin_lock_irq(lru_lock);
+ unlock_page(head);
+ put_page(head);
+ }
+
+ return LRU_REMOVED_RETRY;
+}
+
+static unsigned long shrink_huge_low_util_page_count(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ return HPAGE_PMD_NR * list_lru_shrink_count(&huge_low_util_page_lru, sc);
+}
+
+static unsigned long shrink_huge_low_util_page_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ return HPAGE_PMD_NR * list_lru_shrink_walk_irq(&huge_low_util_page_lru,
+ sc, low_util_free_page, NULL);
+}
+
+static struct shrinker huge_low_util_page_shrinker = {
+ .count_objects = shrink_huge_low_util_page_count,
+ .scan_objects = shrink_huge_low_util_page_scan,
+ .seeks = DEFAULT_SEEKS,
+ .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
+ SHRINKER_NONSLAB,
+};
+
#ifdef CONFIG_SYSFS
static ssize_t enabled_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
@@ -485,6 +534,9 @@ static int __init hugepage_init(void)
if (err)
goto err_slab;
+ err = register_shrinker(&huge_low_util_page_shrinker, "thp-low-util");
+ if (err)
+ goto err_low_util_shrinker;
err = register_shrinker(&huge_zero_page_shrinker, "thp-zero");
if (err)
goto err_hzp_shrinker;
@@ -492,6 +544,9 @@ static int __init hugepage_init(void)
if (err)
goto err_split_shrinker;
+ err = list_lru_init_memcg(&huge_low_util_page_lru, &huge_low_util_page_shrinker);
+ if (err)
+ goto err_low_util_list_lru;
/*
* By default disable transparent hugepages on smaller systems,
* where the extra memory used could hurt more than TLB overhead
@@ -508,10 +563,14 @@ static int __init hugepage_init(void)
return 0;
err_khugepaged:
+ list_lru_destroy(&huge_low_util_page_lru);
+err_low_util_list_lru:
unregister_shrinker(&deferred_split_shrinker);
err_split_shrinker:
unregister_shrinker(&huge_zero_page_shrinker);
err_hzp_shrinker:
+ unregister_shrinker(&huge_low_util_page_shrinker);
+err_low_util_shrinker:
khugepaged_destroy();
err_slab:
hugepage_exit_sysfs(hugepage_kobj);
@@ -586,6 +645,7 @@ void prep_transhuge_page(struct page *page)
*/
INIT_LIST_HEAD(page_deferred_list(page));
+ INIT_LIST_HEAD(page_underutilized_thp_list(page));
set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}
@@ -2451,8 +2511,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
LRU_GEN_MASK | LRU_REFS_MASK));
/* ->mapping in first tail page is compound_mapcount */
- VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
- page_tail);
+ VM_BUG_ON_PAGE(tail > 3 && page_tail->mapping != TAIL_MAPPING, page_tail);
page_tail->mapping = head->mapping;
page_tail->index = head->index + tail;
page_tail->private = 0;
@@ -2660,6 +2719,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
struct folio *folio = page_folio(page);
struct deferred_split *ds_queue = get_deferred_split_queue(&folio->page);
XA_STATE(xas, &folio->mapping->i_pages, folio->index);
+ struct list_head *underutilized_thp_list = page_underutilized_thp_list(&folio->page);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
int extra_pins, ret;
@@ -2767,6 +2827,10 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
list_del(page_deferred_list(&folio->page));
}
spin_unlock(&ds_queue->split_queue_lock);
+ /* Frozen refs lock out additions, test can be lockless */
+ if (!list_empty(underutilized_thp_list))
+ list_lru_del_page(&huge_low_util_page_lru, &folio->page,
+ underutilized_thp_list);
if (mapping) {
int nr = folio_nr_pages(folio);
@@ -2809,6 +2873,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
void free_transhuge_page(struct page *page)
{
struct deferred_split *ds_queue = get_deferred_split_queue(page);
+ struct list_head *underutilized_thp_list = page_underutilized_thp_list(page);
unsigned long flags;
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
@@ -2817,6 +2882,13 @@ void free_transhuge_page(struct page *page)
list_del(page_deferred_list(page));
}
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+ /* A dead page cannot be re-added to the THP shrinker, test can be lockless */
+ if (!list_empty(underutilized_thp_list))
+ list_lru_del_page(&huge_low_util_page_lru, page, underutilized_thp_list);
+
+ if (PageLRU(page))
+ __folio_clear_lru_flags(page_folio(page));
+
free_compound_page(page);
}
@@ -2857,6 +2929,40 @@ void deferred_split_huge_page(struct page *page)
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
}
+void add_underutilized_thp(struct page *page)
+{
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+ VM_BUG_ON_PAGE(!PageAnon(page), page);
+
+ if (PageSwapCache(page))
+ return;
+
+	/*
+	 * Take a reference on the page to prevent it from being freed from
+	 * under us while we are adding the THP to the shrinker.
+	 */
+ if (!get_page_unless_zero(page))
+ return;
+
+ if (is_huge_zero_page(page))
+ goto out_put;
+
+ /* Stabilize page->memcg to allocate and add to the same list */
+ lock_page(page);
+
+#ifdef CONFIG_MEMCG_KMEM
+ if (memcg_list_lru_alloc(page_memcg(page), &huge_low_util_page_lru, GFP_KERNEL))
+ goto out_unlock;
+#endif
+
+ list_lru_add_page(&huge_low_util_page_lru, page, page_underutilized_thp_list(page));
+
+out_unlock:
+ unlock_page(page);
+out_put:
+ put_page(page);
+}
+
static unsigned long deferred_split_count(struct shrinker *shrink,
struct shrink_control *sc)
{
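add_underutilized_thp() is presumably called when a utilization scan finds a mostly unused THP; that caller is not part of this excerpt. A hypothetical sketch (every name except the patch's own helpers is made up for illustration):

/* Hypothetical caller: only the patch's APIs below are real */
static void maybe_track_underutilized(struct folio *folio)
{
	struct page *head = &folio->page;

	/* Only anon THPs below the top utilization bucket are worth tracking */
	if (!can_shrink_thp(folio))
		return;

	/* Skip if it is already on the shrinker's list */
	if (!list_empty(page_underutilized_thp_list(head)))
		return;

	add_underutilized_thp(head);
}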
@@ -140,6 +140,32 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item)
}
EXPORT_SYMBOL_GPL(list_lru_add);
+bool list_lru_add_page(struct list_lru *lru, struct page *page, struct list_head *item)
+{
+ int nid = page_to_nid(page);
+ struct list_lru_node *nlru = &lru->node[nid];
+ struct list_lru_one *l;
+ struct mem_cgroup *memcg;
+ unsigned long flags;
+
+ spin_lock_irqsave(&nlru->lock, flags);
+ if (list_empty(item)) {
+ memcg = page_memcg(page);
+ l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
+ list_add_tail(item, &l->list);
+ /* Set shrinker bit if the first element was added */
+ if (!l->nr_items++)
+ set_shrinker_bit(memcg, nid,
+ lru_shrinker_id(lru));
+ nlru->nr_items++;
+ spin_unlock_irqrestore(&nlru->lock, flags);
+ return true;
+ }
+ spin_unlock_irqrestore(&nlru->lock, flags);
+ return false;
+}
+EXPORT_SYMBOL_GPL(list_lru_add_page);
+
bool list_lru_del(struct list_lru *lru, struct list_head *item)
{
int nid = page_to_nid(virt_to_page(item));
@@ -160,6 +186,29 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
}
EXPORT_SYMBOL_GPL(list_lru_del);
+bool list_lru_del_page(struct list_lru *lru, struct page *page, struct list_head *item)
+{
+ int nid = page_to_nid(page);
+ struct list_lru_node *nlru = &lru->node[nid];
+ struct list_lru_one *l;
+ struct mem_cgroup *memcg;
+ unsigned long flags;
+
+ spin_lock_irqsave(&nlru->lock, flags);
+ if (!list_empty(item)) {
+ memcg = page_memcg(page);
+ l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
+ list_del_init(item);
+ l->nr_items--;
+ nlru->nr_items--;
+ spin_unlock_irqrestore(&nlru->lock, flags);
+ return true;
+ }
+ spin_unlock_irqrestore(&nlru->lock, flags);
+ return false;
+}
+EXPORT_SYMBOL_GPL(list_lru_del_page);
+
void list_lru_isolate(struct list_lru_one *list, struct list_head *item)
{
list_del_init(item);
@@ -1335,6 +1335,12 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
* deferred_list.next -- ignore value.
*/
break;
+ case 3:
+ /*
+ * the third tail page: ->mapping is
+ * underutilized_thp_list.next -- ignore value.
+ */
+ break;
default:
if (page->mapping != TAIL_MAPPING) {
bad_page(page, "corrupted mapping in tail page");
@@ -113,6 +113,19 @@ static int thp_number_utilized_pages(struct folio *folio)
return thp_nr_utilized_pages;
}
+bool can_shrink_thp(struct folio *folio)
+{
+ int bucket, num_utilized_pages;
+
+ if (!folio || !folio_test_anon(folio) || !folio_test_transhuge(folio))
+ return false;
+
+ num_utilized_pages = thp_number_utilized_pages(folio);
+ bucket = thp_utilization_bucket(num_utilized_pages);
+
+ return bucket < THP_UTIL_BUCKET_NR - 1;
+}
+
static void thp_scan_next_zone(void)
{
struct timespec64 current_time;
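can_shrink_thp() reports true only for THPs that fall below the top utilization bucket; thp_utilization_bucket() itself is outside this excerpt. A plausible sketch, assuming THP_UTIL_BUCKET_NR evenly sized buckets over 0..HPAGE_PMD_NR utilized subpages:

/* Sketch only: the real bucket helper is defined alongside
 * thp_number_utilized_pages() and may differ. */
static int thp_utilization_bucket(int num_utilized_pages)
{
	int bucket;

	if (num_utilized_pages < 0 || num_utilized_pages > HPAGE_PMD_NR)
		return -1;

	/* Map 0..HPAGE_PMD_NR utilized subpages onto THP_UTIL_BUCKET_NR buckets */
	bucket = num_utilized_pages * THP_UTIL_BUCKET_NR / HPAGE_PMD_NR;
	return min(bucket, THP_UTIL_BUCKET_NR - 1);
}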