[v4,5/5] zswap: shrinks zswap pool based on memory pressure

Message ID	20231024203302.1920362-6-nphamcs@gmail.com (mailing list archive)
State	New
Headers	show Return-Path: <linux-kselftest-owner@vger.kernel.org> From: Nhat Pham <nphamcs@gmail.com> To: akpm@linux-foundation.org Cc: hannes@cmpxchg.org, cerasuolodomenico@gmail.com, yosryahmed@google.com, sjenning@redhat.com, ddstreet@ieee.org, vitaly.wool@konsulko.com, mhocko@kernel.org, roman.gushchin@linux.dev, shakeelb@google.com, muchun.song@linux.dev, chrisl@kernel.org, linux-mm@kvack.org, kernel-team@meta.com, linux-kernel@vger.kernel.org, cgroups@vger.kernel.org, linux-doc@vger.kernel.org, linux-kselftest@vger.kernel.org, shuah@kernel.org Subject: [PATCH v4 5/5] zswap: shrinks zswap pool based on memory pressure Date: Tue, 24 Oct 2023 13:33:02 -0700 Message-Id: <20231024203302.1920362-6-nphamcs@gmail.com> In-Reply-To: <20231024203302.1920362-1-nphamcs@gmail.com> References: <20231024203302.1920362-1-nphamcs@gmail.com> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Precedence: bulk
Series	workload-specific and memory pressure-driven zswap writeback \| expand [v4,0/5] workload-specific and memory pressure-driven zswap writeback [v4,1/5] list_lru: allows explicit memcg and NUMA node selection [v4,2/5] zswap: make shrinking memcg-aware [v4,3/5] mm: memcg: add per-memcg zswap writeback stat [v4,4/5] selftests: cgroup: update per-memcg zswap writeback selftest [v4,5/5] zswap: shrinks zswap pool based on memory pressure

diff --git a/Documentation/admin-guide/mm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst index 45b98390e938..522ae22ccb84 100644 --- a/Documentation/admin-guide/mm/zswap.rst +++ b/Documentation/admin-guide/mm/zswap.rst @@ -153,6 +153,13 @@ attribute, e. g.:: Setting this parameter to 100 will disable the hysteresis. +When there is a sizable amount of cold memory residing in the zswap pool, it +can be advantageous to proactively write these cold pages to swap and reclaim +the memory for other use cases. By default, the zswap shrinker is disabled. +User can enable it as follows: + + echo Y > /sys/module/zswap/parameters/shrinker_enabled + A debugfs interface is provided for various statistic about pool size, number of pages stored, same-value filled pages and various counters for the reasons pages are rejected. diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 12f31633be05..633afdb96c40 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -22,6 +22,7 @@ #include <linux/mm_types.h> #include <linux/page-flags.h> #include <linux/local_lock.h> +#include <linux/zswap.h> #include <asm/page.h> /* Free memory management - zoned buddy allocator. */ @@ -637,6 +638,7 @@ struct lruvec { #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif + struct zswap_lruvec_state zswap_lruvec_state; }; /* Isolate for asynchronous migration */ diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 2a60ce39cfde..64d0c3644185 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -5,19 +5,39 @@ #include <linux/types.h> #include <linux/mm_types.h> +struct lruvec; + extern u64 zswap_pool_total_size; extern atomic_t zswap_stored_pages; #ifdef CONFIG_ZSWAP +struct zswap_lruvec_state { + /* + * Number of pages in zswap that should be protected from the shrinker. + * This number is an estimate of the following counts: + * + * a) Recent page faults. + * b) Recent insertion to the zswap LRU. This includes new zswap stores, + * as well as recent zswap LRU rotations. + * + * These pages are likely to be warm, and might incur IO if the are written + * to swap. + */ + atomic_long_t nr_zswap_protected; +}; + bool zswap_store(struct folio *folio); bool zswap_load(struct folio *folio); void zswap_invalidate(int type, pgoff_t offset); void zswap_swapon(int type); void zswap_swapoff(int type); - +void zswap_lruvec_state_init(struct lruvec *lruvec); +void zswap_lruvec_swapin(struct page *page); #else +struct zswap_lruvec_state {}; + static inline bool zswap_store(struct folio *folio) { return false; @@ -31,7 +51,8 @@ static inline bool zswap_load(struct folio *folio) static inline void zswap_invalidate(int type, pgoff_t offset) {} static inline void zswap_swapon(int type) {} static inline void zswap_swapoff(int type) {} - +static inline void zswap_lruvec_init(struct lruvec *lruvec) {} +static inline void zswap_lruvec_swapin(struct page *page) {} #endif #endif /* _LINUX_ZSWAP_H */ diff --git a/mm/mmzone.c b/mm/mmzone.c index b594d3f268fe..c01896eca736 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -78,6 +78,7 @@ void lruvec_init(struct lruvec *lruvec) memset(lruvec, 0, sizeof(struct lruvec)); spin_lock_init(&lruvec->lru_lock); + zswap_lruvec_state_init(lruvec); for_each_lru(lru) INIT_LIST_HEAD(&lruvec->lists[lru]); diff --git a/mm/swap_state.c b/mm/swap_state.c index 040639e1c77e..38ce9981b0be 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -686,6 +686,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, &page_allocated, false); if (unlikely(page_allocated)) swap_readpage(page, false, NULL); + zswap_lruvec_swapin(page); return page; } @@ -861,6 +862,7 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, &page_allocated, false); if (unlikely(page_allocated)) swap_readpage(page, false, NULL); + zswap_lruvec_swapin(page); return page; } diff --git a/mm/zswap.c b/mm/zswap.c index b87311e48de9..c40697f07ba3 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -146,6 +146,10 @@ module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644); /* Number of zpools in zswap_pool (empirically determined for scalability) */ #define ZSWAP_NR_ZPOOLS 32 +/* Enable/disable memory pressure-based shrinker. */ +static bool zswap_shrinker_enabled; +module_param_named(shrinker_enabled, zswap_shrinker_enabled, bool, 0644); + /********************************* * data structures **********************************/ @@ -175,6 +179,8 @@ struct zswap_pool { char tfm_name[CRYPTO_MAX_ALG_NAME]; struct list_lru list_lru; struct mem_cgroup *next_shrink; + struct shrinker *shrinker; + atomic_t nr_stored; }; /* @@ -273,17 +279,26 @@ static bool zswap_can_accept(void) DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); } +static u64 get_zswap_pool_size(struct zswap_pool *pool) +{ + u64 pool_size = 0; + int i; + + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) + pool_size += zpool_get_total_size(pool->zpools[i]); + + return pool_size; +} + static void zswap_update_total_size(void) { struct zswap_pool *pool; u64 total = 0; - int i; rcu_read_lock(); list_for_each_entry_rcu(pool, &zswap_pools, list) - for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) - total += zpool_get_total_size(pool->zpools[i]); + total += get_zswap_pool_size(pool); rcu_read_unlock(); @@ -321,6 +336,24 @@ static void zswap_entry_cache_free(struct zswap_entry *entry) kmem_cache_free(zswap_entry_cache, entry); } +/********************************* +* zswap lruvec functions +**********************************/ +void zswap_lruvec_state_init(struct lruvec *lruvec) +{ + atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0); +} + +void zswap_lruvec_swapin(struct page *page) +{ + struct lruvec *lruvec; + + if (page) { + lruvec = folio_lruvec(page_folio(page)); + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); + } +} + /********************************* * lru functions **********************************/ @@ -328,8 +361,24 @@ static bool zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) { struct mem_cgroup *memcg = get_mem_cgroup_from_entry(entry); int nid = entry_to_nid(entry); + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); bool added = list_lru_add(list_lru, &entry->lru, nid, memcg); + atomic_long_t *nr_zswap_protected = + &lruvec->zswap_lruvec_state.nr_zswap_protected; + unsigned long lru_size, old, new; + if (added) { + lru_size = list_lru_count_one(list_lru, nid, memcg); + old = atomic_long_inc_return(nr_zswap_protected); + + /* + * Decay to avoid overflow and adapt to changing workloads. + * This is based on LRU reclaim cost decaying heuristics. + */ + do { + new = old > lru_size / 4 ? old / 2 : old; + } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new)); + } mem_cgroup_put(memcg); return added; } @@ -430,6 +479,7 @@ static void zswap_free_entry(struct zswap_entry *entry) else { zswap_lru_del(&entry->pool->list_lru, entry); zpool_free(zswap_find_zpool(entry), entry->handle); + atomic_dec(&entry->pool->nr_stored); zswap_pool_put(entry->pool); } zswap_entry_cache_free(entry); @@ -471,6 +521,95 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, return entry; } +/********************************* +* shrinker functions +**********************************/ +static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, + spinlock_t *lock, void *arg); + +static unsigned long zswap_shrinker_scan(struct shrinker *shrinker, + struct shrink_control *sc) +{ + struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); + unsigned long shrink_ret, nr_protected, lru_size; + struct zswap_pool *pool = shrinker->private_data; + bool encountered_page_in_swapcache = false; + + nr_protected = + atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); + lru_size = list_lru_shrink_count(&pool->list_lru, sc); + + /* + * Abort if the shrinker is disabled or if we are shrinking into the + * protected region. + */ + if (!zswap_shrinker_enabled || nr_protected >= lru_size - sc->nr_to_scan) { + sc->nr_scanned = 0; + return SHRINK_STOP; + } + + shrink_ret = list_lru_shrink_walk(&pool->list_lru, sc, &shrink_memcg_cb, + &encountered_page_in_swapcache); + + if (encountered_page_in_swapcache) + return SHRINK_STOP; + + return shrink_ret ? shrink_ret : SHRINK_STOP; +} + +static unsigned long zswap_shrinker_count(struct shrinker *shrinker, + struct shrink_control *sc) +{ + struct zswap_pool *pool = shrinker->private_data; + struct mem_cgroup *memcg = sc->memcg; + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid)); + unsigned long nr_backing, nr_stored, nr_freeable, nr_protected; + +#ifdef CONFIG_MEMCG_KMEM + cgroup_rstat_flush(memcg->css.cgroup); + nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT; + nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); +#else + /* use pool stats instead of memcg stats */ + nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT; + nr_stored = atomic_read(&pool->nr_stored); +#endif + + if (!zswap_shrinker_enabled || !nr_stored) + return 0; + + nr_protected = + atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected); + nr_freeable = list_lru_shrink_count(&pool->list_lru, sc); + /* + * Subtract the lru size by an estimate of the number of pages + * that should be protected. + */ + nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0; + + /* + * Scale the number of freeable pages by the memory saving factor. + * This ensures that the better zswap compresses memory, the fewer + * pages we will evict to swap (as it will otherwise incur IO for + * relatively small memory saving). + */ + return mult_frac(nr_freeable, nr_backing, nr_stored); +} + +static void zswap_alloc_shrinker(struct zswap_pool *pool) +{ + pool->shrinker = + shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap"); + if (!pool->shrinker) + return; + + pool->shrinker->private_data = pool; + pool->shrinker->scan_objects = zswap_shrinker_scan; + pool->shrinker->count_objects = zswap_shrinker_count; + pool->shrinker->batch = 0; + pool->shrinker->seeks = DEFAULT_SEEKS; +} + /********************************* * per-cpu code **********************************/ @@ -666,8 +805,10 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o spinlock_t *lock, void *arg) { struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); + bool *encountered_page_in_swapcache = (bool *)arg; struct mem_cgroup *memcg; struct zswap_tree *tree; + struct lruvec *lruvec; pgoff_t swpoffset; enum lru_status ret = LRU_REMOVED_RETRY; int writeback_result; @@ -705,8 +846,22 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o /* we cannot use zswap_lru_add here, because it increments node's lru count */ list_lru_putback(&entry->pool->list_lru, item, entry_to_nid(entry), memcg); spin_unlock(lock); - mem_cgroup_put(memcg); ret = LRU_RETRY; + + /* + * Encountering a page already in swap cache is a sign that we are shrinking + * into the warmer region. We should terminate shrinking (if we're in the dynamic + * shrinker context). + */ + if (writeback_result == -EEXIST && encountered_page_in_swapcache) { + ret = LRU_SKIP; + *encountered_page_in_swapcache = true; + } + lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(entry_to_nid(entry))); + /* Increment the protection area to account for the LRU rotation. */ + atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected); + + mem_cgroup_put(memcg); goto put_unlock; } zswap_written_back_pages++; @@ -826,6 +981,11 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) &pool->node); if (ret) goto error; + + zswap_alloc_shrinker(pool); + if (!pool->shrinker) + goto error; + pr_debug("using %s compressor\n", pool->tfm_name); /* being the current pool takes 1 ref; this func expects the @@ -833,13 +993,19 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) */ kref_init(&pool->kref); INIT_LIST_HEAD(&pool->list); - list_lru_init_memcg(&pool->list_lru, NULL); + if (list_lru_init_memcg(&pool->list_lru, pool->shrinker)) + goto lru_fail; + shrinker_register(pool->shrinker); INIT_WORK(&pool->shrink_work, shrink_worker); + atomic_set(&pool->nr_stored, 0); zswap_pool_debug("created", pool); return pool; +lru_fail: + list_lru_destroy(&pool->list_lru); + shrinker_free(pool->shrinker); error: if (pool->acomp_ctx) free_percpu(pool->acomp_ctx); @@ -897,6 +1063,7 @@ static void zswap_pool_destroy(struct zswap_pool *pool) zswap_pool_debug("destroying", pool); + shrinker_free(pool->shrinker); cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); free_percpu(pool->acomp_ctx); list_lru_destroy(&pool->list_lru); @@ -1443,6 +1610,7 @@ bool zswap_store(struct folio *folio) if (entry->length) { INIT_LIST_HEAD(&entry->lru); zswap_lru_add(&entry->pool->list_lru, entry); + atomic_inc(&entry->pool->nr_stored); } spin_unlock(&tree->lock);

[v4,5/5] zswap: shrinks zswap pool based on memory pressure

Commit Message

Patch