diff mbox series

[1/2] mm/zswap: global lru and shrinker shared by all zswap_pools

Message ID 20240210-zswap-global-lru-v1-1-853473d7b0da@bytedance.com (mailing list archive)
State New
Headers show
Series mm/zswap: optimize for dynamic zswap_pools | expand

Commit Message

Chengming Zhou Feb. 11, 2024, 1:57 p.m. UTC
Dynamic zswap_pool creation may create/reuse to have multiple
zswap_pools in a list, only the first will be current used.

Each zswap_pool has its own lru and shrinker, which is not
necessary and has its problem:

1. When memory has pressure, all shrinker of zswap_pools will
   try to shrink its own lru, there is no order between them.

2. When zswap limit hit, only the last zswap_pool's shrink_work
   will try to shrink its lru, which is inefficient.

Anyway, having a global lru and shrinker shared by all zswap_pools
is better and efficient.

Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
---
 mm/zswap.c | 153 ++++++++++++++++++++++---------------------------------------
 1 file changed, 55 insertions(+), 98 deletions(-)

Comments

kernel test robot Feb. 11, 2024, 7:01 p.m. UTC | #1
Hi Chengming,

kernel test robot noticed the following build errors:

[auto build test ERROR on 191d97734e41a5c9f90a2f6636fdd335ae1d435d]

url:    https://github.com/intel-lab-lkp/linux/commits/Chengming-Zhou/mm-zswap-global-lru-and-shrinker-shared-by-all-zswap_pools/20240211-220028
base:   191d97734e41a5c9f90a2f6636fdd335ae1d435d
patch link:    https://lore.kernel.org/r/20240210-zswap-global-lru-v1-1-853473d7b0da%40bytedance.com
patch subject: [PATCH 1/2] mm/zswap: global lru and shrinker shared by all zswap_pools
config: x86_64-randconfig-013-20240211 (https://download.01.org/0day-ci/archive/20240212/202402120226.TK7G37U9-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240212/202402120226.TK7G37U9-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202402120226.TK7G37U9-lkp@intel.com/

All errors (new ones prefixed by >>):

   mm/zswap.c: In function 'zswap_shrinker_count':
>> mm/zswap.c:1300:42: error: 'pool' undeclared (first use in this function); did you mean 'zpool'?
    1300 |         nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT;
         |                                          ^~~~
         |                                          zpool
   mm/zswap.c:1300:42: note: each undeclared identifier is reported only once for each function it appears in

Kconfig warnings: (for reference only)
   WARNING: unmet direct dependencies detected for DRM_I915_DEBUG_GEM
   Depends on [n]: HAS_IOMEM [=y] && DRM_I915 [=y] && EXPERT [=y] && DRM_I915_WERROR [=n]
   Selected by [y]:
   - DRM_I915_DEBUG [=y] && HAS_IOMEM [=y] && DRM_I915 [=y] && EXPERT [=y] && !COMPILE_TEST [=n]


vim +1300 mm/zswap.c

b5ba474f3f5187 Nhat Pham      2023-11-30  1283  
b5ba474f3f5187 Nhat Pham      2023-11-30  1284  static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
b5ba474f3f5187 Nhat Pham      2023-11-30  1285  		struct shrink_control *sc)
b5ba474f3f5187 Nhat Pham      2023-11-30  1286  {
b5ba474f3f5187 Nhat Pham      2023-11-30  1287  	struct mem_cgroup *memcg = sc->memcg;
b5ba474f3f5187 Nhat Pham      2023-11-30  1288  	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid));
b5ba474f3f5187 Nhat Pham      2023-11-30  1289  	unsigned long nr_backing, nr_stored, nr_freeable, nr_protected;
b5ba474f3f5187 Nhat Pham      2023-11-30  1290  
501a06fe8e4c18 Nhat Pham      2023-12-07  1291  	if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg))
b5ba474f3f5187 Nhat Pham      2023-11-30  1292  		return 0;
b5ba474f3f5187 Nhat Pham      2023-11-30  1293  
b5ba474f3f5187 Nhat Pham      2023-11-30  1294  #ifdef CONFIG_MEMCG_KMEM
7d7ef0a4686abe Yosry Ahmed    2023-11-29  1295  	mem_cgroup_flush_stats(memcg);
b5ba474f3f5187 Nhat Pham      2023-11-30  1296  	nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT;
b5ba474f3f5187 Nhat Pham      2023-11-30  1297  	nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
b5ba474f3f5187 Nhat Pham      2023-11-30  1298  #else
b5ba474f3f5187 Nhat Pham      2023-11-30  1299  	/* use pool stats instead of memcg stats */
b5ba474f3f5187 Nhat Pham      2023-11-30 @1300  	nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT;
bf414d6ae81ba2 Chengming Zhou 2024-02-11  1301  	nr_stored = atomic_read(&zswap.nr_stored);
b5ba474f3f5187 Nhat Pham      2023-11-30  1302  #endif
b5ba474f3f5187 Nhat Pham      2023-11-30  1303  
b5ba474f3f5187 Nhat Pham      2023-11-30  1304  	if (!nr_stored)
b5ba474f3f5187 Nhat Pham      2023-11-30  1305  		return 0;
b5ba474f3f5187 Nhat Pham      2023-11-30  1306  
b5ba474f3f5187 Nhat Pham      2023-11-30  1307  	nr_protected =
b5ba474f3f5187 Nhat Pham      2023-11-30  1308  		atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
bf414d6ae81ba2 Chengming Zhou 2024-02-11  1309  	nr_freeable = list_lru_shrink_count(&zswap.list_lru, sc);
b5ba474f3f5187 Nhat Pham      2023-11-30  1310  	/*
b5ba474f3f5187 Nhat Pham      2023-11-30  1311  	 * Subtract the lru size by an estimate of the number of pages
b5ba474f3f5187 Nhat Pham      2023-11-30  1312  	 * that should be protected.
b5ba474f3f5187 Nhat Pham      2023-11-30  1313  	 */
b5ba474f3f5187 Nhat Pham      2023-11-30  1314  	nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0;
b5ba474f3f5187 Nhat Pham      2023-11-30  1315  
b5ba474f3f5187 Nhat Pham      2023-11-30  1316  	/*
b5ba474f3f5187 Nhat Pham      2023-11-30  1317  	 * Scale the number of freeable pages by the memory saving factor.
b5ba474f3f5187 Nhat Pham      2023-11-30  1318  	 * This ensures that the better zswap compresses memory, the fewer
b5ba474f3f5187 Nhat Pham      2023-11-30  1319  	 * pages we will evict to swap (as it will otherwise incur IO for
b5ba474f3f5187 Nhat Pham      2023-11-30  1320  	 * relatively small memory saving).
b5ba474f3f5187 Nhat Pham      2023-11-30  1321  	 */
b5ba474f3f5187 Nhat Pham      2023-11-30  1322  	return mult_frac(nr_freeable, nr_backing, nr_stored);
b5ba474f3f5187 Nhat Pham      2023-11-30  1323  }
b5ba474f3f5187 Nhat Pham      2023-11-30  1324
Nhat Pham Feb. 11, 2024, 9:04 p.m. UTC | #2
On Sun, Feb 11, 2024 at 5:57 AM Chengming Zhou
<zhouchengming@bytedance.com> wrote:
>
> Dynamic zswap_pool creation may create/reuse to have multiple
> zswap_pools in a list, only the first will be current used.
>
> Each zswap_pool has its own lru and shrinker, which is not
> necessary and has its problem:
>
> 1. When memory has pressure, all shrinker of zswap_pools will
>    try to shrink its own lru, there is no order between them.
>
> 2. When zswap limit hit, only the last zswap_pool's shrink_work
>    will try to shrink its lru, which is inefficient.
>
> Anyway, having a global lru and shrinker shared by all zswap_pools
> is better and efficient.
>
> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>

I'll do a careful review later, but IMO this is a good idea :)

Chris pointed out when he reviewed the zswap shrinker patch series
that the reclaim algorithm has to decide which pool to reclaim from,
and I have always thought that it was a bit weird that we have to do
it at all. We should reclaim stored objects by access ordering,
irregardless of which pool it belongs to. Having a shared LRU and
other associated reclaim structures is sound, and saves a bit of space
too while we're at it.

> ---
>  mm/zswap.c | 153 ++++++++++++++++++++++---------------------------------------
>  1 file changed, 55 insertions(+), 98 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 62fe307521c9..7668db8c10e3 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -176,14 +176,17 @@ struct zswap_pool {
>         struct kref kref;
>         struct list_head list;
>         struct work_struct release_work;
> -       struct work_struct shrink_work;
>         struct hlist_node node;
>         char tfm_name[CRYPTO_MAX_ALG_NAME];
> +};
> +
> +struct {
>         struct list_lru list_lru;
> -       struct mem_cgroup *next_shrink;
> -       struct shrinker *shrinker;
>         atomic_t nr_stored;
> -};
> +       struct shrinker *shrinker;
> +       struct work_struct shrink_work;
> +       struct mem_cgroup *next_shrink;
> +} zswap;
>
>  /*
>   * struct zswap_entry
> @@ -301,9 +304,6 @@ static void zswap_update_total_size(void)
>  * pool functions
>  **********************************/
>
> -static void zswap_alloc_shrinker(struct zswap_pool *pool);
> -static void shrink_worker(struct work_struct *w);
> -
>  static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>  {
>         int i;
> @@ -353,30 +353,16 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>         if (ret)
>                 goto error;
>
> -       zswap_alloc_shrinker(pool);
> -       if (!pool->shrinker)
> -               goto error;
> -
> -       pr_debug("using %s compressor\n", pool->tfm_name);
> -
>         /* being the current pool takes 1 ref; this func expects the
>          * caller to always add the new pool as the current pool
>          */
>         kref_init(&pool->kref);
>         INIT_LIST_HEAD(&pool->list);
> -       if (list_lru_init_memcg(&pool->list_lru, pool->shrinker))
> -               goto lru_fail;
> -       shrinker_register(pool->shrinker);
> -       INIT_WORK(&pool->shrink_work, shrink_worker);
> -       atomic_set(&pool->nr_stored, 0);
>
>         zswap_pool_debug("created", pool);
>
>         return pool;
>
> -lru_fail:
> -       list_lru_destroy(&pool->list_lru);
> -       shrinker_free(pool->shrinker);
>  error:
>         if (pool->acomp_ctx)
>                 free_percpu(pool->acomp_ctx);
> @@ -434,15 +420,8 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
>
>         zswap_pool_debug("destroying", pool);
>
> -       shrinker_free(pool->shrinker);
>         cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
>         free_percpu(pool->acomp_ctx);
> -       list_lru_destroy(&pool->list_lru);
> -
> -       spin_lock(&zswap_pools_lock);
> -       mem_cgroup_iter_break(NULL, pool->next_shrink);
> -       pool->next_shrink = NULL;
> -       spin_unlock(&zswap_pools_lock);
>
>         for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
>                 zpool_destroy_pool(pool->zpools[i]);
> @@ -529,24 +508,6 @@ static struct zswap_pool *zswap_pool_current_get(void)
>         return pool;
>  }
>
> -static struct zswap_pool *zswap_pool_last_get(void)
> -{
> -       struct zswap_pool *pool, *last = NULL;
> -
> -       rcu_read_lock();
> -
> -       list_for_each_entry_rcu(pool, &zswap_pools, list)
> -               last = pool;
> -       WARN_ONCE(!last && zswap_has_pool,
> -                 "%s: no page storage pool!\n", __func__);
> -       if (!zswap_pool_get(last))
> -               last = NULL;
> -
> -       rcu_read_unlock();
> -
> -       return last;
> -}
> -
>  /* type and compressor must be null-terminated */
>  static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
>  {
> @@ -816,14 +777,10 @@ void zswap_folio_swapin(struct folio *folio)
>
>  void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
>  {
> -       struct zswap_pool *pool;
> -
> -       /* lock out zswap pools list modification */
> +       /* lock out zswap shrinker walking memcg tree */
>         spin_lock(&zswap_pools_lock);
> -       list_for_each_entry(pool, &zswap_pools, list) {
> -               if (pool->next_shrink == memcg)
> -                       pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
> -       }
> +       if (zswap.next_shrink == memcg)
> +               zswap.next_shrink = mem_cgroup_iter(NULL, zswap.next_shrink, NULL);
>         spin_unlock(&zswap_pools_lock);
>  }
>
> @@ -923,9 +880,9 @@ static void zswap_entry_free(struct zswap_entry *entry)
>         if (!entry->length)
>                 atomic_dec(&zswap_same_filled_pages);
>         else {
> -               zswap_lru_del(&entry->pool->list_lru, entry);
> +               zswap_lru_del(&zswap.list_lru, entry);
>                 zpool_free(zswap_find_zpool(entry), entry->handle);
> -               atomic_dec(&entry->pool->nr_stored);
> +               atomic_dec(&zswap.nr_stored);
>                 zswap_pool_put(entry->pool);
>         }
>         if (entry->objcg) {
> @@ -1288,7 +1245,6 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
>  {
>         struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
>         unsigned long shrink_ret, nr_protected, lru_size;
> -       struct zswap_pool *pool = shrinker->private_data;
>         bool encountered_page_in_swapcache = false;
>
>         if (!zswap_shrinker_enabled ||
> @@ -1299,7 +1255,7 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
>
>         nr_protected =
>                 atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
> -       lru_size = list_lru_shrink_count(&pool->list_lru, sc);
> +       lru_size = list_lru_shrink_count(&zswap.list_lru, sc);
>
>         /*
>          * Abort if we are shrinking into the protected region.
> @@ -1316,7 +1272,7 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
>                 return SHRINK_STOP;
>         }
>
> -       shrink_ret = list_lru_shrink_walk(&pool->list_lru, sc, &shrink_memcg_cb,
> +       shrink_ret = list_lru_shrink_walk(&zswap.list_lru, sc, &shrink_memcg_cb,
>                 &encountered_page_in_swapcache);
>
>         if (encountered_page_in_swapcache)
> @@ -1328,7 +1284,6 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
>  static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
>                 struct shrink_control *sc)
>  {
> -       struct zswap_pool *pool = shrinker->private_data;
>         struct mem_cgroup *memcg = sc->memcg;
>         struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid));
>         unsigned long nr_backing, nr_stored, nr_freeable, nr_protected;
> @@ -1343,7 +1298,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
>  #else
>         /* use pool stats instead of memcg stats */
>         nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT;
> -       nr_stored = atomic_read(&pool->nr_stored);
> +       nr_stored = atomic_read(&zswap.nr_stored);
>  #endif
>
>         if (!nr_stored)
> @@ -1351,7 +1306,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
>
>         nr_protected =
>                 atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
> -       nr_freeable = list_lru_shrink_count(&pool->list_lru, sc);
> +       nr_freeable = list_lru_shrink_count(&zswap.list_lru, sc);
>         /*
>          * Subtract the lru size by an estimate of the number of pages
>          * that should be protected.
> @@ -1367,23 +1322,24 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
>         return mult_frac(nr_freeable, nr_backing, nr_stored);
>  }
>
> -static void zswap_alloc_shrinker(struct zswap_pool *pool)
> +static struct shrinker *zswap_alloc_shrinker(void)
>  {
> -       pool->shrinker =
> +       struct shrinker *shrinker;
> +
> +       shrinker =
>                 shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap");
> -       if (!pool->shrinker)
> -               return;
> +       if (!shrinker)
> +               return NULL;
>
> -       pool->shrinker->private_data = pool;
> -       pool->shrinker->scan_objects = zswap_shrinker_scan;
> -       pool->shrinker->count_objects = zswap_shrinker_count;
> -       pool->shrinker->batch = 0;
> -       pool->shrinker->seeks = DEFAULT_SEEKS;
> +       shrinker->scan_objects = zswap_shrinker_scan;
> +       shrinker->count_objects = zswap_shrinker_count;
> +       shrinker->batch = 0;
> +       shrinker->seeks = DEFAULT_SEEKS;
> +       return shrinker;
>  }
>
>  static int shrink_memcg(struct mem_cgroup *memcg)
>  {
> -       struct zswap_pool *pool;
>         int nid, shrunk = 0;
>
>         if (!mem_cgroup_zswap_writeback_enabled(memcg))
> @@ -1396,32 +1352,25 @@ static int shrink_memcg(struct mem_cgroup *memcg)
>         if (memcg && !mem_cgroup_online(memcg))
>                 return -ENOENT;
>
> -       pool = zswap_pool_current_get();
> -       if (!pool)
> -               return -EINVAL;
> -
>         for_each_node_state(nid, N_NORMAL_MEMORY) {
>                 unsigned long nr_to_walk = 1;
>
> -               shrunk += list_lru_walk_one(&pool->list_lru, nid, memcg,
> +               shrunk += list_lru_walk_one(&zswap.list_lru, nid, memcg,
>                                             &shrink_memcg_cb, NULL, &nr_to_walk);
>         }
> -       zswap_pool_put(pool);
>         return shrunk ? 0 : -EAGAIN;
>  }
>
>  static void shrink_worker(struct work_struct *w)
>  {
> -       struct zswap_pool *pool = container_of(w, typeof(*pool),
> -                                               shrink_work);
>         struct mem_cgroup *memcg;
>         int ret, failures = 0;
>
>         /* global reclaim will select cgroup in a round-robin fashion. */
>         do {
>                 spin_lock(&zswap_pools_lock);
> -               pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
> -               memcg = pool->next_shrink;
> +               zswap.next_shrink = mem_cgroup_iter(NULL, zswap.next_shrink, NULL);
> +               memcg = zswap.next_shrink;
>
>                 /*
>                  * We need to retry if we have gone through a full round trip, or if we
> @@ -1445,7 +1394,7 @@ static void shrink_worker(struct work_struct *w)
>                 if (!mem_cgroup_tryget_online(memcg)) {
>                         /* drop the reference from mem_cgroup_iter() */
>                         mem_cgroup_iter_break(NULL, memcg);
> -                       pool->next_shrink = NULL;
> +                       zswap.next_shrink = NULL;
>                         spin_unlock(&zswap_pools_lock);
>
>                         if (++failures == MAX_RECLAIM_RETRIES)
> @@ -1467,7 +1416,6 @@ static void shrink_worker(struct work_struct *w)
>  resched:
>                 cond_resched();
>         } while (!zswap_can_accept());
> -       zswap_pool_put(pool);
>  }
>
>  static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
> @@ -1508,7 +1456,6 @@ bool zswap_store(struct folio *folio)
>         struct zswap_entry *entry, *dupentry;
>         struct obj_cgroup *objcg = NULL;
>         struct mem_cgroup *memcg = NULL;
> -       struct zswap_pool *shrink_pool;
>
>         VM_WARN_ON_ONCE(!folio_test_locked(folio));
>         VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
> @@ -1576,7 +1523,7 @@ bool zswap_store(struct folio *folio)
>
>         if (objcg) {
>                 memcg = get_mem_cgroup_from_objcg(objcg);
> -               if (memcg_list_lru_alloc(memcg, &entry->pool->list_lru, GFP_KERNEL)) {
> +               if (memcg_list_lru_alloc(memcg, &zswap.list_lru, GFP_KERNEL)) {
>                         mem_cgroup_put(memcg);
>                         goto put_pool;
>                 }
> @@ -1607,8 +1554,8 @@ bool zswap_store(struct folio *folio)
>         }
>         if (entry->length) {
>                 INIT_LIST_HEAD(&entry->lru);
> -               zswap_lru_add(&entry->pool->list_lru, entry);
> -               atomic_inc(&entry->pool->nr_stored);
> +               zswap_lru_add(&zswap.list_lru, entry);
> +               atomic_inc(&zswap.nr_stored);
>         }
>         spin_unlock(&tree->lock);
>
> @@ -1640,9 +1587,7 @@ bool zswap_store(struct folio *folio)
>         return false;
>
>  shrink:
> -       shrink_pool = zswap_pool_last_get();
> -       if (shrink_pool && !queue_work(shrink_wq, &shrink_pool->shrink_work))
> -               zswap_pool_put(shrink_pool);
> +       queue_work(shrink_wq, &zswap.shrink_work);
>         goto reject;
>  }
>
> @@ -1804,6 +1749,21 @@ static int zswap_setup(void)
>         if (ret)
>                 goto hp_fail;
>
> +       shrink_wq = alloc_workqueue("zswap-shrink",
> +                       WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
> +       if (!shrink_wq)
> +               goto hp_fail;
> +
> +       zswap.shrinker = zswap_alloc_shrinker();
> +       if (!zswap.shrinker)
> +               goto shrinker_fail;
> +       if (list_lru_init_memcg(&zswap.list_lru, zswap.shrinker))
> +               goto lru_fail;
> +       shrinker_register(zswap.shrinker);
> +
> +       INIT_WORK(&zswap.shrink_work, shrink_worker);
> +       atomic_set(&zswap.nr_stored, 0);
> +
>         pool = __zswap_pool_create_fallback();
>         if (pool) {
>                 pr_info("loaded using pool %s/%s\n", pool->tfm_name,
> @@ -1815,19 +1775,16 @@ static int zswap_setup(void)
>                 zswap_enabled = false;
>         }
>
> -       shrink_wq = alloc_workqueue("zswap-shrink",
> -                       WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
> -       if (!shrink_wq)
> -               goto fallback_fail;
> -
>         if (zswap_debugfs_init())
>                 pr_warn("debugfs initialization failed\n");
>         zswap_init_state = ZSWAP_INIT_SUCCEED;
>         return 0;
>
> -fallback_fail:
> -       if (pool)
> -               zswap_pool_destroy(pool);
> +lru_fail:
> +       list_lru_destroy(&zswap.list_lru);
> +       shrinker_free(zswap.shrinker);
> +shrinker_fail:
> +       destroy_workqueue(shrink_wq);
>  hp_fail:
>         kmem_cache_destroy(zswap_entry_cache);
>  cache_fail:
>
> --
> b4 0.10.1
kernel test robot Feb. 11, 2024, 10:05 p.m. UTC | #3
Hi Chengming,

kernel test robot noticed the following build errors:

[auto build test ERROR on 191d97734e41a5c9f90a2f6636fdd335ae1d435d]

url:    https://github.com/intel-lab-lkp/linux/commits/Chengming-Zhou/mm-zswap-global-lru-and-shrinker-shared-by-all-zswap_pools/20240211-220028
base:   191d97734e41a5c9f90a2f6636fdd335ae1d435d
patch link:    https://lore.kernel.org/r/20240210-zswap-global-lru-v1-1-853473d7b0da%40bytedance.com
patch subject: [PATCH 1/2] mm/zswap: global lru and shrinker shared by all zswap_pools
config: i386-randconfig-054-20240212 (https://download.01.org/0day-ci/archive/20240212/202402120503.HRNkoWyq-lkp@intel.com/config)
compiler: clang version 17.0.6 (https://github.com/llvm/llvm-project 6009708b4367171ccdbf4b5905cb6a803753fe18)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240212/202402120503.HRNkoWyq-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202402120503.HRNkoWyq-lkp@intel.com/

All errors (new ones prefixed by >>):

>> mm/zswap.c:1300:35: error: use of undeclared identifier 'pool'
    1300 |         nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT;
         |                                          ^
   1 error generated.


vim +/pool +1300 mm/zswap.c

b5ba474f3f5187 Nhat Pham      2023-11-30  1283  
b5ba474f3f5187 Nhat Pham      2023-11-30  1284  static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
b5ba474f3f5187 Nhat Pham      2023-11-30  1285  		struct shrink_control *sc)
b5ba474f3f5187 Nhat Pham      2023-11-30  1286  {
b5ba474f3f5187 Nhat Pham      2023-11-30  1287  	struct mem_cgroup *memcg = sc->memcg;
b5ba474f3f5187 Nhat Pham      2023-11-30  1288  	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid));
b5ba474f3f5187 Nhat Pham      2023-11-30  1289  	unsigned long nr_backing, nr_stored, nr_freeable, nr_protected;
b5ba474f3f5187 Nhat Pham      2023-11-30  1290  
501a06fe8e4c18 Nhat Pham      2023-12-07  1291  	if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg))
b5ba474f3f5187 Nhat Pham      2023-11-30  1292  		return 0;
b5ba474f3f5187 Nhat Pham      2023-11-30  1293  
b5ba474f3f5187 Nhat Pham      2023-11-30  1294  #ifdef CONFIG_MEMCG_KMEM
7d7ef0a4686abe Yosry Ahmed    2023-11-29  1295  	mem_cgroup_flush_stats(memcg);
b5ba474f3f5187 Nhat Pham      2023-11-30  1296  	nr_backing = memcg_page_state(memcg, MEMCG_ZSWAP_B) >> PAGE_SHIFT;
b5ba474f3f5187 Nhat Pham      2023-11-30  1297  	nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
b5ba474f3f5187 Nhat Pham      2023-11-30  1298  #else
b5ba474f3f5187 Nhat Pham      2023-11-30  1299  	/* use pool stats instead of memcg stats */
b5ba474f3f5187 Nhat Pham      2023-11-30 @1300  	nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT;
bf414d6ae81ba2 Chengming Zhou 2024-02-11  1301  	nr_stored = atomic_read(&zswap.nr_stored);
b5ba474f3f5187 Nhat Pham      2023-11-30  1302  #endif
b5ba474f3f5187 Nhat Pham      2023-11-30  1303  
b5ba474f3f5187 Nhat Pham      2023-11-30  1304  	if (!nr_stored)
b5ba474f3f5187 Nhat Pham      2023-11-30  1305  		return 0;
b5ba474f3f5187 Nhat Pham      2023-11-30  1306  
b5ba474f3f5187 Nhat Pham      2023-11-30  1307  	nr_protected =
b5ba474f3f5187 Nhat Pham      2023-11-30  1308  		atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
bf414d6ae81ba2 Chengming Zhou 2024-02-11  1309  	nr_freeable = list_lru_shrink_count(&zswap.list_lru, sc);
b5ba474f3f5187 Nhat Pham      2023-11-30  1310  	/*
b5ba474f3f5187 Nhat Pham      2023-11-30  1311  	 * Subtract the lru size by an estimate of the number of pages
b5ba474f3f5187 Nhat Pham      2023-11-30  1312  	 * that should be protected.
b5ba474f3f5187 Nhat Pham      2023-11-30  1313  	 */
b5ba474f3f5187 Nhat Pham      2023-11-30  1314  	nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0;
b5ba474f3f5187 Nhat Pham      2023-11-30  1315  
b5ba474f3f5187 Nhat Pham      2023-11-30  1316  	/*
b5ba474f3f5187 Nhat Pham      2023-11-30  1317  	 * Scale the number of freeable pages by the memory saving factor.
b5ba474f3f5187 Nhat Pham      2023-11-30  1318  	 * This ensures that the better zswap compresses memory, the fewer
b5ba474f3f5187 Nhat Pham      2023-11-30  1319  	 * pages we will evict to swap (as it will otherwise incur IO for
b5ba474f3f5187 Nhat Pham      2023-11-30  1320  	 * relatively small memory saving).
b5ba474f3f5187 Nhat Pham      2023-11-30  1321  	 */
b5ba474f3f5187 Nhat Pham      2023-11-30  1322  	return mult_frac(nr_freeable, nr_backing, nr_stored);
b5ba474f3f5187 Nhat Pham      2023-11-30  1323  }
b5ba474f3f5187 Nhat Pham      2023-11-30  1324
Chengming Zhou Feb. 12, 2024, 1:17 p.m. UTC | #4
On 2024/2/12 03:01, kernel test robot wrote:
> Hi Chengming,
> 
> kernel test robot noticed the following build errors:
> 
> [auto build test ERROR on 191d97734e41a5c9f90a2f6636fdd335ae1d435d]
> 
> url:    https://github.com/intel-lab-lkp/linux/commits/Chengming-Zhou/mm-zswap-global-lru-and-shrinker-shared-by-all-zswap_pools/20240211-220028
> base:   191d97734e41a5c9f90a2f6636fdd335ae1d435d
> patch link:    https://lore.kernel.org/r/20240210-zswap-global-lru-v1-1-853473d7b0da%40bytedance.com
> patch subject: [PATCH 1/2] mm/zswap: global lru and shrinker shared by all zswap_pools
> config: x86_64-randconfig-013-20240211 (https://download.01.org/0day-ci/archive/20240212/202402120226.TK7G37U9-lkp@intel.com/config)
> compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
> reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240212/202402120226.TK7G37U9-lkp@intel.com/reproduce)
> 
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202402120226.TK7G37U9-lkp@intel.com/
> 
> All errors (new ones prefixed by >>):
> 
>    mm/zswap.c: In function 'zswap_shrinker_count':
>>> mm/zswap.c:1300:42: error: 'pool' undeclared (first use in this function); did you mean 'zpool'?
>     1300 |         nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT;

Forgot to test with !CONFIG_MEMCG_KMEM, need to change to zswap_pool_total_size here.

Thanks.
Chengming Zhou Feb. 12, 2024, 1:20 p.m. UTC | #5
On 2024/2/12 05:04, Nhat Pham wrote:
> On Sun, Feb 11, 2024 at 5:57 AM Chengming Zhou
> <zhouchengming@bytedance.com> wrote:
>>
>> Dynamic zswap_pool creation may create/reuse to have multiple
>> zswap_pools in a list, only the first will be current used.
>>
>> Each zswap_pool has its own lru and shrinker, which is not
>> necessary and has its problem:
>>
>> 1. When memory has pressure, all shrinker of zswap_pools will
>>    try to shrink its own lru, there is no order between them.
>>
>> 2. When zswap limit hit, only the last zswap_pool's shrink_work
>>    will try to shrink its lru, which is inefficient.
>>
>> Anyway, having a global lru and shrinker shared by all zswap_pools
>> is better and efficient.
>>
>> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
> 
> I'll do a careful review later, but IMO this is a good idea :)

Ok, thanks, take your time. :)

> 
> Chris pointed out when he reviewed the zswap shrinker patch series
> that the reclaim algorithm has to decide which pool to reclaim from,
> and I have always thought that it was a bit weird that we have to do
> it at all. We should reclaim stored objects by access ordering,
> irregardless of which pool it belongs to. Having a shared LRU and
> other associated reclaim structures is sound, and saves a bit of space
> too while we're at it.

Right, agree!
Yosry Ahmed Feb. 13, 2024, 12:57 p.m. UTC | #6
On Sun, Feb 11, 2024 at 01:57:04PM +0000, Chengming Zhou wrote:
> Dynamic zswap_pool creation may create/reuse to have multiple
> zswap_pools in a list, only the first will be current used.
> 
> Each zswap_pool has its own lru and shrinker, which is not
> necessary and has its problem:
> 
> 1. When memory has pressure, all shrinker of zswap_pools will
>    try to shrink its own lru, there is no order between them.
> 
> 2. When zswap limit hit, only the last zswap_pool's shrink_work
>    will try to shrink its lru, which is inefficient.
> 
> Anyway, having a global lru and shrinker shared by all zswap_pools
> is better and efficient.

It is also a great simplification.

> 
> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
> ---
>  mm/zswap.c | 153 ++++++++++++++++++++++---------------------------------------
>  1 file changed, 55 insertions(+), 98 deletions(-)
> 
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 62fe307521c9..7668db8c10e3 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -176,14 +176,17 @@ struct zswap_pool {
>  	struct kref kref;
>  	struct list_head list;
>  	struct work_struct release_work;
> -	struct work_struct shrink_work;
>  	struct hlist_node node;
>  	char tfm_name[CRYPTO_MAX_ALG_NAME];
> +};
> +
> +struct {

static?

>  	struct list_lru list_lru;
> -	struct mem_cgroup *next_shrink;
> -	struct shrinker *shrinker;

Just curious, any reason to change the relative ordering of members
here? It produces a couple more lines of diff :)

>  	atomic_t nr_stored;
> -};
> +	struct shrinker *shrinker;
> +	struct work_struct shrink_work;
> +	struct mem_cgroup *next_shrink;
> +} zswap;
>  
>  /*
>   * struct zswap_entry
> @@ -301,9 +304,6 @@ static void zswap_update_total_size(void)
>  * pool functions
>  **********************************/
>  
> -static void zswap_alloc_shrinker(struct zswap_pool *pool);
> -static void shrink_worker(struct work_struct *w);
> -
>  static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>  {
>  	int i;
> @@ -353,30 +353,16 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>  	if (ret)
>  		goto error;
>  
> -	zswap_alloc_shrinker(pool);
> -	if (!pool->shrinker)
> -		goto error;
> -
> -	pr_debug("using %s compressor\n", pool->tfm_name);
> -

Why are we removing this debug print?

>  	/* being the current pool takes 1 ref; this func expects the
>  	 * caller to always add the new pool as the current pool
>  	 */
>  	kref_init(&pool->kref);
>  	INIT_LIST_HEAD(&pool->list);
> -	if (list_lru_init_memcg(&pool->list_lru, pool->shrinker))
> -		goto lru_fail;
> -	shrinker_register(pool->shrinker);
> -	INIT_WORK(&pool->shrink_work, shrink_worker);
> -	atomic_set(&pool->nr_stored, 0);
>  
>  	zswap_pool_debug("created", pool);
>  
>  	return pool;
>  
> -lru_fail:
> -	list_lru_destroy(&pool->list_lru);
> -	shrinker_free(pool->shrinker);
>  error:
>  	if (pool->acomp_ctx)
>  		free_percpu(pool->acomp_ctx);
[..]
> @@ -816,14 +777,10 @@ void zswap_folio_swapin(struct folio *folio)
>  
>  void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
>  {
> -	struct zswap_pool *pool;
> -
> -	/* lock out zswap pools list modification */
> +	/* lock out zswap shrinker walking memcg tree */
>  	spin_lock(&zswap_pools_lock);
> -	list_for_each_entry(pool, &zswap_pools, list) {
> -		if (pool->next_shrink == memcg)
> -			pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
> -	}
> +	if (zswap.next_shrink == memcg)
> +		zswap.next_shrink = mem_cgroup_iter(NULL, zswap.next_shrink, NULL);

Now that next_shrink has nothing to do with zswap pools, it feels weird
that we are using zswap_pools_lock for its synchronization. Does it make
sense to have a separate lock for it just for semantic purposes?

>  	spin_unlock(&zswap_pools_lock);
>  }
>  
[..]
> @@ -1328,7 +1284,6 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
>  static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
>  		struct shrink_control *sc)
>  {
> -	struct zswap_pool *pool = shrinker->private_data;
>  	struct mem_cgroup *memcg = sc->memcg;
>  	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid));
>  	unsigned long nr_backing, nr_stored, nr_freeable, nr_protected;
> @@ -1343,7 +1298,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
>  #else
>  	/* use pool stats instead of memcg stats */
>  	nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT;

"pool" is still being used here.

> -	nr_stored = atomic_read(&pool->nr_stored);
> +	nr_stored = atomic_read(&zswap.nr_stored);
>  #endif
>  
>  	if (!nr_stored)
[..]  
> @@ -1804,6 +1749,21 @@ static int zswap_setup(void)
>  	if (ret)
>  		goto hp_fail;
>  
> +	shrink_wq = alloc_workqueue("zswap-shrink",
> +			WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
> +	if (!shrink_wq)
> +		goto hp_fail;

I think we need a new label here to call cpuhp_remove_multi_state(), but
apparently this is missing from the current code for some reason.

> +
> +	zswap.shrinker = zswap_alloc_shrinker();
> +	if (!zswap.shrinker)
> +		goto shrinker_fail;
> +	if (list_lru_init_memcg(&zswap.list_lru, zswap.shrinker))
> +		goto lru_fail;
> +	shrinker_register(zswap.shrinker);
> +
> +	INIT_WORK(&zswap.shrink_work, shrink_worker);
> +	atomic_set(&zswap.nr_stored, 0);
> +
>  	pool = __zswap_pool_create_fallback();
>  	if (pool) {
>  		pr_info("loaded using pool %s/%s\n", pool->tfm_name,
> @@ -1815,19 +1775,16 @@ static int zswap_setup(void)
>  		zswap_enabled = false;
>  	}
>  
> -	shrink_wq = alloc_workqueue("zswap-shrink",
> -			WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
> -	if (!shrink_wq)
> -		goto fallback_fail;
> -
>  	if (zswap_debugfs_init())
>  		pr_warn("debugfs initialization failed\n");
>  	zswap_init_state = ZSWAP_INIT_SUCCEED;
>  	return 0;
>  
> -fallback_fail:
> -	if (pool)
> -		zswap_pool_destroy(pool);
> +lru_fail:
> +	list_lru_destroy(&zswap.list_lru);

Do we need to call list_lru_destroy() here? I know it is currently being
called if list_lru_init_memcg() fails, but I fail to understand why. It
seems like list_lru_destroy() will do nothing anyway.

> +	shrinker_free(zswap.shrinker);
> +shrinker_fail:
> +	destroy_workqueue(shrink_wq);
>  hp_fail:
>  	kmem_cache_destroy(zswap_entry_cache);
>  cache_fail:
> 
> -- 
> b4 0.10.1
Chengming Zhou Feb. 13, 2024, 2:20 p.m. UTC | #7
On 2024/2/13 20:57, Yosry Ahmed wrote:
> On Sun, Feb 11, 2024 at 01:57:04PM +0000, Chengming Zhou wrote:
>> Dynamic zswap_pool creation may create/reuse to have multiple
>> zswap_pools in a list, only the first will be current used.
>>
>> Each zswap_pool has its own lru and shrinker, which is not
>> necessary and has its problem:
>>
>> 1. When memory has pressure, all shrinker of zswap_pools will
>>    try to shrink its own lru, there is no order between them.
>>
>> 2. When zswap limit hit, only the last zswap_pool's shrink_work
>>    will try to shrink its lru, which is inefficient.
>>
>> Anyway, having a global lru and shrinker shared by all zswap_pools
>> is better and efficient.
> 
> It is also a great simplification.
> 
>>
>> Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
>> ---
>>  mm/zswap.c | 153 ++++++++++++++++++++++---------------------------------------
>>  1 file changed, 55 insertions(+), 98 deletions(-)
>>
>> diff --git a/mm/zswap.c b/mm/zswap.c
>> index 62fe307521c9..7668db8c10e3 100644
>> --- a/mm/zswap.c
>> +++ b/mm/zswap.c
>> @@ -176,14 +176,17 @@ struct zswap_pool {
>>  	struct kref kref;
>>  	struct list_head list;
>>  	struct work_struct release_work;
>> -	struct work_struct shrink_work;
>>  	struct hlist_node node;
>>  	char tfm_name[CRYPTO_MAX_ALG_NAME];
>> +};
>> +
>> +struct {
> 
> static?

Ah, right, will add static.

> 
>>  	struct list_lru list_lru;
>> -	struct mem_cgroup *next_shrink;
>> -	struct shrinker *shrinker;
> 
> Just curious, any reason to change the relative ordering of members
> here? It produces a couple more lines of diff :)

The list_lru and nr_stored atomic variable are used in zswap_store/load
hotpath, the other shrinker related sound like cold path. I thought it's
normal and clearer to put them according to their usages.

> 
>>  	atomic_t nr_stored;
>> -};
>> +	struct shrinker *shrinker;
>> +	struct work_struct shrink_work;
>> +	struct mem_cgroup *next_shrink;
>> +} zswap;
>>  
>>  /*
>>   * struct zswap_entry
>> @@ -301,9 +304,6 @@ static void zswap_update_total_size(void)
>>  * pool functions
>>  **********************************/
>>  
>> -static void zswap_alloc_shrinker(struct zswap_pool *pool);
>> -static void shrink_worker(struct work_struct *w);
>> -
>>  static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>>  {
>>  	int i;
>> @@ -353,30 +353,16 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
>>  	if (ret)
>>  		goto error;
>>  
>> -	zswap_alloc_shrinker(pool);
>> -	if (!pool->shrinker)
>> -		goto error;
>> -
>> -	pr_debug("using %s compressor\n", pool->tfm_name);
>> -
> 
> Why are we removing this debug print?

Oh, I just noticed it's only necessary to print dmesg when "create" success,
the below "zswap_pool_debug()" will print its compressor too.

> 
>>  	/* being the current pool takes 1 ref; this func expects the
>>  	 * caller to always add the new pool as the current pool
>>  	 */
>>  	kref_init(&pool->kref);
>>  	INIT_LIST_HEAD(&pool->list);
>> -	if (list_lru_init_memcg(&pool->list_lru, pool->shrinker))
>> -		goto lru_fail;
>> -	shrinker_register(pool->shrinker);
>> -	INIT_WORK(&pool->shrink_work, shrink_worker);
>> -	atomic_set(&pool->nr_stored, 0);
>>  
>>  	zswap_pool_debug("created", pool);
>>  
>>  	return pool;
>>  
>> -lru_fail:
>> -	list_lru_destroy(&pool->list_lru);
>> -	shrinker_free(pool->shrinker);
>>  error:
>>  	if (pool->acomp_ctx)
>>  		free_percpu(pool->acomp_ctx);
> [..]
>> @@ -816,14 +777,10 @@ void zswap_folio_swapin(struct folio *folio)
>>  
>>  void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
>>  {
>> -	struct zswap_pool *pool;
>> -
>> -	/* lock out zswap pools list modification */
>> +	/* lock out zswap shrinker walking memcg tree */
>>  	spin_lock(&zswap_pools_lock);
>> -	list_for_each_entry(pool, &zswap_pools, list) {
>> -		if (pool->next_shrink == memcg)
>> -			pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
>> -	}
>> +	if (zswap.next_shrink == memcg)
>> +		zswap.next_shrink = mem_cgroup_iter(NULL, zswap.next_shrink, NULL);
> 
> Now that next_shrink has nothing to do with zswap pools, it feels weird
> that we are using zswap_pools_lock for its synchronization. Does it make
> sense to have a separate lock for it just for semantic purposes?

Agree, I think so, it's clearer to have another lock.

> 
>>  	spin_unlock(&zswap_pools_lock);
>>  }
>>  
> [..]
>> @@ -1328,7 +1284,6 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
>>  static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
>>  		struct shrink_control *sc)
>>  {
>> -	struct zswap_pool *pool = shrinker->private_data;
>>  	struct mem_cgroup *memcg = sc->memcg;
>>  	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid));
>>  	unsigned long nr_backing, nr_stored, nr_freeable, nr_protected;
>> @@ -1343,7 +1298,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
>>  #else
>>  	/* use pool stats instead of memcg stats */
>>  	nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT;
> 
> "pool" is still being used here.

Oops, should be changed to zswap_pool_total_size here.

> 
>> -	nr_stored = atomic_read(&pool->nr_stored);
>> +	nr_stored = atomic_read(&zswap.nr_stored);
>>  #endif
>>  
>>  	if (!nr_stored)
> [..]  
>> @@ -1804,6 +1749,21 @@ static int zswap_setup(void)
>>  	if (ret)
>>  		goto hp_fail;
>>  
>> +	shrink_wq = alloc_workqueue("zswap-shrink",
>> +			WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
>> +	if (!shrink_wq)
>> +		goto hp_fail;
> 
> I think we need a new label here to call cpuhp_remove_multi_state(), but
> apparently this is missing from the current code for some reason.

You are right! This should use a new label to "cpuhp_remove_multi_state()",
will fix it.

> 
>> +
>> +	zswap.shrinker = zswap_alloc_shrinker();
>> +	if (!zswap.shrinker)
>> +		goto shrinker_fail;
>> +	if (list_lru_init_memcg(&zswap.list_lru, zswap.shrinker))
>> +		goto lru_fail;
>> +	shrinker_register(zswap.shrinker);
>> +
>> +	INIT_WORK(&zswap.shrink_work, shrink_worker);
>> +	atomic_set(&zswap.nr_stored, 0);
>> +
>>  	pool = __zswap_pool_create_fallback();
>>  	if (pool) {
>>  		pr_info("loaded using pool %s/%s\n", pool->tfm_name,
>> @@ -1815,19 +1775,16 @@ static int zswap_setup(void)
>>  		zswap_enabled = false;
>>  	}
>>  
>> -	shrink_wq = alloc_workqueue("zswap-shrink",
>> -			WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
>> -	if (!shrink_wq)
>> -		goto fallback_fail;
>> -
>>  	if (zswap_debugfs_init())
>>  		pr_warn("debugfs initialization failed\n");
>>  	zswap_init_state = ZSWAP_INIT_SUCCEED;
>>  	return 0;
>>  
>> -fallback_fail:
>> -	if (pool)
>> -		zswap_pool_destroy(pool);
>> +lru_fail:
>> +	list_lru_destroy(&zswap.list_lru);
> 
> Do we need to call list_lru_destroy() here? I know it is currently being
> called if list_lru_init_memcg() fails, but I fail to understand why. It
> seems like list_lru_destroy() will do nothing anyway.

Right, it's not needed to call list_lru_destroy() here, it should do nothing,
will delete it.

Thanks!

> 
>> +	shrinker_free(zswap.shrinker);
>> +shrinker_fail:
>> +	destroy_workqueue(shrink_wq);
>>  hp_fail:
>>  	kmem_cache_destroy(zswap_entry_cache);
>>  cache_fail:
>>
>> -- 
>> b4 0.10.1
Yosry Ahmed Feb. 13, 2024, 5:43 p.m. UTC | #8
> >> @@ -353,30 +353,16 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> >>  	if (ret)
> >>  		goto error;
> >>  
> >> -	zswap_alloc_shrinker(pool);
> >> -	if (!pool->shrinker)
> >> -		goto error;
> >> -
> >> -	pr_debug("using %s compressor\n", pool->tfm_name);
> >> -
> > 
> > Why are we removing this debug print?

This pr_debug() was introduced when dynamic zswap pools were introduced,
and it was supposed to be printed right after the compressor is
initialized. IOW, it is supposed to be after the call to
cpuhp_state_add_instance() succeeds. The call to zswap_alloc_shrinker()
was mistakenly added above that pr_debug() call.

Anyway, I just realized you are now removing all failure cases between
than pr_debug() and the zswap_pool_debug() below, so there is no need to
keep both. You are right.

I am wondering if these debug prints are useful at all now, but that's a
question for another day :)
diff mbox series

Patch

diff --git a/mm/zswap.c b/mm/zswap.c
index 62fe307521c9..7668db8c10e3 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -176,14 +176,17 @@  struct zswap_pool {
 	struct kref kref;
 	struct list_head list;
 	struct work_struct release_work;
-	struct work_struct shrink_work;
 	struct hlist_node node;
 	char tfm_name[CRYPTO_MAX_ALG_NAME];
+};
+
+struct {
 	struct list_lru list_lru;
-	struct mem_cgroup *next_shrink;
-	struct shrinker *shrinker;
 	atomic_t nr_stored;
-};
+	struct shrinker *shrinker;
+	struct work_struct shrink_work;
+	struct mem_cgroup *next_shrink;
+} zswap;
 
 /*
  * struct zswap_entry
@@ -301,9 +304,6 @@  static void zswap_update_total_size(void)
 * pool functions
 **********************************/
 
-static void zswap_alloc_shrinker(struct zswap_pool *pool);
-static void shrink_worker(struct work_struct *w);
-
 static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 {
 	int i;
@@ -353,30 +353,16 @@  static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 	if (ret)
 		goto error;
 
-	zswap_alloc_shrinker(pool);
-	if (!pool->shrinker)
-		goto error;
-
-	pr_debug("using %s compressor\n", pool->tfm_name);
-
 	/* being the current pool takes 1 ref; this func expects the
 	 * caller to always add the new pool as the current pool
 	 */
 	kref_init(&pool->kref);
 	INIT_LIST_HEAD(&pool->list);
-	if (list_lru_init_memcg(&pool->list_lru, pool->shrinker))
-		goto lru_fail;
-	shrinker_register(pool->shrinker);
-	INIT_WORK(&pool->shrink_work, shrink_worker);
-	atomic_set(&pool->nr_stored, 0);
 
 	zswap_pool_debug("created", pool);
 
 	return pool;
 
-lru_fail:
-	list_lru_destroy(&pool->list_lru);
-	shrinker_free(pool->shrinker);
 error:
 	if (pool->acomp_ctx)
 		free_percpu(pool->acomp_ctx);
@@ -434,15 +420,8 @@  static void zswap_pool_destroy(struct zswap_pool *pool)
 
 	zswap_pool_debug("destroying", pool);
 
-	shrinker_free(pool->shrinker);
 	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
 	free_percpu(pool->acomp_ctx);
-	list_lru_destroy(&pool->list_lru);
-
-	spin_lock(&zswap_pools_lock);
-	mem_cgroup_iter_break(NULL, pool->next_shrink);
-	pool->next_shrink = NULL;
-	spin_unlock(&zswap_pools_lock);
 
 	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
 		zpool_destroy_pool(pool->zpools[i]);
@@ -529,24 +508,6 @@  static struct zswap_pool *zswap_pool_current_get(void)
 	return pool;
 }
 
-static struct zswap_pool *zswap_pool_last_get(void)
-{
-	struct zswap_pool *pool, *last = NULL;
-
-	rcu_read_lock();
-
-	list_for_each_entry_rcu(pool, &zswap_pools, list)
-		last = pool;
-	WARN_ONCE(!last && zswap_has_pool,
-		  "%s: no page storage pool!\n", __func__);
-	if (!zswap_pool_get(last))
-		last = NULL;
-
-	rcu_read_unlock();
-
-	return last;
-}
-
 /* type and compressor must be null-terminated */
 static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
 {
@@ -816,14 +777,10 @@  void zswap_folio_swapin(struct folio *folio)
 
 void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
 {
-	struct zswap_pool *pool;
-
-	/* lock out zswap pools list modification */
+	/* lock out zswap shrinker walking memcg tree */
 	spin_lock(&zswap_pools_lock);
-	list_for_each_entry(pool, &zswap_pools, list) {
-		if (pool->next_shrink == memcg)
-			pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
-	}
+	if (zswap.next_shrink == memcg)
+		zswap.next_shrink = mem_cgroup_iter(NULL, zswap.next_shrink, NULL);
 	spin_unlock(&zswap_pools_lock);
 }
 
@@ -923,9 +880,9 @@  static void zswap_entry_free(struct zswap_entry *entry)
 	if (!entry->length)
 		atomic_dec(&zswap_same_filled_pages);
 	else {
-		zswap_lru_del(&entry->pool->list_lru, entry);
+		zswap_lru_del(&zswap.list_lru, entry);
 		zpool_free(zswap_find_zpool(entry), entry->handle);
-		atomic_dec(&entry->pool->nr_stored);
+		atomic_dec(&zswap.nr_stored);
 		zswap_pool_put(entry->pool);
 	}
 	if (entry->objcg) {
@@ -1288,7 +1245,6 @@  static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
 {
 	struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
 	unsigned long shrink_ret, nr_protected, lru_size;
-	struct zswap_pool *pool = shrinker->private_data;
 	bool encountered_page_in_swapcache = false;
 
 	if (!zswap_shrinker_enabled ||
@@ -1299,7 +1255,7 @@  static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
 
 	nr_protected =
 		atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
-	lru_size = list_lru_shrink_count(&pool->list_lru, sc);
+	lru_size = list_lru_shrink_count(&zswap.list_lru, sc);
 
 	/*
 	 * Abort if we are shrinking into the protected region.
@@ -1316,7 +1272,7 @@  static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
 		return SHRINK_STOP;
 	}
 
-	shrink_ret = list_lru_shrink_walk(&pool->list_lru, sc, &shrink_memcg_cb,
+	shrink_ret = list_lru_shrink_walk(&zswap.list_lru, sc, &shrink_memcg_cb,
 		&encountered_page_in_swapcache);
 
 	if (encountered_page_in_swapcache)
@@ -1328,7 +1284,6 @@  static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
 static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
 		struct shrink_control *sc)
 {
-	struct zswap_pool *pool = shrinker->private_data;
 	struct mem_cgroup *memcg = sc->memcg;
 	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid));
 	unsigned long nr_backing, nr_stored, nr_freeable, nr_protected;
@@ -1343,7 +1298,7 @@  static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
 #else
 	/* use pool stats instead of memcg stats */
 	nr_backing = get_zswap_pool_size(pool) >> PAGE_SHIFT;
-	nr_stored = atomic_read(&pool->nr_stored);
+	nr_stored = atomic_read(&zswap.nr_stored);
 #endif
 
 	if (!nr_stored)
@@ -1351,7 +1306,7 @@  static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
 
 	nr_protected =
 		atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
-	nr_freeable = list_lru_shrink_count(&pool->list_lru, sc);
+	nr_freeable = list_lru_shrink_count(&zswap.list_lru, sc);
 	/*
 	 * Subtract the lru size by an estimate of the number of pages
 	 * that should be protected.
@@ -1367,23 +1322,24 @@  static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
 	return mult_frac(nr_freeable, nr_backing, nr_stored);
 }
 
-static void zswap_alloc_shrinker(struct zswap_pool *pool)
+static struct shrinker *zswap_alloc_shrinker(void)
 {
-	pool->shrinker =
+	struct shrinker *shrinker;
+
+	shrinker =
 		shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "mm-zswap");
-	if (!pool->shrinker)
-		return;
+	if (!shrinker)
+		return NULL;
 
-	pool->shrinker->private_data = pool;
-	pool->shrinker->scan_objects = zswap_shrinker_scan;
-	pool->shrinker->count_objects = zswap_shrinker_count;
-	pool->shrinker->batch = 0;
-	pool->shrinker->seeks = DEFAULT_SEEKS;
+	shrinker->scan_objects = zswap_shrinker_scan;
+	shrinker->count_objects = zswap_shrinker_count;
+	shrinker->batch = 0;
+	shrinker->seeks = DEFAULT_SEEKS;
+	return shrinker;
 }
 
 static int shrink_memcg(struct mem_cgroup *memcg)
 {
-	struct zswap_pool *pool;
 	int nid, shrunk = 0;
 
 	if (!mem_cgroup_zswap_writeback_enabled(memcg))
@@ -1396,32 +1352,25 @@  static int shrink_memcg(struct mem_cgroup *memcg)
 	if (memcg && !mem_cgroup_online(memcg))
 		return -ENOENT;
 
-	pool = zswap_pool_current_get();
-	if (!pool)
-		return -EINVAL;
-
 	for_each_node_state(nid, N_NORMAL_MEMORY) {
 		unsigned long nr_to_walk = 1;
 
-		shrunk += list_lru_walk_one(&pool->list_lru, nid, memcg,
+		shrunk += list_lru_walk_one(&zswap.list_lru, nid, memcg,
 					    &shrink_memcg_cb, NULL, &nr_to_walk);
 	}
-	zswap_pool_put(pool);
 	return shrunk ? 0 : -EAGAIN;
 }
 
 static void shrink_worker(struct work_struct *w)
 {
-	struct zswap_pool *pool = container_of(w, typeof(*pool),
-						shrink_work);
 	struct mem_cgroup *memcg;
 	int ret, failures = 0;
 
 	/* global reclaim will select cgroup in a round-robin fashion. */
 	do {
 		spin_lock(&zswap_pools_lock);
-		pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL);
-		memcg = pool->next_shrink;
+		zswap.next_shrink = mem_cgroup_iter(NULL, zswap.next_shrink, NULL);
+		memcg = zswap.next_shrink;
 
 		/*
 		 * We need to retry if we have gone through a full round trip, or if we
@@ -1445,7 +1394,7 @@  static void shrink_worker(struct work_struct *w)
 		if (!mem_cgroup_tryget_online(memcg)) {
 			/* drop the reference from mem_cgroup_iter() */
 			mem_cgroup_iter_break(NULL, memcg);
-			pool->next_shrink = NULL;
+			zswap.next_shrink = NULL;
 			spin_unlock(&zswap_pools_lock);
 
 			if (++failures == MAX_RECLAIM_RETRIES)
@@ -1467,7 +1416,6 @@  static void shrink_worker(struct work_struct *w)
 resched:
 		cond_resched();
 	} while (!zswap_can_accept());
-	zswap_pool_put(pool);
 }
 
 static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
@@ -1508,7 +1456,6 @@  bool zswap_store(struct folio *folio)
 	struct zswap_entry *entry, *dupentry;
 	struct obj_cgroup *objcg = NULL;
 	struct mem_cgroup *memcg = NULL;
-	struct zswap_pool *shrink_pool;
 
 	VM_WARN_ON_ONCE(!folio_test_locked(folio));
 	VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
@@ -1576,7 +1523,7 @@  bool zswap_store(struct folio *folio)
 
 	if (objcg) {
 		memcg = get_mem_cgroup_from_objcg(objcg);
-		if (memcg_list_lru_alloc(memcg, &entry->pool->list_lru, GFP_KERNEL)) {
+		if (memcg_list_lru_alloc(memcg, &zswap.list_lru, GFP_KERNEL)) {
 			mem_cgroup_put(memcg);
 			goto put_pool;
 		}
@@ -1607,8 +1554,8 @@  bool zswap_store(struct folio *folio)
 	}
 	if (entry->length) {
 		INIT_LIST_HEAD(&entry->lru);
-		zswap_lru_add(&entry->pool->list_lru, entry);
-		atomic_inc(&entry->pool->nr_stored);
+		zswap_lru_add(&zswap.list_lru, entry);
+		atomic_inc(&zswap.nr_stored);
 	}
 	spin_unlock(&tree->lock);
 
@@ -1640,9 +1587,7 @@  bool zswap_store(struct folio *folio)
 	return false;
 
 shrink:
-	shrink_pool = zswap_pool_last_get();
-	if (shrink_pool && !queue_work(shrink_wq, &shrink_pool->shrink_work))
-		zswap_pool_put(shrink_pool);
+	queue_work(shrink_wq, &zswap.shrink_work);
 	goto reject;
 }
 
@@ -1804,6 +1749,21 @@  static int zswap_setup(void)
 	if (ret)
 		goto hp_fail;
 
+	shrink_wq = alloc_workqueue("zswap-shrink",
+			WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
+	if (!shrink_wq)
+		goto hp_fail;
+
+	zswap.shrinker = zswap_alloc_shrinker();
+	if (!zswap.shrinker)
+		goto shrinker_fail;
+	if (list_lru_init_memcg(&zswap.list_lru, zswap.shrinker))
+		goto lru_fail;
+	shrinker_register(zswap.shrinker);
+
+	INIT_WORK(&zswap.shrink_work, shrink_worker);
+	atomic_set(&zswap.nr_stored, 0);
+
 	pool = __zswap_pool_create_fallback();
 	if (pool) {
 		pr_info("loaded using pool %s/%s\n", pool->tfm_name,
@@ -1815,19 +1775,16 @@  static int zswap_setup(void)
 		zswap_enabled = false;
 	}
 
-	shrink_wq = alloc_workqueue("zswap-shrink",
-			WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
-	if (!shrink_wq)
-		goto fallback_fail;
-
 	if (zswap_debugfs_init())
 		pr_warn("debugfs initialization failed\n");
 	zswap_init_state = ZSWAP_INIT_SUCCEED;
 	return 0;
 
-fallback_fail:
-	if (pool)
-		zswap_pool_destroy(pool);
+lru_fail:
+	list_lru_destroy(&zswap.list_lru);
+	shrinker_free(zswap.shrinker);
+shrinker_fail:
+	destroy_workqueue(shrink_wq);
 hp_fail:
 	kmem_cache_destroy(zswap_entry_cache);
 cache_fail: