[057/227] mm: list_lru: replace linear array with xarray

Message ID	20220322214126.1F5EDC340EC@smtp.kernel.org (mailing list archive)
State	New
Headers	show Return-Path: <owner-linux-mm@kvack.org> Date: Tue, 22 Mar 2022 14:41:25 -0700 To: zhengqi.arch@bytedance.com,willy@infradead.org,vdavydov.dev@gmail.com,vbabka@suse.cz,tytso@mit.edu,trond.myklebust@hammerspace.com,shy828301@gmail.com,shakeelb@google.com,roman.gushchin@linux.dev,richard.weiyang@gmail.com,mhocko@kernel.org,kari.argillander@gmail.com,jaegeuk@kernel.org,hannes@cmpxchg.org,fam.zheng@bytedance.com,duanxiongchun@bytedance.com,david@fromorbit.com,chao@kernel.org,Anna.Schumaker@Netapp.com,alexs@kernel.org,songmuchun@bytedance.com,akpm@linux-foundation.org,patches@lists.linux.dev,linux-mm@kvack.org,mm-commits@vger.kernel.org,torvalds@linux-foundation.org,akpm@linux-foundation.org From: Andrew Morton <akpm@linux-foundation.org> In-Reply-To: <20220322143803.04a5e59a07e48284f196a2f9@linux-foundation.org> Subject: [patch 057/227] mm: list_lru: replace linear array with xarray Message-Id: <20220322214126.1F5EDC340EC@smtp.kernel.org> Sender: owner-linux-mm@kvack.org Precedence: bulk
Series	[001/227] linux/kthread.h: remove unused macros \| expand [001/227] linux/kthread.h: remove unused macros [002/227] scripts/spelling.txt: add more spellings to spelling.txt [003/227] ntfs: add sanity check on allocation size [004/227] ocfs2: cleanup some return variables [005/227] fs/ocfs2: fix comments mentioning i_mutex [006/227] doc: convert 'subsection' to 'section' in gfp.h [007/227] mm: document and polish read-ahead code [008/227] mm: improve cleanup when ->readpages doesn't process all pages [009/227] fuse: remove reliance on bdi congestion [010/227] nfs: remove reliance on bdi congestion [011/227] ceph: remove reliance on bdi congestion [012/227] remove inode_congested() [013/227] remove bdi_congested() and wb_congested() and related functions [014/227] f2fs: replace congestion_wait() calls with io_schedule_timeout() [015/227] block/bfq-iosched.c: use "false" rather than "BLK_RW_ASYNC" [016/227] remove congestion tracking framework [017/227] mount: warn only once about timestamp range expiration [018/227] mm/memremap: avoid calling kasan_remove_zero_shadow() for device private memory [019/227] filemap: remove find_get_pages() [020/227] mm/writeback: minor clean up for highmem_dirtyable_memory [021/227] mm: fs: fix lru_cache_disabled race in bh_lru [022/227] mm: fix invalid page pointer returned with FOLL_PIN gups [023/227] mm/gup: follow_pfn_pte(): -EEXIST cleanup [024/227] mm/gup: remove unused pin_user_pages_locked() [025/227] mm: change lookup_node() to use get_user_pages_fast() [026/227] mm/gup: remove unused get_user_pages_locked() [027/227] mm/swap: fix confusing comment in folio_mark_accessed [028/227] tmpfs: support for file creation time [029/227] shmem: mapping_set_exiting() to help mapped resilience [030/227] tmpfs: do not allocate pages on read [031/227] mm: shmem: use helper macro __ATTR_RW [032/227] memcg: replace in_interrupt() with !in_task() [033/227] memcg: add per-memcg total kernel memory stat [034/227] mm/memcg: 
mem_cgroup_per_node is already set to 0 on allocation [035/227] mm/memcg: retrieve parent memcg from css.parent [036/227] memcg: refactor mem_cgroup_oom [037/227] memcg: unify force charging conditions [038/227] selftests: memcg: test high limit for single entry allocation [039/227] memcg: synchronously enforce memory.high for large overcharges [040/227] mm/memcontrol: return 1 from cgroup.memory __setup() handler [041/227] mm/memcg: revert ("mm/memcg: optimize user context object stock access") [043/227] mm/memcg: protect per-CPU counter by disabling preemption on PREEMPT_RT where needed. [044/227] mm/memcg: opencode the inner part of obj_cgroup_uncharge_pages() in drain_obj_stock() [045/227] mm/memcg: protect memcg_stock with a local_lock_t [047/227] mm: list_lru: transpose the array of per-node per-memcg lru lists [048/227] mm: introduce kmem_cache_alloc_lru [049/227] fs: introduce alloc_inode_sb() to allocate filesystems specific inode [050/227] fs: allocate inode by using alloc_inode_sb() [051/227] f2fs: allocate inode by using alloc_inode_sb() [052/227] mm: dcache: use kmem_cache_alloc_lru() to allocate dentry [053/227] xarray: use kmem_cache_alloc_lru to allocate xa_node [054/227] mm: memcontrol: move memcg_online_kmem() to mem_cgroup_css_online() [055/227] mm: list_lru: allocate list_lru_one only when needed [056/227] mm: list_lru: rename memcg_drain_all_list_lrus to memcg_reparent_list_lrus [057/227] mm: list_lru: replace linear array with xarray [058/227] mm: memcontrol: reuse memory cgroup ID for kmem ID [059/227] mm: memcontrol: fix cannot alloc the maximum memcg ID [060/227] mm: list_lru: rename list_lru_per_memcg to list_lru_memcg [061/227] mm: memcontrol: rename memcg_cache_id to memcg_kmem_id [062/227] memcg: enable accounting for tty-related objects [063/227] selftests, x86: fix how check_cc.sh is being invoked [064/227] mm: merge pte_mkhuge() call into arch_make_huge_pte() [065/227] mm: remove mmu_gathers storage from remaining architectures 
[066/227] mm: thp: fix wrong cache flush in remove_migration_pmd() [067/227] mm: fix missing cache flush for all tail pages of compound page [068/227] mm: hugetlb: fix missing cache flush in copy_huge_page_from_user() [069/227] mm: hugetlb: fix missing cache flush in hugetlb_mcopy_atomic_pte() [070/227] mm: shmem: fix missing cache flush in shmem_mfill_atomic_pte() [071/227] mm: userfaultfd: fix missing cache flush in mcopy_atomic_pte() and __mcopy_atomic() [072/227] mm: replace multiple dcache flush with flush_dcache_folio() [073/227] mm: don't skip swap entry even if zap_details specified [074/227] mm: rename zap_skip_check_mapping() to should_zap_page() [075/227] mm: change zap_details.zap_mapping into even_cows [076/227] mm: rework swap handling of zap_pte_range [077/227] mm/mmap: return 1 from stack_guard_gap __setup() handler [078/227] mm/memory.c: use helper function range_in_vma() [079/227] mm/memory.c: use helper macro min and max in unmap_mapping_range_tree() [080/227] mm: _install_special_mapping() apply VM_LOCKED_CLEAR_MASK [081/227] mm/mmap: remove obsolete comment in ksys_mmap_pgoff [082/227] mm/mremap:: use vma_lookup() instead of find_vma() [083/227] mm/sparse: make mminit_validate_memmodel_limits() static [084/227] mm/vmalloc: remove unneeded function forward declaration [085/227] mm/vmalloc: Move draining areas out of caller context [086/227] mm/vmalloc: add adjust_search_size parameter [087/227] mm/vmalloc: eliminate an extra orig_gfp_mask [088/227] mm/vmalloc.c: fix "unused function" warning [089/227] mm/vmalloc: fix comments about vmap_area struct [090/227] mm: page_alloc: avoid merging non-fallbackable pageblocks with others [091/227] mm/mmzone.c: use try_cmpxchg() in page_cpupid_xchg_last() [092/227] mm/mmzone.h: remove unused macros [093/227] mm/page_alloc: don't pass pfn to free_unref_page_commit() [094/227] cma: factor out minimum alignment requirement [095/227] mm: enforce pageblock_order < MAX_ORDER [096/227] mm/page_alloc: mark pagesets 
as __maybe_unused [097/227] mm/pages_alloc.c: don't create ZONE_MOVABLE beyond the end of a node [098/227] mm/page_alloc: fetch the correct pcp buddy during bulk free [099/227] mm/page_alloc: track range of active PCP lists during bulk free [100/227] mm/page_alloc: simplify how many pages are selected per pcp list during bulk free [101/227] mm/page_alloc: drain the requested list first during bulk free [102/227] mm/page_alloc: free pages in a single pass during bulk free [103/227] mm/page_alloc: limit number of high-order pages on PCP during bulk free [104/227] mm/page_alloc: do not prefetch buddies during bulk free [105/227] arch/x86/mm/numa: Do not initialize nodes twice [106/227] mm: count time in drain_all_pages during direct reclaim as memory pressure [107/227] mm/page_alloc: call check_new_pages() while zone spinlock is not held [108/227] mm/page_alloc: check high-order pages for corruption during PCP operations [109/227] mm/memory-failure.c: remove obsolete comment [110/227] mm/hwpoison: fix error page recovered but reported "not recovered" [111/227] mm: invalidate hwpoison page cache page in fault path [112/227] mm/memory-failure.c: minor clean up for memory_failure_dev_pagemap [113/227] mm/memory-failure.c: catch unexpected -EFAULT from vma_address() [114/227] mm/memory-failure.c: rework the signaling logic in kill_proc [115/227] mm/memory-failure.c: fix race with changing page more robustly [116/227] mm/memory-failure.c: remove PageSlab check in hwpoison_filter_dev [117/227] mm/memory-failure.c: rework the try_to_unmap logic in hwpoison_user_mappings() [118/227] mm/memory-failure.c: remove obsolete comment in __soft_offline_page [119/227] mm/memory-failure.c: remove unnecessary PageTransTail check [120/227] mm/hwpoison-inject: support injecting hwpoison to free page [121/227] mm/hwpoison: avoid the impact of hwpoison_filter() return value on mce handler [122/227] mm/hwpoison: add in-use hugepage hwpoison filter judgement [123/227] mm/memory-failure.c: fix 
race with changing page compound again [124/227] mm/memory-failure.c: avoid calling invalidate_inode_page() with unexpected pages [125/227] mm/memory-failure.c: make non-LRU movable pages unhandlable [126/227] mm, fault-injection: declare should_fail_alloc_page() [127/227] mm/mlock: fix potential imbalanced rlimit ucounts adjustment [128/227] mm: hugetlb: free the 2nd vmemmap page associated with each HugeTLB page [129/227] mm: hugetlb: replace hugetlb_free_vmemmap_enabled with a static_key [130/227] mm: sparsemem: use page table lock to protect kernel pmd operations [131/227] selftests: vm: add a hugetlb test case [132/227] mm: sparsemem: move vmemmap related to HugeTLB to CONFIG_HUGETLB_PAGE_FREE_VMEMMAP [133/227] mm/hugetlb: generalize ARCH_WANT_GENERAL_HUGETLB [134/227] hugetlb: clean up potential spectre issue warnings [135/227] mm/hugetlb: use helper macro __ATTR_RW [136/227] mm/hugetlb.c: export PageHeadHuge() [137/227] mm: remove unneeded local variable follflags [138/227] userfaultfd: provide unmasked address on page-fault [139/227] userfaultfd/selftests: fix uninitialized_var.cocci warning [140/227] mm/fs: delete PF_SWAPWRITE [141/227] mm: __isolate_lru_page_prepare() in isolate_migratepages_block() [142/227] mm/list_lru: optimize memcg_reparent_list_lru_node() [143/227] mm: lru_cache_disable: replace work queue synchronization with synchronize_rcu [144/227] mm: workingset: replace IRQ-off check with a lockdep assert. 
[145/227] mm: vmscan: fix documentation for page_check_references() [146/227] mm: compaction: cleanup the compaction trace events [147/227] mempolicy: mbind_range() set_policy() after vma_merge() [148/227] mm/oom_kill: remove unneeded is_memcg_oom check [149/227] mm,migrate: fix establishing demotion target [150/227] mm/migrate: fix race between lock page and clear PG_Isolated [151/227] mm/thp: refix __split_huge_pmd_locked() for migration PMD [152/227] mm/cma: provide option to opt out from exposing pages on activation failure [153/227] powerpc/fadump: opt out from freeing pages on cma activation failure [154/227] NUMA Balancing: add page promotion counter [155/227] NUMA balancing: optimize page placement for memory tiering system [156/227] memory tiering: skip to scan fast memory [157/227] mm: page_io: fix psi memory pressure error on cold swapins [158/227] mm/vmstat: add event for ksm swapping in copy [159/227] mm/ksm: use helper macro __ATTR_RW [160/227] mm/hwpoison: check the subpage, not the head page [161/227] mm/madvise: use vma_lookup() instead of find_vma() [162/227] mm: madvise: return correct bytes advised with process_madvise [163/227] mm: madvise: skip unmapped vma holes passed to process_madvise [164/227] mm, memory_hotplug: make arch_alloc_nodedata independent on CONFIG_MEMORY_HOTPLUG [165/227] mm: handle uninitialized numa nodes gracefully [166/227] mm, memory_hotplug: drop arch_free_nodedata [167/227] mm, memory_hotplug: reorganize new pgdat initialization [168/227] mm: make free_area_init_node aware of memory less nodes [169/227] memcg: do not tweak node in alloc_mem_cgroup_per_node_info [170/227] drivers/base/memory: add memory block to memory group after registration succeeded [171/227] drivers/base/node: consolidate node device subsystem initialization in node_dev_init() [172/227] mm/memory_hotplug: remove obsolete comment of __add_pages [173/227] mm/memory_hotplug: avoid calling zone_intersects() for ZONE_NORMAL [174/227] mm/memory_hotplug: 
clean up try_offline_node [175/227] mm/memory_hotplug: fix misplaced comment in offline_pages [176/227] drivers/base/node: rename link_mem_sections() to register_memory_block_under_node() [177/227] drivers/base/memory: determine and store zone for single-zone memory blocks [178/227] drivers/base/memory: clarify adding and removing of memory blocks [179/227] mm: only re-generate demotion targets when a numa node changes its N_CPU state [180/227] mm/thp: ClearPageDoubleMap in first page_add_file_rmap() [181/227] mm/zswap.c: allow handling just same-value filled pages [182/227] mm: remove usercopy_warn() [183/227] mm: uninline copy_overflow() [184/227] mm/usercopy: return 1 from hardened_usercopy __setup() handler [185/227] mm/early_ioremap: declare early_memremap_pgprot_adjust() [186/227] highmem: document kunmap_local() [187/227] mm/highmem: remove unnecessary done label [188/227] mm/page_table_check.c: use strtobool for param parsing [189/227] mm/kfence: remove unnecessary CONFIG_KFENCE option [190/227] kfence: allow re-enabling KFENCE after system startup [191/227] kfence: alloc kfence_pool after system startup [192/227] kunit: fix UAF when run kfence test case test_gfpzero [193/227] kunit: make kunit_test_timeout compatible with comment [194/227] kfence: test: try to avoid test_gfpzero trigger rcu_stall [195/227] kfence: allow use of a deferrable timer [196/227] mm/hmm.c: remove unneeded local variable ret [197/227] mm/damon/dbgfs/init_regions: use target index instead of target id [198/227] Docs/admin-guide/mm/damon/usage: update for changed initail_regions file input [199/227] mm/damon/core: move damon_set_targets() into dbgfs [200/227] mm/damon: remove the target id concept [201/227] mm/damon: remove redundant page validation [202/227] mm/damon: rename damon_primitives to damon_operations [203/227] mm/damon: let monitoring operations can be registered and selected [204/227] mm/damon/paddr,vaddr: register themselves to DAMON in subsys_initcall [205/227] 
mm/damon/reclaim: use damon_select_ops() instead of damon_{v,p}a_set_operations() [206/227] mm/damon/dbgfs: use damon_select_ops() instead of damon_{v,p}a_set_operations() [207/227] mm/damon/dbgfs: use operations id for knowing if the target has pid [208/227] mm/damon/dbgfs-test: fix is_target_id() change [209/227] mm/damon/paddr,vaddr: remove damon_{p,v}a_{target_valid,set_operations}() [210/227] mm/damon: remove unnecessary CONFIG_DAMON option [211/227] Docs/vm/damon: call low level monitoring primitives the operations [212/227] Docs/vm/damon/design: update DAMON-Idle Page Tracking interference handling [213/227] Docs/damon: update outdated term 'regions update interval' [214/227] mm/damon/core: allow non-exclusive DAMON start/stop [215/227] mm/damon/core: add number of each enum type values [216/227] mm/damon: implement a minimal stub for sysfs-based DAMON interface [217/227] mm/damon/sysfs: link DAMON for virtual address spaces monitoring [218/227] mm/damon/sysfs: support the physical address space monitoring [219/227] mm/damon/sysfs: support DAMON-based Operation Schemes [220/227] mm/damon/sysfs: support DAMOS quotas [221/227] mm/damon/sysfs: support schemes prioritization [222/227] mm/damon/sysfs: support DAMOS watermarks [223/227] mm/damon/sysfs: support DAMOS stats [224/227] selftests/damon: add a test for DAMON sysfs interface [225/227] Docs/admin-guide/mm/damon/usage: document DAMON sysfs interface [226/227] Docs/ABI/testing: add DAMON sysfs interface ABI document [227/227] mm/damon/sysfs: remove repeat container_of() in damon_sysfs_kdamond_release()

Message ID

20220322214126.1F5EDC340EC@smtp.kernel.org (mailing list archive)

State

New

Headers

Date: Tue, 22 Mar 2022 14:41:25 -0700
To: 
 zhengqi.arch@bytedance.com,willy@infradead.org,vdavydov.dev@gmail.com,vbabka@suse.cz,tytso@mit.edu,trond.myklebust@hammerspace.com,shy828301@gmail.com,shakeelb@google.com,roman.gushchin@linux.dev,richard.weiyang@gmail.com,mhocko@kernel.org,kari.argillander@gmail.com,jaegeuk@kernel.org,hannes@cmpxchg.org,fam.zheng@bytedance.com,duanxiongchun@bytedance.com,david@fromorbit.com,chao@kernel.org,Anna.Schumaker@Netapp.com,alexs@kernel.org,songmuchun@bytedance.com,akpm@linux-foundation.org,patches@lists.linux.dev,linux-mm@kvack.org,mm-commits@vger.kernel.org,torvalds@linux-foundation.org,akpm@linux-foundation.org
From: Andrew Morton <akpm@linux-foundation.org>
In-Reply-To: <20220322143803.04a5e59a07e48284f196a2f9@linux-foundation.org>
Subject: [patch 057/227] mm: list_lru: replace linear array with xarray
Message-Id: <20220322214126.1F5EDC340EC@smtp.kernel.org>
Sender: owner-linux-mm@kvack.org
Precedence: bulk

Series

[001/227] linux/kthread.h: remove unused macros | expand

Commit Message

Andrew Morton March 22, 2022, 9:41 p.m. UTC

From: Muchun Song <songmuchun@bytedance.com>
Subject: mm: list_lru: replace linear array with xarray

If we run 10k containers in the system, the size of the
list_lru_memcg->lrus can be ~96KB per list_lru.  When we decrease the
number of containers, the size of the array will not be shrunk.  It is not
scalable.  The xarray is a good choice for this case.  We can save a lot
of memory when there are tens of thousands of containers in the system.  If
we use xarray, we can also remove the code that resizes the array, which
simplifies the code.

[akpm@linux-foundation.org: remove unused local]
Link: https://lkml.kernel.org/r/20220228122126.37293-13-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Cc: Alex Shi <alexs@kernel.org>
Cc: Anna Schumaker <Anna.Schumaker@Netapp.com>
Cc: Chao Yu <chao@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Fam Zheng <fam.zheng@bytedance.com>
Cc: Jaegeuk Kim <jaegeuk@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kari Argillander <kari.argillander@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Trond Myklebust <trond.myklebust@hammerspace.com>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 include/linux/list_lru.h   |   13 --
 include/linux/memcontrol.h |   23 ---
 mm/list_lru.c              |  203 +++++++++++------------------------
 mm/memcontrol.c            |   77 -------------
 4 files changed, 73 insertions(+), 243 deletions(-)

--- a/include/linux/list_lru.h~mm-list_lru-replace-linear-array-with-xarray
+++ a/include/linux/list_lru.h
@@ -11,6 +11,7 @@ 
 #include <linux/list.h>
 #include <linux/nodemask.h>
 #include <linux/shrinker.h>
+#include <linux/xarray.h>
 
 struct mem_cgroup;
 
@@ -37,12 +38,6 @@  struct list_lru_per_memcg {
 	struct list_lru_one	node[];
 };
 
-struct list_lru_memcg {
-	struct rcu_head			rcu;
-	/* array of per cgroup lists, indexed by memcg_cache_id */
-	struct list_lru_per_memcg __rcu	*mlru[];
-};
-
 struct list_lru_node {
 	/* protects all lists on the node, including per cgroup */
 	spinlock_t		lock;
@@ -57,10 +52,7 @@  struct list_lru {
 	struct list_head	list;
 	int			shrinker_id;
 	bool			memcg_aware;
-	/* protects ->mlrus->mlru[i] */
-	spinlock_t		lock;
-	/* for cgroup aware lrus points to per cgroup lists, otherwise NULL */
-	struct list_lru_memcg	__rcu *mlrus;
+	struct xarray		xa;
 #endif
 };
 
@@ -77,7 +69,6 @@  int __list_lru_init(struct list_lru *lru
 
 int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
 			 gfp_t gfp);
-int memcg_update_all_list_lrus(int num_memcgs);
 void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent);
 
 /**
--- a/include/linux/memcontrol.h~mm-list_lru-replace-linear-array-with-xarray
+++ a/include/linux/memcontrol.h
@@ -1685,18 +1685,6 @@  void obj_cgroup_uncharge(struct obj_cgro
 
 extern struct static_key_false memcg_kmem_enabled_key;
 
-extern int memcg_nr_cache_ids;
-void memcg_get_cache_ids(void);
-void memcg_put_cache_ids(void);
-
-/*
- * Helper macro to loop through all memcg-specific caches. Callers must still
- * check if the cache is valid (it is either valid or NULL).
- * the slab_mutex must be held when looping through those caches
- */
-#define for_each_memcg_cache_index(_idx)	\
-	for ((_idx) = 0; (_idx) < memcg_nr_cache_ids; (_idx)++)
-
 static inline bool memcg_kmem_enabled(void)
 {
 	return static_branch_likely(&memcg_kmem_enabled_key);
@@ -1753,9 +1741,6 @@  static inline void __memcg_kmem_uncharge
 {
 }
 
-#define for_each_memcg_cache_index(_idx)	\
-	for (; NULL; )
-
 static inline bool memcg_kmem_enabled(void)
 {
 	return false;
@@ -1766,14 +1751,6 @@  static inline int memcg_cache_id(struct
 	return -1;
 }
 
-static inline void memcg_get_cache_ids(void)
-{
-}
-
-static inline void memcg_put_cache_ids(void)
-{
-}
-
 static inline struct mem_cgroup *mem_cgroup_from_obj(void *p)
 {
        return NULL;
--- a/mm/list_lru.c~mm-list_lru-replace-linear-array-with-xarray
+++ a/mm/list_lru.c
@@ -52,21 +52,12 @@  static int lru_shrinker_id(struct list_l
 static inline struct list_lru_one *
 list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
 {
-	struct list_lru_memcg *mlrus;
-	struct list_lru_node *nlru = &lru->node[nid];
-
-	/*
-	 * Either lock or RCU protects the array of per cgroup lists
-	 * from relocation (see memcg_update_list_lru).
-	 */
-	mlrus = rcu_dereference_check(lru->mlrus, lockdep_is_held(&nlru->lock));
-	if (mlrus && idx >= 0) {
-		struct list_lru_per_memcg *mlru;
+	if (list_lru_memcg_aware(lru) && idx >= 0) {
+		struct list_lru_per_memcg *mlru = xa_load(&lru->xa, idx);
 
-		mlru = rcu_dereference_check(mlrus->mlru[idx], true);
 		return mlru ? &mlru->node[nid] : NULL;
 	}
-	return &nlru->lru;
+	return &lru->node[nid].lru;
 }
 
 static inline struct list_lru_one *
@@ -77,7 +68,7 @@  list_lru_from_kmem(struct list_lru *lru,
 	struct list_lru_one *l = &nlru->lru;
 	struct mem_cgroup *memcg = NULL;
 
-	if (!lru->mlrus)
+	if (!list_lru_memcg_aware(lru))
 		goto out;
 
 	memcg = mem_cgroup_from_obj(ptr);
@@ -309,16 +300,20 @@  unsigned long list_lru_walk_node(struct
 				 unsigned long *nr_to_walk)
 {
 	long isolated = 0;
-	int memcg_idx;
 
 	isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
 				      nr_to_walk);
+
+#ifdef CONFIG_MEMCG_KMEM
 	if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
-		for_each_memcg_cache_index(memcg_idx) {
+		struct list_lru_per_memcg *mlru;
+		unsigned long index;
+
+		xa_for_each(&lru->xa, index, mlru) {
 			struct list_lru_node *nlru = &lru->node[nid];
 
 			spin_lock(&nlru->lock);
-			isolated += __list_lru_walk_one(lru, nid, memcg_idx,
+			isolated += __list_lru_walk_one(lru, nid, index,
 							isolate, cb_arg,
 							nr_to_walk);
 			spin_unlock(&nlru->lock);
@@ -327,6 +322,8 @@  unsigned long list_lru_walk_node(struct
 				break;
 		}
 	}
+#endif
+
 	return isolated;
 }
 EXPORT_SYMBOL_GPL(list_lru_walk_node);
@@ -338,15 +335,6 @@  static void init_one_lru(struct list_lru
 }
 
 #ifdef CONFIG_MEMCG_KMEM
-static void memcg_destroy_list_lru_range(struct list_lru_memcg *mlrus,
-					 int begin, int end)
-{
-	int i;
-
-	for (i = begin; i < end; i++)
-		kfree(mlrus->mlru[i]);
-}
-
 static struct list_lru_per_memcg *memcg_init_list_lru_one(gfp_t gfp)
 {
 	int nid;
@@ -364,14 +352,7 @@  static struct list_lru_per_memcg *memcg_
 
 static void memcg_list_lru_free(struct list_lru *lru, int src_idx)
 {
-	struct list_lru_memcg *mlrus;
-	struct list_lru_per_memcg *mlru;
-
-	spin_lock_irq(&lru->lock);
-	mlrus = rcu_dereference_protected(lru->mlrus, true);
-	mlru = rcu_dereference_protected(mlrus->mlru[src_idx], true);
-	rcu_assign_pointer(mlrus->mlru[src_idx], NULL);
-	spin_unlock_irq(&lru->lock);
+	struct list_lru_per_memcg *mlru = xa_erase_irq(&lru->xa, src_idx);
 
 	/*
 	 * The __list_lru_walk_one() can walk the list of this node.
@@ -383,78 +364,27 @@  static void memcg_list_lru_free(struct l
 		kvfree_rcu(mlru, rcu);
 }
 
-static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
+static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
 {
-	struct list_lru_memcg *mlrus;
-	int size = memcg_nr_cache_ids;
-
+	if (memcg_aware)
+		xa_init_flags(&lru->xa, XA_FLAGS_LOCK_IRQ);
 	lru->memcg_aware = memcg_aware;
-	if (!memcg_aware)
-		return 0;
-
-	spin_lock_init(&lru->lock);
-
-	mlrus = kvzalloc(struct_size(mlrus, mlru, size), GFP_KERNEL);
-	if (!mlrus)
-		return -ENOMEM;
-
-	RCU_INIT_POINTER(lru->mlrus, mlrus);
-
-	return 0;
 }
 
 static void memcg_destroy_list_lru(struct list_lru *lru)
 {
-	struct list_lru_memcg *mlrus;
+	XA_STATE(xas, &lru->xa, 0);
+	struct list_lru_per_memcg *mlru;
 
 	if (!list_lru_memcg_aware(lru))
 		return;
 
-	/*
-	 * This is called when shrinker has already been unregistered,
-	 * and nobody can use it. So, there is no need to use kvfree_rcu().
-	 */
-	mlrus = rcu_dereference_protected(lru->mlrus, true);
-	memcg_destroy_list_lru_range(mlrus, 0, memcg_nr_cache_ids);
-	kvfree(mlrus);
-}
-
-static int memcg_update_list_lru(struct list_lru *lru, int old_size, int new_size)
-{
-	struct list_lru_memcg *old, *new;
-
-	BUG_ON(old_size > new_size);
-
-	old = rcu_dereference_protected(lru->mlrus,
-					lockdep_is_held(&list_lrus_mutex));
-	new = kvmalloc(struct_size(new, mlru, new_size), GFP_KERNEL);
-	if (!new)
-		return -ENOMEM;
-
-	spin_lock_irq(&lru->lock);
-	memcpy(&new->mlru, &old->mlru, flex_array_size(new, mlru, old_size));
-	memset(&new->mlru[old_size], 0, flex_array_size(new, mlru, new_size - old_size));
-	rcu_assign_pointer(lru->mlrus, new);
-	spin_unlock_irq(&lru->lock);
-
-	kvfree_rcu(old, rcu);
-	return 0;
-}
-
-int memcg_update_all_list_lrus(int new_size)
-{
-	int ret = 0;
-	struct list_lru *lru;
-	int old_size = memcg_nr_cache_ids;
-
-	mutex_lock(&list_lrus_mutex);
-	list_for_each_entry(lru, &memcg_list_lrus, list) {
-		ret = memcg_update_list_lru(lru, old_size, new_size);
-		if (ret)
-			break;
+	xas_lock_irq(&xas);
+	xas_for_each(&xas, mlru, ULONG_MAX) {
+		kfree(mlru);
+		xas_store(&xas, NULL);
 	}
-	mutex_unlock(&list_lrus_mutex);
-	return ret;
+	xas_unlock_irq(&xas);
 }
 
 static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid,
@@ -521,7 +451,7 @@  void memcg_reparent_list_lrus(struct mem
 		struct mem_cgroup *child;
 
 		child = mem_cgroup_from_css(css);
-		child->kmemcg_id = parent->kmemcg_id;
+		WRITE_ONCE(child->kmemcg_id, parent->kmemcg_id);
 	}
 	rcu_read_unlock();
 
@@ -531,21 +461,12 @@  void memcg_reparent_list_lrus(struct mem
 	mutex_unlock(&list_lrus_mutex);
 }
 
-static bool memcg_list_lru_allocated(struct mem_cgroup *memcg,
-				     struct list_lru *lru)
+static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg,
+					    struct list_lru *lru)
 {
-	bool allocated;
-	int idx;
-
-	idx = memcg->kmemcg_id;
-	if (unlikely(idx < 0))
-		return true;
+	int idx = memcg->kmemcg_id;
 
-	rcu_read_lock();
-	allocated = !!rcu_access_pointer(rcu_dereference(lru->mlrus)->mlru[idx]);
-	rcu_read_unlock();
-
-	return allocated;
+	return idx < 0 || xa_load(&lru->xa, idx);
 }
 
 int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
@@ -553,11 +474,11 @@  int memcg_list_lru_alloc(struct mem_cgro
 {
 	int i;
 	unsigned long flags;
-	struct list_lru_memcg *mlrus;
 	struct list_lru_memcg_table {
 		struct list_lru_per_memcg *mlru;
 		struct mem_cgroup *memcg;
 	} *table;
+	XA_STATE(xas, &lru->xa, 0);
 
 	if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru))
 		return 0;
@@ -586,27 +507,48 @@  int memcg_list_lru_alloc(struct mem_cgro
 		}
 	}
 
-	spin_lock_irqsave(&lru->lock, flags);
-	mlrus = rcu_dereference_protected(lru->mlrus, true);
+	xas_lock_irqsave(&xas, flags);
 	while (i--) {
-		int index = table[i].memcg->kmemcg_id;
+		int index = READ_ONCE(table[i].memcg->kmemcg_id);
 		struct list_lru_per_memcg *mlru = table[i].mlru;
 
-		if (index < 0 || rcu_dereference_protected(mlrus->mlru[index], true))
+		xas_set(&xas, index);
+retry:
+		if (unlikely(index < 0 || xas_error(&xas) || xas_load(&xas))) {
 			kfree(mlru);
-		else
-			rcu_assign_pointer(mlrus->mlru[index], mlru);
+		} else {
+			xas_store(&xas, mlru);
+			if (xas_error(&xas) == -ENOMEM) {
+				xas_unlock_irqrestore(&xas, flags);
+				if (xas_nomem(&xas, gfp))
+					xas_set_err(&xas, 0);
+				xas_lock_irqsave(&xas, flags);
+				/*
+				 * The xas lock has been released, this memcg
+				 * can be reparented before us. So reload
+				 * memcg id. More details see the comments
+				 * in memcg_reparent_list_lrus().
+				 */
+				index = READ_ONCE(table[i].memcg->kmemcg_id);
+				if (index < 0)
+					xas_set_err(&xas, 0);
+				else if (!xas_error(&xas) && index != xas.xa_index)
+					xas_set(&xas, index);
+				goto retry;
+			}
+		}
 	}
-	spin_unlock_irqrestore(&lru->lock, flags);
-
+	/* xas_nomem() is used to free memory instead of memory allocation. */
+	if (xas.xa_alloc)
+		xas_nomem(&xas, gfp);
+	xas_unlock_irqrestore(&xas, flags);
 	kfree(table);
 
-	return 0;
+	return xas_error(&xas);
 }
 #else
-static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
+static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
 {
-	return 0;
 }
 
 static void memcg_destroy_list_lru(struct list_lru *lru)
@@ -618,7 +560,6 @@  int __list_lru_init(struct list_lru *lru
 		    struct lock_class_key *key, struct shrinker *shrinker)
 {
 	int i;
-	int err = -ENOMEM;
 
 #ifdef CONFIG_MEMCG_KMEM
 	if (shrinker)
@@ -626,11 +567,10 @@  int __list_lru_init(struct list_lru *lru
 	else
 		lru->shrinker_id = -1;
 #endif
-	memcg_get_cache_ids();
 
 	lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL);
 	if (!lru->node)
-		goto out;
+		return -ENOMEM;
 
 	for_each_node(i) {
 		spin_lock_init(&lru->node[i].lock);
@@ -639,18 +579,10 @@  int __list_lru_init(struct list_lru *lru
 		init_one_lru(&lru->node[i].lru);
 	}
 
-	err = memcg_init_list_lru(lru, memcg_aware);
-	if (err) {
-		kfree(lru->node);
-		/* Do this so a list_lru_destroy() doesn't crash: */
-		lru->node = NULL;
-		goto out;
-	}
-
+	memcg_init_list_lru(lru, memcg_aware);
 	list_lru_register(lru);
-out:
-	memcg_put_cache_ids();
-	return err;
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(__list_lru_init);
 
@@ -660,8 +592,6 @@  void list_lru_destroy(struct list_lru *l
 	if (!lru->node)
 		return;
 
-	memcg_get_cache_ids();
-
 	list_lru_unregister(lru);
 
 	memcg_destroy_list_lru(lru);
@@ -671,6 +601,5 @@  void list_lru_destroy(struct list_lru *l
 #ifdef CONFIG_MEMCG_KMEM
 	lru->shrinker_id = -1;
 #endif
-	memcg_put_cache_ids();
 }
 EXPORT_SYMBOL_GPL(list_lru_destroy);
--- a/mm/memcontrol.c~mm-list_lru-replace-linear-array-with-xarray
+++ a/mm/memcontrol.c
@@ -351,42 +351,17 @@  static void memcg_reparent_objcgs(struct
  * This will be used as a shrinker list's index.
  * The main reason for not using cgroup id for this:
  *  this works better in sparse environments, where we have a lot of memcgs,
- *  but only a few kmem-limited. Or also, if we have, for instance, 200
- *  memcgs, and none but the 200th is kmem-limited, we'd have to have a
- *  200 entry array for that.
- *
- * The current size of the caches array is stored in memcg_nr_cache_ids. It
- * will double each time we have to increase it.
+ *  but only a few kmem-limited.
  */
 static DEFINE_IDA(memcg_cache_ida);
-int memcg_nr_cache_ids;
-
-/* Protects memcg_nr_cache_ids */
-static DECLARE_RWSEM(memcg_cache_ids_sem);
-
-void memcg_get_cache_ids(void)
-{
-	down_read(&memcg_cache_ids_sem);
-}
-
-void memcg_put_cache_ids(void)
-{
-	up_read(&memcg_cache_ids_sem);
-}
 
 /*
- * MIN_SIZE is different than 1, because we would like to avoid going through
- * the alloc/free process all the time. In a small machine, 4 kmem-limited
- * cgroups is a reasonable guess. In the future, it could be a parameter or
- * tunable, but that is strictly not necessary.
- *
  * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
  * this constant directly from cgroup, but it is understandable that this is
  * better kept as an internal representation in cgroup.c. In any case, the
  * cgrp_id space is not getting any smaller, and we don't have to necessarily
  * increase ours as well if it increases.
  */
-#define MEMCG_CACHES_MIN_SIZE 4
 #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
 
 /*
@@ -2944,49 +2919,6 @@  __always_inline struct obj_cgroup *get_o
 	return objcg;
 }
 
-static int memcg_alloc_cache_id(void)
-{
-	int id, size;
-	int err;
-
-	id = ida_simple_get(&memcg_cache_ida,
-			    0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
-	if (id < 0)
-		return id;
-
-	if (id < memcg_nr_cache_ids)
-		return id;
-
-	/*
-	 * There's no space for the new id in memcg_caches arrays,
-	 * so we have to grow them.
-	 */
-	down_write(&memcg_cache_ids_sem);
-
-	size = 2 * (id + 1);
-	if (size < MEMCG_CACHES_MIN_SIZE)
-		size = MEMCG_CACHES_MIN_SIZE;
-	else if (size > MEMCG_CACHES_MAX_SIZE)
-		size = MEMCG_CACHES_MAX_SIZE;
-
-	err = memcg_update_all_list_lrus(size);
-	if (!err)
-		memcg_nr_cache_ids = size;
-
-	up_write(&memcg_cache_ids_sem);
-
-	if (err) {
-		ida_simple_remove(&memcg_cache_ida, id);
-		return err;
-	}
-	return id;
-}
-
-static void memcg_free_cache_id(int id)
-{
-	ida_simple_remove(&memcg_cache_ida, id);
-}
-
 static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
 {
 	mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
@@ -3673,13 +3605,14 @@  static int memcg_online_kmem(struct mem_
 	if (unlikely(mem_cgroup_is_root(memcg)))
 		return 0;
 
-	memcg_id = memcg_alloc_cache_id();
+	memcg_id = ida_alloc_max(&memcg_cache_ida, MEMCG_CACHES_MAX_SIZE - 1,
+				 GFP_KERNEL);
 	if (memcg_id < 0)
 		return memcg_id;
 
 	objcg = obj_cgroup_alloc();
 	if (!objcg) {
-		memcg_free_cache_id(memcg_id);
+		ida_free(&memcg_cache_ida, memcg_id);
 		return -ENOMEM;
 	}
 	objcg->memcg = memcg;
@@ -3723,7 +3656,7 @@  static void memcg_offline_kmem(struct me
 	 */
 	memcg_reparent_list_lrus(memcg, parent);
 
-	memcg_free_cache_id(kmemcg_id);
+	ida_free(&memcg_cache_ida, kmemcg_id);
 }
 #else
 static int memcg_online_kmem(struct mem_cgroup *memcg)

[057/227] mm: list_lru: replace linear array with xarray

Commit Message

Patch