[v6,5/9] mm: multigenerational lru: mm_struct list

Message ID	20220104202227.2903605-6-yuzhao@google.com (mailing list archive)
State	New, archived
Headers	show Return-Path: <linux-arm-kernel-bounces+linux-arm-kernel=archiver.kernel.org@lists.infradead.org> Date: Tue, 4 Jan 2022 13:22:24 -0700 In-Reply-To: <20220104202227.2903605-1-yuzhao@google.com> Message-Id: <20220104202227.2903605-6-yuzhao@google.com> Mime-Version: 1.0 References: <20220104202227.2903605-1-yuzhao@google.com> Subject: [PATCH v6 5/9] mm: multigenerational lru: mm_struct list From: Yu Zhao <yuzhao@google.com> To: Andrew Morton <akpm@linux-foundation.org>, Linus Torvalds <torvalds@linux-foundation.org> Cc: Andi Kleen <ak@linux.intel.com>, Catalin Marinas <catalin.marinas@arm.com>, Dave Hansen <dave.hansen@linux.intel.com>, Hillf Danton <hdanton@sina.com>, Jens Axboe <axboe@kernel.dk>, Jesse Barnes <jsbarnes@google.com>, Johannes Weiner <hannes@cmpxchg.org>, Jonathan Corbet <corbet@lwn.net>, Matthew Wilcox <willy@infradead.org>, Mel Gorman <mgorman@suse.de>, Michael Larabel <Michael@michaellarabel.com>, Michal Hocko <mhocko@kernel.org>, Rik van Riel <riel@surriel.com>, Vlastimil Babka <vbabka@suse.cz>, Will Deacon <will@kernel.org>, Ying Huang <ying.huang@intel.com>, linux-arm-kernel@lists.infradead.org, linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org, page-reclaim@google.com, x86@kernel.org, Yu Zhao <yuzhao@google.com>, Konstantin Kharlamov <Hi-Angel@yandex.ru> Precedence: list Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Sender: "linux-arm-kernel" <linux-arm-kernel-bounces@lists.infradead.org> Errors-To: linux-arm-kernel-bounces+linux-arm-kernel=archiver.kernel.org@lists.infradead.org
Series	Multigenerational LRU Framework \| expand [v6,0/9] Multigenerational LRU Framework [v6,1/9] mm: x86, arm64: add arch_has_hw_pte_young() [v6,2/9] mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG [v6,3/9] mm/vmscan.c: refactor shrink_node() [v6,4/9] mm: multigenerational lru: groundwork [v6,5/9] mm: multigenerational lru: mm_struct list [v6,6/9] mm: multigenerational lru: aging [v6,7/9] mm: multigenerational lru: eviction [v6,8/9] mm: multigenerational lru: user interface [v6,9/9] mm: multigenerational lru: Kconfig

diff --git a/fs/exec.c b/fs/exec.c index 537d92c41105..308aa88ca15f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1005,6 +1005,7 @@ static int exec_mmap(struct mm_struct *mm) active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; + lru_gen_add_mm(mm); /* * This prevents preemption while active_mm is being loaded and * it and mm are being updated, which could cause problems for @@ -1015,6 +1016,7 @@ static int exec_mmap(struct mm_struct *mm) if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); activate_mm(active_mm, mm); + lru_gen_use_mm(mm); if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); tsk->mm->vmacache_seqnum = 0; diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0c5c403f4be6..aba18cd101db 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -340,6 +340,11 @@ struct mem_cgroup { struct deferred_split deferred_split_queue; #endif +#ifdef CONFIG_LRU_GEN + /* per-memcg mm_struct list */ + struct lru_gen_mm_list mm_list; +#endif + struct mem_cgroup_per_node *nodeinfo[]; }; diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 5f239f67f36b..717a2290acb3 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -110,6 +110,11 @@ static inline int lru_gen_from_seq(unsigned long seq) return seq % MAX_NR_GENS; } +static inline int lru_hist_from_seq(unsigned long seq) +{ + return seq % NR_HIST_GENS; +} + static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) { unsigned long max_seq = lruvec->lrugen.max_seq; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c3a6e6209600..bdbd9390adb3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -3,6 +3,7 @@ #define _LINUX_MM_TYPES_H #include <linux/mm_types_task.h> +#include <linux/sched.h> #include <linux/auxvec.h> #include <linux/list.h> @@ -16,6 +17,8 @@ #include <linux/page-flags-layout.h> #include <linux/workqueue.h> #include <linux/seqlock.h> +#include <linux/nodemask.h> +#include <linux/mmdebug.h> #include <asm/mmu.h> @@ -646,6 +649,22 @@ struct mm_struct { #ifdef CONFIG_IOMMU_SUPPORT u32 pasid; #endif +#ifdef CONFIG_LRU_GEN + struct { + /* this mm_struct is on lru_gen_mm_list */ + struct list_head list; +#ifdef CONFIG_MEMCG + /* points to the memcg of "owner" above */ + struct mem_cgroup *memcg; +#endif + /* + * Set when switching to this mm_struct, as a hint of + * whether it has been used since the last time per-node + * page table walkers cleared the corresponding bits. + */ + nodemask_t nodes; + } lru_gen; +#endif /* CONFIG_LRU_GEN */ } __randomize_layout; /* @@ -672,6 +691,65 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) return (struct cpumask *)&mm->cpu_bitmap; } +#ifdef CONFIG_LRU_GEN + +struct lru_gen_mm_list { + /* mm_struct list for page table walkers */ + struct list_head fifo; + /* protects the list above */ + spinlock_t lock; +}; + +void lru_gen_add_mm(struct mm_struct *mm); +void lru_gen_del_mm(struct mm_struct *mm); +#ifdef CONFIG_MEMCG +void lru_gen_migrate_mm(struct mm_struct *mm); +#endif + +static inline void lru_gen_init_mm(struct mm_struct *mm) +{ + INIT_LIST_HEAD(&mm->lru_gen.list); +#ifdef CONFIG_MEMCG + mm->lru_gen.memcg = NULL; +#endif + nodes_clear(mm->lru_gen.nodes); +} + +static inline void lru_gen_use_mm(struct mm_struct *mm) +{ + /* unlikely but not a bug when racing with lru_gen_migrate_mm() */ + VM_WARN_ON(list_empty(&mm->lru_gen.list)); + + if (!(current->flags & PF_KTHREAD) && !nodes_full(mm->lru_gen.nodes)) + nodes_setall(mm->lru_gen.nodes); +} + +#else /* !CONFIG_LRU_GEN */ + +static inline void lru_gen_add_mm(struct mm_struct *mm) +{ +} + +static inline void lru_gen_del_mm(struct mm_struct *mm) +{ +} + +#ifdef CONFIG_MEMCG +static inline void lru_gen_migrate_mm(struct mm_struct *mm) +{ +} +#endif + +static inline void lru_gen_init_mm(struct mm_struct *mm) +{ +} + +static inline void lru_gen_use_mm(struct mm_struct *mm) +{ +} + +#endif /* CONFIG_LRU_GEN */ + struct mmu_gather; extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 371c7210d510..5b9bc2532c5b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -335,6 +335,13 @@ struct lruvec; #define MIN_NR_GENS 2U #define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS) +/* whether to keep historical stats for evicted generations */ +#ifdef CONFIG_LRU_GEN_STATS +#define NR_HIST_GENS ((unsigned int)CONFIG_NR_LRU_GENS) +#else +#define NR_HIST_GENS 1U +#endif + struct lru_gen_struct { /* the aging increments the youngest generation number */ unsigned long max_seq; @@ -350,6 +357,58 @@ struct lru_gen_struct { bool enabled; }; +enum { + MM_PTE_TOTAL, /* total leaf entries */ + MM_PTE_OLD, /* old leaf entries */ + MM_PTE_YOUNG, /* young leaf entries */ + MM_PMD_TOTAL, /* total non-leaf entries */ + MM_PMD_FOUND, /* non-leaf entries found in Bloom filters */ + MM_PMD_ADDED, /* non-leaf entries added to Bloom filters */ + NR_MM_STATS +}; + +/* mnemonic codes for the mm stats above */ +#define MM_STAT_CODES "toydfa" + +/* double-buffering Bloom filters */ +#define NR_BLOOM_FILTERS 2 + +struct lru_gen_mm_state { + /* set to max_seq after each iteration */ + unsigned long seq; + /* where the current iteration starts (inclusive) */ + struct list_head *head; + /* where the last iteration ends (exclusive) */ + struct list_head *tail; + /* to wait for the last page table walker to finish */ + struct wait_queue_head wait; + /* Bloom filters flip after each iteration */ + unsigned long *filters[NR_BLOOM_FILTERS]; + /* the mm stats for debugging */ + unsigned long stats[NR_HIST_GENS][NR_MM_STATS]; + /* the number of concurrent page table walkers */ + int nr_walkers; +}; + +struct lru_gen_mm_walk { + /* the lruvec under reclaim */ + struct lruvec *lruvec; + /* unstable max_seq from lru_gen_struct */ + unsigned long max_seq; + /* the next address within an mm to scan */ + unsigned long next_addr; + /* to batch page table entries */ + unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)]; + /* to batch promoted pages */ + int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* to batch the mm stats */ + int mm_stats[NR_MM_STATS]; + /* total batched items */ + int batched; + bool can_swap; + bool full_scan; +}; + void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec); #ifdef CONFIG_MEMCG @@ -395,6 +454,8 @@ struct lruvec { #ifdef CONFIG_LRU_GEN /* evictable pages divided into generations */ struct lru_gen_struct lrugen; + /* to concurrently iterate lru_gen_mm_list */ + struct lru_gen_mm_state mm_state; #endif #ifdef CONFIG_MEMCG struct pglist_data *pgdat; diff --git a/kernel/exit.c b/kernel/exit.c index f702a6a63686..f8bf605c9ba5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -463,6 +463,7 @@ void mm_update_next_owner(struct mm_struct *mm) goto retry; } WRITE_ONCE(mm->owner, c); + lru_gen_migrate_mm(mm); task_unlock(c); put_task_struct(c); } diff --git a/kernel/fork.c b/kernel/fork.c index 3244cc56b697..be1b58bf11bb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1078,6 +1078,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, goto fail_nocontext; mm->user_ns = get_user_ns(user_ns); + lru_gen_init_mm(mm); return mm; fail_nocontext: @@ -1120,6 +1121,7 @@ static inline void __mmput(struct mm_struct *mm) } if (mm->binfmt) module_put(mm->binfmt->module); + lru_gen_del_mm(mm); mmdrop(mm); } @@ -2603,6 +2605,13 @@ pid_t kernel_clone(struct kernel_clone_args *args) get_task_struct(p); } + if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) { + /* lock the task to synchronize with memcg migration */ + task_lock(p); + lru_gen_add_mm(p->mm); + task_unlock(p); + } + wake_up_new_task(p); /* forking complete and child started to run, tell ptracer */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 77563109c0ea..268b869d326e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4956,6 +4956,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * finish_task_switch()'s mmdrop(). */ switch_mm_irqs_off(prev->active_mm, next->mm, next); + lru_gen_use_mm(next->mm); if (!prev->mm) { // from kernel /* will mmdrop() in finish_task_switch(). */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a4359a278e31..33576f6814b5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6135,6 +6135,29 @@ static void mem_cgroup_move_task(void) } #endif +#ifdef CONFIG_LRU_GEN +static void mem_cgroup_attach(struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *css; + struct task_struct *task = NULL; + + cgroup_taskset_for_each_leader(task, css, tset) + break; + + if (!task) + return; + + task_lock(task); + if (task->mm && task->mm->owner == task) + lru_gen_migrate_mm(task->mm); + task_unlock(task); +} +#else +static void mem_cgroup_attach(struct cgroup_taskset *tset) +{ +} +#endif /* CONFIG_LRU_GEN */ + static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) { if (value == PAGE_COUNTER_MAX) @@ -6478,6 +6501,7 @@ struct cgroup_subsys memory_cgrp_subsys = { .css_reset = mem_cgroup_css_reset, .css_rstat_flush = mem_cgroup_css_rstat_flush, .can_attach = mem_cgroup_can_attach, + .attach = mem_cgroup_attach, .cancel_attach = mem_cgroup_cancel_attach, .post_attach = mem_cgroup_move_task, .dfl_cftypes = memory_files, diff --git a/mm/vmscan.c b/mm/vmscan.c index 0e487c0ffe17..5eaf22aa446a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3095,6 +3095,342 @@ static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) get_nr_gens(lruvec, 0) <= MAX_NR_GENS; } +/****************************************************************************** + * mm_struct list + ******************************************************************************/ + +static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg) +{ + static struct lru_gen_mm_list mm_list = { + .fifo = LIST_HEAD_INIT(mm_list.fifo), + .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock), + }; + +#ifdef CONFIG_MEMCG + if (memcg) + return &memcg->mm_list; +#endif + return &mm_list; +} + +void lru_gen_add_mm(struct mm_struct *mm) +{ + int nid; + struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + + VM_BUG_ON_MM(!list_empty(&mm->lru_gen.list), mm); +#ifdef CONFIG_MEMCG + VM_BUG_ON_MM(mm->lru_gen.memcg, mm); + mm->lru_gen.memcg = memcg; +#endif + spin_lock(&mm_list->lock); + + list_add_tail(&mm->lru_gen.list, &mm_list->fifo); + + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + + if (!lruvec) + continue; + + if (lruvec->mm_state.tail == &mm_list->fifo) + lruvec->mm_state.tail = lruvec->mm_state.tail->prev; + } + + spin_unlock(&mm_list->lock); +} + +void lru_gen_del_mm(struct mm_struct *mm) +{ + int nid; + struct lru_gen_mm_list *mm_list; + struct mem_cgroup *memcg = NULL; + + if (list_empty(&mm->lru_gen.list)) + return; + +#ifdef CONFIG_MEMCG + memcg = mm->lru_gen.memcg; +#endif + mm_list = get_mm_list(memcg); + + spin_lock(&mm_list->lock); + + for_each_node(nid) { + struct lruvec *lruvec = get_lruvec(memcg, nid); + + if (!lruvec) + continue; + + if (lruvec->mm_state.tail == &mm->lru_gen.list) + lruvec->mm_state.tail = lruvec->mm_state.tail->next; + + if (lruvec->mm_state.head != &mm->lru_gen.list) + continue; + + lruvec->mm_state.head = lruvec->mm_state.head->next; + if (lruvec->mm_state.head == &mm_list->fifo) + WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1); + } + + list_del_init(&mm->lru_gen.list); + + spin_unlock(&mm_list->lock); + +#ifdef CONFIG_MEMCG + mem_cgroup_put(mm->lru_gen.memcg); + mm->lru_gen.memcg = NULL; +#endif +} + +#ifdef CONFIG_MEMCG +void lru_gen_migrate_mm(struct mm_struct *mm) +{ + struct mem_cgroup *memcg; + + lockdep_assert_held(&mm->owner->alloc_lock); + + if (mem_cgroup_disabled()) + return; + + rcu_read_lock(); + memcg = mem_cgroup_from_task(mm->owner); + rcu_read_unlock(); + if (memcg == mm->lru_gen.memcg) + return; + + VM_BUG_ON_MM(!mm->lru_gen.memcg, mm); + VM_BUG_ON_MM(list_empty(&mm->lru_gen.list), mm); + + lru_gen_del_mm(mm); + lru_gen_add_mm(mm); +} +#endif + +/* + * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when + * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of + * bits in a bitmap, k is the number of hash functions and n is the number of + * inserted items. + * + * Page table walkers use one of the two filters to reduce their search space. + * To get rid of non-leaf entries that no longer have enough leaf entries, the + * aging uses the double-buffering technique to flip to the other filter each + * time it creates a new generation. For non-leaf entries that have enough + * leaf entries, the aging carries them over to the next generation in + * walk_pmd_range(); the eviction also report them when walking the rmap + * in lru_gen_look_around(). + * + * For future optimizations: + * 1) It's not necessary to keep both filters all the time. The spare one can be + * freed after the RCU grace period and reallocated if needed again. + * 2) And when reallocating, it's worth scaling its size according to the number + * of inserted entries in the other filter, to reduce the memory overhead on + * small systems and false positives on large systems. + * 3) Jenkins' hash function is an alternative to Knuth's. + */ +#define BLOOM_FILTER_SHIFT 15 + +static inline int filter_gen_from_seq(unsigned long seq) +{ + return seq % NR_BLOOM_FILTERS; +} + +static void get_item_key(void *item, int *key) +{ + u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2); + + BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32)); + + key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1); + key[1] = hash >> BLOOM_FILTER_SHIFT; +} + +static void clear_bloom_filter(struct lruvec *lruvec, unsigned long seq) +{ + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); + + filter = lruvec->mm_state.filters[gen]; + if (filter) { + bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT)); + return; + } + + filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), GFP_ATOMIC); + WRITE_ONCE(lruvec->mm_state.filters[gen], filter); +} + +static void set_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +{ + int key[2]; + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = READ_ONCE(lruvec->mm_state.filters[gen]); + if (!filter) + return; + + get_item_key(item, key); + + if (!test_bit(key[0], filter)) + set_bit(key[0], filter); + if (!test_bit(key[1], filter)) + set_bit(key[1], filter); +} + +static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item) +{ + int key[2]; + unsigned long *filter; + int gen = filter_gen_from_seq(seq); + + filter = READ_ONCE(lruvec->mm_state.filters[gen]); + if (!filter) + return false; + + get_item_key(item, key); + + return test_bit(key[0], filter) && test_bit(key[1], filter); +} + +static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) +{ + int i; + int hist = lru_hist_from_seq(walk->max_seq); + + lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock); + + for (i = 0; i < NR_MM_STATS; i++) { + WRITE_ONCE(lruvec->mm_state.stats[hist][i], + lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]); + walk->mm_stats[i] = 0; + } + + if (NR_HIST_GENS == 1 || !last) + return; + + hist = lru_hist_from_seq(walk->max_seq + 1); + for (i = 0; i < NR_MM_STATS; i++) + WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0); +} + +static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) +{ + int type; + unsigned long size = 0; + struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); + + if (!walk->full_scan && cpumask_empty(mm_cpumask(mm)) && + !node_isset(pgdat->node_id, mm->lru_gen.nodes)) + return true; + + for (type = !walk->can_swap; type < ANON_AND_FILE; type++) { + size += type ? get_mm_counter(mm, MM_FILEPAGES) : + get_mm_counter(mm, MM_ANONPAGES) + + get_mm_counter(mm, MM_SHMEMPAGES); + } + + if (size < MIN_LRU_BATCH) + return true; + + if (mm_is_oom_victim(mm)) + return true; + + if (!mmget_not_zero(mm)) + return true; + + node_clear(pgdat->node_id, mm->lru_gen.nodes); + + return false; +} + +static bool get_next_mm(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, + struct mm_struct **iter) +{ + bool first = false; + bool last = true; + struct mm_struct *mm = NULL; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); + struct lru_gen_mm_state *mm_state = &lruvec->mm_state; + + /* + * There are four interesting cases for this page table walker: + * 1) It tries to start a new iteration of this list with a stale + * max_seq; there is nothing to be done. + * 2) It's the first of the current generation, and it needs to prepare + * the Bloom filter for the next generation. + * 3) It reaches the end of this list, and it needs to increment + * mm_state->seq; the iteration is done. + * 4) It's the last of the current generation, and it needs to clear the + * historical mm stats for the next generation. + */ + if (*iter) + mmput_async(*iter); + else if (walk->max_seq <= READ_ONCE(mm_state->seq)) + return false; + + spin_lock(&mm_list->lock); + + VM_BUG_ON(walk->max_seq > mm_state->seq + 1); + VM_BUG_ON(*iter && walk->max_seq < mm_state->seq); + VM_BUG_ON(*iter && !mm_state->nr_walkers); + + if (walk->max_seq <= mm_state->seq) { + if (!*iter) + last = false; + goto done; + } + + if (mm_state->head == &mm_list->fifo) { + VM_BUG_ON(mm_state->nr_walkers); + mm_state->head = mm_state->head->next; + first = true; + } + + while (!mm && mm_state->head != &mm_list->fifo) { + mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list); + + mm_state->head = mm_state->head->next; + + /* full scan for those added after the last iteration */ + if (mm_state->tail == &mm->lru_gen.list) { + mm_state->tail = mm_state->tail->next; + walk->full_scan = true; + } + + if (should_skip_mm(mm, walk)) + mm = NULL; + } + + if (mm_state->head == &mm_list->fifo) + WRITE_ONCE(mm_state->seq, mm_state->seq + 1); +done: + if (*iter && !mm) + mm_state->nr_walkers--; + if (!*iter && mm) + mm_state->nr_walkers++; + + if (mm_state->nr_walkers) + last = false; + + if (mm && first) + clear_bloom_filter(lruvec, walk->max_seq + 1); + + if (*iter || last) + reset_mm_stats(lruvec, walk, last); + + spin_unlock(&mm_list->lock); + + *iter = mm; + + return last; +} + /****************************************************************************** * state change ******************************************************************************/ @@ -3252,6 +3588,7 @@ void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec) int i; int gen, type, zone; struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_mm_list *mm_list = get_mm_list(memcg); lrugen->max_seq = MIN_NR_GENS + 1; lrugen->enabled = lru_gen_enabled(); @@ -3261,6 +3598,11 @@ void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec) for_each_gen_type_zone(gen, type, zone) INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]); + + lruvec->mm_state.seq = MIN_NR_GENS; + lruvec->mm_state.head = &mm_list->fifo; + lruvec->mm_state.tail = &mm_list->fifo; + init_waitqueue_head(&lruvec->mm_state.wait); } #ifdef CONFIG_MEMCG @@ -3268,6 +3610,9 @@ void lru_gen_init_memcg(struct mem_cgroup *memcg) { int nid; + INIT_LIST_HEAD(&memcg->mm_list.fifo); + spin_lock_init(&memcg->mm_list.lock); + for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); @@ -3280,10 +3625,16 @@ void lru_gen_free_memcg(struct mem_cgroup *memcg) int nid; for_each_node(nid) { + int i; struct lruvec *lruvec = get_lruvec(memcg, nid); VM_BUG_ON(memchr_inv(lruvec->lrugen.nr_pages, 0, sizeof(lruvec->lrugen.nr_pages))); + + for (i = 0; i < NR_BLOOM_FILTERS; i++) { + bitmap_free(lruvec->mm_state.filters[i]); + lruvec->mm_state.filters[i] = NULL; + } } } #endif @@ -3292,6 +3643,7 @@ static int __init init_lru_gen(void) { BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); + BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1); return 0; };

[v6,5/9] mm: multigenerational lru: mm_struct list

Commit Message

Comments

Patch