@@ -1005,6 +1005,7 @@ static int exec_mmap(struct mm_struct *mm)
active_mm = tsk->active_mm;
tsk->active_mm = mm;
tsk->mm = mm;
+ lru_gen_add_mm(mm);
/*
* This prevents preemption while active_mm is being loaded and
* it and mm are being updated, which could cause problems for
@@ -1015,6 +1016,7 @@ static int exec_mmap(struct mm_struct *mm)
if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
local_irq_enable();
activate_mm(active_mm, mm);
+ lru_gen_switch_mm(active_mm, mm);
if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
local_irq_enable();
tsk->mm->vmacache_seqnum = 0;
@@ -230,6 +230,8 @@ struct obj_cgroup {
};
};
+struct lru_gen_mm_list;
+
/*
* The memory controller data structure. The memory controller controls both
* page cache and RSS per cgroup. We would eventually like to provide
@@ -349,6 +351,10 @@ struct mem_cgroup {
struct deferred_split deferred_split_queue;
#endif
+#ifdef CONFIG_LRU_GEN
+ struct lru_gen_mm_list *mm_list;
+#endif
+
struct mem_cgroup_per_node *nodeinfo[];
};
@@ -15,6 +15,8 @@
#include <linux/page-flags-layout.h>
#include <linux/workqueue.h>
#include <linux/seqlock.h>
+#include <linux/nodemask.h>
+#include <linux/mmdebug.h>
#include <asm/mmu.h>
@@ -571,6 +573,22 @@ struct mm_struct {
#ifdef CONFIG_IOMMU_SUPPORT
u32 pasid;
+#endif
+#ifdef CONFIG_LRU_GEN
+ struct {
+ /* the node of a global or per-memcg mm_struct list */
+ struct list_head list;
+#ifdef CONFIG_MEMCG
+ /* points to the memcg of the owner task above */
+ struct mem_cgroup *memcg;
+#endif
+ /* whether this mm_struct has been used since the last walk */
+ nodemask_t nodes;
+#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+ /* the number of CPUs using this mm_struct */
+ atomic_t nr_cpus;
+#endif
+ } lrugen;
#endif
} __randomize_layout;
@@ -598,6 +616,95 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
return (struct cpumask *)&mm->cpu_bitmap;
}
+#ifdef CONFIG_LRU_GEN
+
+void lru_gen_init_mm(struct mm_struct *mm);
+void lru_gen_add_mm(struct mm_struct *mm);
+void lru_gen_del_mm(struct mm_struct *mm);
+#ifdef CONFIG_MEMCG
+int lru_gen_alloc_mm_list(struct mem_cgroup *memcg);
+void lru_gen_free_mm_list(struct mem_cgroup *memcg);
+void lru_gen_migrate_mm(struct mm_struct *mm);
+#endif
+
+/* Track the usage of each mm_struct so that we can skip inactive ones. */
+static inline void lru_gen_switch_mm(struct mm_struct *old, struct mm_struct *new)
+{
+ /* exclude init_mm, efi_mm, etc. */
+ if (!core_kernel_data((unsigned long)old)) {
+ VM_BUG_ON(old == &init_mm);
+
+ nodes_setall(old->lrugen.nodes);
+#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+ atomic_dec(&old->lrugen.nr_cpus);
+ VM_BUG_ON_MM(atomic_read(&old->lrugen.nr_cpus) < 0, old);
+#endif
+ } else
+ VM_BUG_ON_MM(READ_ONCE(old->lrugen.list.prev) ||
+ READ_ONCE(old->lrugen.list.next), old);
+
+ if (!core_kernel_data((unsigned long)new)) {
+ VM_BUG_ON(new == &init_mm);
+
+#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+ atomic_inc(&new->lrugen.nr_cpus);
+ VM_BUG_ON_MM(atomic_read(&new->lrugen.nr_cpus) < 0, new);
+#endif
+ } else
+ VM_BUG_ON_MM(READ_ONCE(new->lrugen.list.prev) ||
+ READ_ONCE(new->lrugen.list.next), new);
+}
+
+/* Return whether this mm_struct is being used on any CPUs. */
+static inline bool lru_gen_mm_is_active(struct mm_struct *mm)
+{
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+ return !cpumask_empty(mm_cpumask(mm));
+#else
+ return atomic_read(&mm->lrugen.nr_cpus);
+#endif
+}
+
+#else /* CONFIG_LRU_GEN */
+
+static inline void lru_gen_init_mm(struct mm_struct *mm)
+{
+}
+
+static inline void lru_gen_add_mm(struct mm_struct *mm)
+{
+}
+
+static inline void lru_gen_del_mm(struct mm_struct *mm)
+{
+}
+
+#ifdef CONFIG_MEMCG
+static inline int lru_gen_alloc_mm_list(struct mem_cgroup *memcg)
+{
+ return 0;
+}
+
+static inline void lru_gen_free_mm_list(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_migrate_mm(struct mm_struct *mm)
+{
+}
+#endif
+
+static inline void lru_gen_switch_mm(struct mm_struct *old, struct mm_struct *new)
+{
+}
+
+static inline bool lru_gen_mm_is_active(struct mm_struct *mm)
+{
+ return false;
+}
+
+#endif /* CONFIG_LRU_GEN */
+
struct mmu_gather;
extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
@@ -422,6 +422,7 @@ void mm_update_next_owner(struct mm_struct *mm)
goto retry;
}
WRITE_ONCE(mm->owner, c);
+ lru_gen_migrate_mm(mm);
task_unlock(c);
put_task_struct(c);
}
@@ -669,6 +669,7 @@ static void check_mm(struct mm_struct *mm)
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
#endif
+ VM_BUG_ON_MM(lru_gen_mm_is_active(mm), mm);
}
#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
@@ -1066,6 +1067,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
goto fail_nocontext;
mm->user_ns = get_user_ns(user_ns);
+ lru_gen_init_mm(mm);
return mm;
fail_nocontext:
@@ -1108,6 +1110,7 @@ static inline void __mmput(struct mm_struct *mm)
}
if (mm->binfmt)
module_put(mm->binfmt->module);
+ lru_gen_del_mm(mm);
mmdrop(mm);
}
@@ -2530,6 +2533,13 @@ pid_t kernel_clone(struct kernel_clone_args *args)
get_task_struct(p);
}
+ if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
+ /* lock the task to synchronize with memcg migration */
+ task_lock(p);
+ lru_gen_add_mm(p->mm);
+ task_unlock(p);
+ }
+
wake_up_new_task(p);
/* forking complete and child started to run, tell ptracer */
@@ -1361,6 +1361,7 @@ void kthread_use_mm(struct mm_struct *mm)
tsk->mm = mm;
membarrier_update_current_mm(mm);
switch_mm_irqs_off(active_mm, mm, tsk);
+ lru_gen_switch_mm(active_mm, mm);
local_irq_enable();
task_unlock(tsk);
#ifdef finish_arch_post_lock_switch
@@ -4665,6 +4665,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
* finish_task_switch()'s mmdrop().
*/
switch_mm_irqs_off(prev->active_mm, next->mm, next);
+ lru_gen_switch_mm(prev->active_mm, next->mm);
if (!prev->mm) { // from kernel
/* will mmdrop() in finish_task_switch(). */
@@ -8391,6 +8392,7 @@ void idle_task_exit(void)
if (mm != &init_mm) {
switch_mm(mm, &init_mm, current);
+ lru_gen_switch_mm(mm, &init_mm);
finish_arch_post_lock_switch();
}
@@ -5172,6 +5172,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->vmstats_percpu);
+ lru_gen_free_mm_list(memcg);
kfree(memcg);
}
@@ -5221,6 +5222,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
if (alloc_mem_cgroup_per_node_info(memcg, node))
goto fail;
+ if (lru_gen_alloc_mm_list(memcg))
+ goto fail;
+
if (memcg_wb_domain_init(memcg, GFP_KERNEL))
goto fail;
@@ -6182,6 +6186,29 @@ static void mem_cgroup_move_task(void)
}
#endif
+#ifdef CONFIG_LRU_GEN
+static void mem_cgroup_attach(struct cgroup_taskset *tset)
+{
+ struct cgroup_subsys_state *css;
+ struct task_struct *task = NULL;
+
+ cgroup_taskset_for_each_leader(task, css, tset)
+ break;
+
+ if (!task)
+ return;
+
+ task_lock(task);
+ if (task->mm && task->mm->owner == task)
+ lru_gen_migrate_mm(task->mm);
+ task_unlock(task);
+}
+#else
+static void mem_cgroup_attach(struct cgroup_taskset *tset)
+{
+}
+#endif
+
static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
{
if (value == PAGE_COUNTER_MAX)
@@ -6523,6 +6550,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
.css_reset = mem_cgroup_css_reset,
.css_rstat_flush = mem_cgroup_css_rstat_flush,
.can_attach = mem_cgroup_can_attach,
+ .attach = mem_cgroup_attach,
.cancel_attach = mem_cgroup_cancel_attach,
.post_attach = mem_cgroup_move_task,
.dfl_cftypes = memory_files,
@@ -2902,6 +2902,312 @@ static bool positive_ctrl_err(struct controller_pos *sp, struct controller_pos *
sp->refaulted * max(pv->total, 1UL) * pv->gain;
}
+/******************************************************************************
+ * mm_struct list
+ ******************************************************************************/
+
+enum {
+ MM_SCHED_ACTIVE, /* running processes */
+ MM_SCHED_INACTIVE, /* sleeping processes */
+ MM_LOCK_CONTENTION, /* lock contentions */
+ MM_VMA_INTERVAL, /* VMAs within the range of each PUD/PMD/PTE */
+ MM_LEAF_OTHER_NODE, /* entries not from the node under reclaim */
+ MM_LEAF_OTHER_MEMCG, /* entries not from the memcg under reclaim */
+ MM_LEAF_OLD, /* old entries */
+ MM_LEAF_YOUNG, /* young entries */
+ MM_LEAF_DIRTY, /* dirty entries */
+ MM_LEAF_HOLE, /* non-present entries */
+ MM_NONLEAF_OLD, /* old non-leaf PMD entries */
+ MM_NONLEAF_YOUNG, /* young non-leaf PMD entries */
+ NR_MM_STATS
+};
+
+/* mnemonic codes for the stats above */
+#define MM_STAT_CODES "aicvnmoydhlu"
+
+struct lru_gen_mm_list {
+ /* the head of a global or per-memcg mm_struct list */
+ struct list_head head;
+ /* protects the list */
+ spinlock_t lock;
+ struct {
+ /* set to max_seq after each round of walk */
+ unsigned long cur_seq;
+ /* the next mm_struct on the list to walk */
+ struct list_head *iter;
+ /* to wait for the last walker to finish */
+ struct wait_queue_head wait;
+ /* the number of concurrent walkers */
+ int nr_walkers;
+ /* stats for debugging */
+ unsigned long stats[NR_STAT_GENS][NR_MM_STATS];
+ } nodes[0];
+};
+
+static struct lru_gen_mm_list *global_mm_list;
+
+static struct lru_gen_mm_list *alloc_mm_list(void)
+{
+ int nid;
+ struct lru_gen_mm_list *mm_list;
+
+ mm_list = kvzalloc(struct_size(mm_list, nodes, nr_node_ids), GFP_KERNEL);
+ if (!mm_list)
+ return NULL;
+
+ INIT_LIST_HEAD(&mm_list->head);
+ spin_lock_init(&mm_list->lock);
+
+ for_each_node(nid) {
+ mm_list->nodes[nid].cur_seq = MIN_NR_GENS;
+ mm_list->nodes[nid].iter = &mm_list->head;
+ init_waitqueue_head(&mm_list->nodes[nid].wait);
+ }
+
+ return mm_list;
+}
+
+static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
+{
+#ifdef CONFIG_MEMCG
+ if (!mem_cgroup_disabled())
+ return memcg ? memcg->mm_list : root_mem_cgroup->mm_list;
+#endif
+ VM_BUG_ON(memcg);
+
+ return global_mm_list;
+}
+
+void lru_gen_init_mm(struct mm_struct *mm)
+{
+ INIT_LIST_HEAD(&mm->lrugen.list);
+#ifdef CONFIG_MEMCG
+ mm->lrugen.memcg = NULL;
+#endif
+#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+ atomic_set(&mm->lrugen.nr_cpus, 0);
+#endif
+ nodes_clear(mm->lrugen.nodes);
+}
+
+void lru_gen_add_mm(struct mm_struct *mm)
+{
+ struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+
+ VM_BUG_ON_MM(!list_empty(&mm->lrugen.list), mm);
+#ifdef CONFIG_MEMCG
+ VM_BUG_ON_MM(mm->lrugen.memcg, mm);
+ WRITE_ONCE(mm->lrugen.memcg, memcg);
+#endif
+ spin_lock(&mm_list->lock);
+ list_add_tail(&mm->lrugen.list, &mm_list->head);
+ spin_unlock(&mm_list->lock);
+}
+
+void lru_gen_del_mm(struct mm_struct *mm)
+{
+ int nid;
+#ifdef CONFIG_MEMCG
+ struct lru_gen_mm_list *mm_list = get_mm_list(mm->lrugen.memcg);
+#else
+ struct lru_gen_mm_list *mm_list = get_mm_list(NULL);
+#endif
+
+ spin_lock(&mm_list->lock);
+
+ for_each_node(nid) {
+ if (mm_list->nodes[nid].iter != &mm->lrugen.list)
+ continue;
+
+ mm_list->nodes[nid].iter = mm_list->nodes[nid].iter->next;
+ if (mm_list->nodes[nid].iter == &mm_list->head)
+ WRITE_ONCE(mm_list->nodes[nid].cur_seq,
+ mm_list->nodes[nid].cur_seq + 1);
+ }
+
+ list_del_init(&mm->lrugen.list);
+
+ spin_unlock(&mm_list->lock);
+
+#ifdef CONFIG_MEMCG
+ mem_cgroup_put(mm->lrugen.memcg);
+ WRITE_ONCE(mm->lrugen.memcg, NULL);
+#endif
+}
+
+#ifdef CONFIG_MEMCG
+int lru_gen_alloc_mm_list(struct mem_cgroup *memcg)
+{
+ if (mem_cgroup_disabled())
+ return 0;
+
+ memcg->mm_list = alloc_mm_list();
+
+ return memcg->mm_list ? 0 : -ENOMEM;
+}
+
+void lru_gen_free_mm_list(struct mem_cgroup *memcg)
+{
+ kvfree(memcg->mm_list);
+ memcg->mm_list = NULL;
+}
+
+void lru_gen_migrate_mm(struct mm_struct *mm)
+{
+ struct mem_cgroup *memcg;
+
+ lockdep_assert_held(&mm->owner->alloc_lock);
+
+ if (mem_cgroup_disabled())
+ return;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(mm->owner);
+ rcu_read_unlock();
+ if (memcg == mm->lrugen.memcg)
+ return;
+
+ VM_BUG_ON_MM(!mm->lrugen.memcg, mm);
+ VM_BUG_ON_MM(list_empty(&mm->lrugen.list), mm);
+
+ lru_gen_del_mm(mm);
+ lru_gen_add_mm(mm);
+}
+
+static bool mm_is_migrated(struct mm_struct *mm, struct mem_cgroup *memcg)
+{
+ return READ_ONCE(mm->lrugen.memcg) != memcg;
+}
+#else
+static bool mm_is_migrated(struct mm_struct *mm, struct mem_cgroup *memcg)
+{
+ return false;
+}
+#endif
+
+struct mm_walk_args {
+ struct mem_cgroup *memcg;
+ unsigned long max_seq;
+ unsigned long start_pfn;
+ unsigned long end_pfn;
+ unsigned long next_addr;
+ int node_id;
+ int swappiness;
+ int batch_size;
+ int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+ int mm_stats[NR_MM_STATS];
+ unsigned long bitmap[0];
+};
+
+static void reset_mm_stats(struct lru_gen_mm_list *mm_list, bool last,
+ struct mm_walk_args *args)
+{
+ int i;
+ int nid = args->node_id;
+ int hist = lru_hist_from_seq(args->max_seq);
+
+ lockdep_assert_held(&mm_list->lock);
+
+ for (i = 0; i < NR_MM_STATS; i++) {
+ WRITE_ONCE(mm_list->nodes[nid].stats[hist][i],
+ mm_list->nodes[nid].stats[hist][i] + args->mm_stats[i]);
+ args->mm_stats[i] = 0;
+ }
+
+ if (!last || NR_STAT_GENS == 1)
+ return;
+
+ hist = lru_hist_from_seq(args->max_seq + 1);
+ for (i = 0; i < NR_MM_STATS; i++)
+ WRITE_ONCE(mm_list->nodes[nid].stats[hist][i], 0);
+}
+
+static bool should_skip_mm(struct mm_struct *mm, struct mm_walk_args *args)
+{
+ int type;
+ unsigned long size = 0;
+
+ if (!lru_gen_mm_is_active(mm) && !node_isset(args->node_id, mm->lrugen.nodes))
+ return true;
+
+ if (mm_is_oom_victim(mm))
+ return true;
+
+ for (type = !args->swappiness; type < ANON_AND_FILE; type++) {
+ size += type ? get_mm_counter(mm, MM_FILEPAGES) :
+ get_mm_counter(mm, MM_ANONPAGES) +
+ get_mm_counter(mm, MM_SHMEMPAGES);
+ }
+
+ /* leave the legwork to the rmap if the mappings are too sparse */
+ if (size < max(SWAP_CLUSTER_MAX, mm_pgtables_bytes(mm) / PAGE_SIZE))
+ return true;
+
+ return !mmget_not_zero(mm);
+}
+
+/* To support multiple walkers that concurrently walk an mm_struct list. */
+static bool get_next_mm(struct mm_walk_args *args, struct mm_struct **iter)
+{
+ bool last = true;
+ struct mm_struct *mm = NULL;
+ int nid = args->node_id;
+ struct lru_gen_mm_list *mm_list = get_mm_list(args->memcg);
+
+ if (*iter)
+ mmput_async(*iter);
+ else if (args->max_seq <= READ_ONCE(mm_list->nodes[nid].cur_seq))
+ return false;
+
+ spin_lock(&mm_list->lock);
+
+ VM_BUG_ON(args->max_seq > mm_list->nodes[nid].cur_seq + 1);
+ VM_BUG_ON(*iter && args->max_seq < mm_list->nodes[nid].cur_seq);
+ VM_BUG_ON(*iter && !mm_list->nodes[nid].nr_walkers);
+
+ if (args->max_seq <= mm_list->nodes[nid].cur_seq) {
+ last = *iter;
+ goto done;
+ }
+
+ if (mm_list->nodes[nid].iter == &mm_list->head) {
+ VM_BUG_ON(*iter || mm_list->nodes[nid].nr_walkers);
+ mm_list->nodes[nid].iter = mm_list->nodes[nid].iter->next;
+ }
+
+ while (!mm && mm_list->nodes[nid].iter != &mm_list->head) {
+ mm = list_entry(mm_list->nodes[nid].iter, struct mm_struct, lrugen.list);
+ mm_list->nodes[nid].iter = mm_list->nodes[nid].iter->next;
+ if (should_skip_mm(mm, args))
+ mm = NULL;
+
+ args->mm_stats[mm ? MM_SCHED_ACTIVE : MM_SCHED_INACTIVE]++;
+ }
+
+ if (mm_list->nodes[nid].iter == &mm_list->head)
+ WRITE_ONCE(mm_list->nodes[nid].cur_seq,
+ mm_list->nodes[nid].cur_seq + 1);
+done:
+ if (*iter && !mm)
+ mm_list->nodes[nid].nr_walkers--;
+ if (!*iter && mm)
+ mm_list->nodes[nid].nr_walkers++;
+
+ last = last && !mm_list->nodes[nid].nr_walkers &&
+ mm_list->nodes[nid].iter == &mm_list->head;
+
+ reset_mm_stats(mm_list, last, args);
+
+ spin_unlock(&mm_list->lock);
+
+ *iter = mm;
+ if (mm)
+ node_clear(nid, mm->lrugen.nodes);
+
+ return last;
+}
+
/******************************************************************************
* state change
******************************************************************************/
@@ -3135,6 +3441,13 @@ static int __init init_lru_gen(void)
{
BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
+ BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);
+
+ if (mem_cgroup_disabled()) {
+ global_mm_list = alloc_mm_list();
+ if (!global_mm_list)
+ return -ENOMEM;
+ }
if (hotplug_memory_notifier(mem_notifier, 0))
pr_err("lru_gen: failed to subscribe hotplug notifications\n");