@@ -1332,6 +1332,12 @@ typedef struct pglist_data {
int kswapd_failures; /* Number of 'reclaimed == 0' runs */
+ struct list_head kshrinkd_folios; /* rmap_walk contended folios list*/
+ spinlock_t kf_lock; /* Protect kshrinkd_folios list*/
+
+ struct task_struct *kshrinkd; /* reclaim kshrinkd_folios*/
+ wait_queue_head_t kshrinkd_wait;
+
#ifdef CONFIG_COMPACTION
int kcompactd_max_order;
enum zone_type kcompactd_highest_zoneidx;
@@ -438,6 +438,9 @@ void check_move_unevictable_folios(struct folio_batch *fbatch);
extern void __meminit kswapd_run(int nid);
extern void __meminit kswapd_stop(int nid);
+extern void kshrinkd_run(int nid);
+extern void kshrinkd_stop(int nid);
+
#ifdef CONFIG_SWAP
int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
@@ -38,9 +38,11 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
PGLAZYFREED,
PGREFILL,
PGREUSE,
+ PGSTEAL_KSHRINKD,
PGSTEAL_KSWAPD,
PGSTEAL_DIRECT,
PGSTEAL_KHUGEPAGED,
+ PGSCAN_KSHRINKD,
PGSCAN_KSWAPD,
PGSCAN_DIRECT,
PGSCAN_KHUGEPAGED,
@@ -1218,6 +1218,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
kswapd_run(nid);
kcompactd_run(nid);
+ kshrinkd_run(nid);
writeback_set_ratelimit();
@@ -2098,6 +2099,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
}
if (arg.status_change_nid >= 0) {
+ kshrinkd_stop(node);
kcompactd_stop(node);
kswapd_stop(node);
}
@@ -150,6 +150,9 @@ struct scan_control {
/* if try_lock in rmap_walk */
unsigned int rw_try_lock:1;
+	/* Hand folios to kshrinkd when the rmap_walk trylock is contended */
+ unsigned int need_kshrinkd:1;
+
/* Allocation order */
s8 order;
@@ -201,6 +204,17 @@ struct scan_control {
*/
int vm_swappiness = 60;
+/*
+ * Wake the node's kshrinkd thread so that it reclaims the folios whose
+ * lock was contended in rmap_walk during shrink_folio_list(). Those folios
+ * are handed to kshrinkd instead of being put back at the head of the LRU,
+ * to avoid breaking the LRU ordering rules.
+ */
+static void wakeup_kshrinkd(struct pglist_data *pgdat)
+{
+	/* No kshrinkd thread recorded for this node: nothing to wake. */
+	if (unlikely(!pgdat->kshrinkd))
+		return;
+
+	wake_up_interruptible(&pgdat->kshrinkd_wait);
+}
+
#ifdef CONFIG_MEMCG
/* Returns true for reclaim through cgroup limits or cgroup interfaces. */
@@ -844,6 +858,7 @@ enum folio_references {
FOLIOREF_RECLAIM_CLEAN,
FOLIOREF_KEEP,
FOLIOREF_ACTIVATE,
+ FOLIOREF_LOCK_CONTENDED,
};
static enum folio_references folio_check_references(struct folio *folio,
@@ -864,8 +879,12 @@ static enum folio_references folio_check_references(struct folio *folio,
return FOLIOREF_ACTIVATE;
/* rmap lock contention: rotate */
- if (referenced_ptes == -1)
- return FOLIOREF_KEEP;
+ if (referenced_ptes == -1) {
+ if (sc->need_kshrinkd && folio_pgdat(folio)->kshrinkd)
+ return FOLIOREF_LOCK_CONTENDED;
+ else
+ return FOLIOREF_KEEP;
+ }
if (referenced_ptes) {
/*
@@ -1035,6 +1054,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
struct folio_batch free_folios;
LIST_HEAD(ret_folios);
LIST_HEAD(demote_folios);
+ LIST_HEAD(contended_folios);
unsigned int nr_reclaimed = 0;
unsigned int pgactivate = 0;
bool do_demote_pass;
@@ -1052,6 +1072,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
enum folio_references references = FOLIOREF_RECLAIM;
bool dirty, writeback;
unsigned int nr_pages;
+ bool lock_contended = false;
cond_resched();
@@ -1193,6 +1214,9 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
case FOLIOREF_KEEP:
stat->nr_ref_keep += nr_pages;
goto keep_locked;
+ case FOLIOREF_LOCK_CONTENDED:
+ lock_contended = true;
+ goto keep_locked;
case FOLIOREF_RECLAIM:
case FOLIOREF_RECLAIM_CLEAN:
; /* try to reclaim the folio below */
@@ -1470,7 +1494,10 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
keep_locked:
folio_unlock(folio);
keep:
- list_add(&folio->lru, &ret_folios);
+ if (unlikely(lock_contended))
+ list_add(&folio->lru, &contended_folios);
+ else
+ list_add(&folio->lru, &ret_folios);
VM_BUG_ON_FOLIO(folio_test_lru(folio) ||
folio_test_unevictable(folio), folio);
}
@@ -1512,6 +1539,14 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
free_unref_folios(&free_folios);
list_splice(&ret_folios, folio_list);
+
+ if (!list_empty(&contended_folios)) {
+ spin_lock_irq(&pgdat->kf_lock);
+ list_splice(&contended_folios, &pgdat->kshrinkd_folios);
+ spin_unlock_irq(&pgdat->kf_lock);
+ wakeup_kshrinkd(pgdat);
+ }
+
count_vm_events(PGACTIVATE, pgactivate);
if (plug)
@@ -1526,6 +1561,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
.gfp_mask = GFP_KERNEL,
.may_unmap = 1,
.rw_try_lock = 1,
+ .need_kshrinkd = 0,
};
struct reclaim_stat stat;
unsigned int nr_reclaimed;
@@ -2119,6 +2155,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
.may_swap = 1,
.no_demotion = 1,
.rw_try_lock = 1,
+ .need_kshrinkd = 0,
};
nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, ignore_references);
@@ -5465,6 +5502,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
.reclaim_idx = MAX_NR_ZONES - 1,
.gfp_mask = GFP_KERNEL,
.rw_try_lock = 1,
+ .need_kshrinkd = 0,
};
buf = kvmalloc(len + 1, GFP_KERNEL);
@@ -6443,6 +6481,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
.may_unmap = 1,
.may_swap = 1,
.rw_try_lock = 1,
+ .need_kshrinkd = 1,
};
/*
@@ -6489,6 +6528,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
.reclaim_idx = MAX_NR_ZONES - 1,
.may_swap = !noswap,
.rw_try_lock = 1,
+ .need_kshrinkd = 0,
};
WARN_ON_ONCE(!current->reclaim_state);
@@ -6536,6 +6576,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
.rw_try_lock = 1,
+ .need_kshrinkd = 0,
};
/*
* Traverse the ZONELIST_FALLBACK zonelist of the current node to put
@@ -6798,6 +6839,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
.order = order,
.may_unmap = 1,
.rw_try_lock = 1,
+ .need_kshrinkd = 1,
};
set_task_reclaim_state(current, &sc.reclaim_state);
@@ -7268,6 +7310,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
.may_swap = 1,
.hibernation_mode = 1,
.rw_try_lock = 1,
+ .need_kshrinkd = 0,
};
struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
unsigned long nr_reclaimed;
@@ -7338,6 +7381,145 @@ static int __init kswapd_init(void)
module_init(kswapd_init)
+/*
+ * Report whether the node has folios queued for kshrinkd.
+ *
+ * Returns 1 when pgdat->kshrinkd_folios is non-empty and 0 otherwise; the
+ * list is sampled with kf_lock held.
+ */
+static int kshrinkd_should_run(pg_data_t *pgdat)
+{
+	int has_work;
+
+	spin_lock_irq(&pgdat->kf_lock);
+	has_work = list_empty(&pgdat->kshrinkd_folios) ? 0 : 1;
+	spin_unlock_irq(&pgdat->kf_lock);
+
+	return has_work;
+}
+
+/*
+ * Run shrink_folio_list() over @folio_list on behalf of kshrinkd, passing
+ * @pgdat through unchanged.
+ *
+ * The scan_control used here allows writepage, unmap and swap, disables
+ * demotion, and leaves both rw_try_lock and need_kshrinkd clear.
+ *
+ * Returns what shrink_folio_list() returns, or 0 if @folio_list is empty.
+ */
+static unsigned long kshrinkd_reclaim_folios(struct list_head *folio_list,
+					     struct pglist_data *pgdat)
+{
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.may_writepage = 1,
+		.may_unmap = 1,
+		.may_swap = 1,
+		.no_demotion = 1,
+		.rw_try_lock = 0,
+		.need_kshrinkd = 0,
+	};
+	struct reclaim_stat unused_stat;
+
+	/* Nothing queued: skip the call entirely. */
+	if (list_empty(folio_list))
+		return 0;
+
+	return shrink_folio_list(folio_list, pgdat, &sc, &unused_stat, false);
+}
+
+/*
+ * Put every folio on @folio_list back on the LRU.
+ *
+ * Returns the number of pages that were put back.
+ */
+static unsigned long kshrinkd_putback_folios(struct list_head *folio_list)
+{
+	unsigned long nr_putback = 0;
+
+	while (!list_empty(folio_list)) {
+		struct folio *folio = lru_to_folio(folio_list);
+
+		nr_putback += folio_nr_pages(folio);
+		list_del(&folio->lru);
+		folio_putback_lru(folio);
+	}
+
+	return nr_putback;
+}
+
+/*
+ * The background kshrink daemon, a kernel thread started per node by
+ * kshrinkd_run().
+ *
+ * kshrinkd reclaims the folios whose lock was contended in rmap_walk
+ * during shrink_folio_list(), instead of those folios being put back at
+ * the head of the LRU directly, to avoid breaking the LRU ordering rules.
+ *
+ * @p is the node's pg_data_t. Always returns 0.
+ */
+static int kshrinkd(void *p)
+{
+	pg_data_t *pgdat = p;
+	LIST_HEAD(tmp_contended_folios);
+
+	current->flags |= PF_MEMALLOC | PF_KSWAPD;
+	set_freezable();
+
+	while (!kthread_should_stop()) {
+		unsigned long nr_reclaimed;
+		unsigned long nr_putback;
+
+		/*
+		 * kthread_should_stop() has to be part of the wait condition:
+		 * kthread_stop() wakes this thread, but with an empty list the
+		 * wait would go straight back to sleep and kshrinkd_stop()
+		 * would never return.
+		 */
+		wait_event_freezable(pgdat->kshrinkd_wait,
+				     kshrinkd_should_run(pgdat) ||
+				     kthread_should_stop());
+
+		/* Move the rmap_walk contended folios onto a private list. */
+		spin_lock_irq(&pgdat->kf_lock);
+		list_splice_init(&pgdat->kshrinkd_folios, &tmp_contended_folios);
+		spin_unlock_irq(&pgdat->kf_lock);
+
+		/* Reclaim them; an empty list (stop-only wakeup) reclaims nothing. */
+		nr_reclaimed = kshrinkd_reclaim_folios(&tmp_contended_folios, pgdat);
+
+		/* Put the folios which failed to reclaim back on the LRU. */
+		nr_putback = kshrinkd_putback_folios(&tmp_contended_folios);
+
+		/*
+		 * No lock is held and preemption is enabled here, so use
+		 * count_vm_events() rather than the __count_vm_events() variant.
+		 */
+		count_vm_events(PGSTEAL_KSHRINKD, nr_reclaimed);
+		count_vm_events(PGSCAN_KSHRINKD, nr_reclaimed + nr_putback);
+	}
+
+	/*
+	 * Folios may have been queued while the thread was being stopped.
+	 * They are off the LRU, so return them rather than leaving them
+	 * stranded on a list nobody services any more.
+	 */
+	spin_lock_irq(&pgdat->kf_lock);
+	list_splice_init(&pgdat->kshrinkd_folios, &tmp_contended_folios);
+	spin_unlock_irq(&pgdat->kf_lock);
+	kshrinkd_putback_folios(&tmp_contended_folios);
+
+	current->flags &= ~(PF_MEMALLOC | PF_KSWAPD);
+
+	return 0;
+}
+
+/*
+ * Start the kshrinkd thread for node @nid unless one is already recorded
+ * in the node's pgdat. Called at init time and when memory is onlined on
+ * a node.
+ *
+ * On failure an error is logged (with a one-time warning if the system is
+ * not yet running) and pgdat->kshrinkd is left NULL.
+ */
+void kshrinkd_run(int nid)
+{
+	pg_data_t *pgdat = NODE_DATA(nid);
+	struct task_struct *task;
+
+	if (pgdat->kshrinkd)
+		return;
+
+	/*
+	 * Hold the result in a local until it is known to be a valid task.
+	 * The reclaim path only tests pgdat->kshrinkd for non-NULL, so an
+	 * ERR_PTR() value must never be visible there, not even briefly.
+	 */
+	task = kthread_run(kshrinkd, pgdat, "kshrinkd%d", nid);
+	if (IS_ERR(task)) {
+		/* failure to start kshrinkd */
+		WARN_ON_ONCE(system_state < SYSTEM_RUNNING);
+		pr_err("Failed to start kshrinkd on node %d\n", nid);
+		return;
+	}
+
+	pgdat->kshrinkd = task;
+}
+
+/*
+ * Stop the kshrinkd thread of node @nid, if one is recorded, and clear the
+ * node's kshrinkd pointer afterwards.
+ *
+ * Called by memory hotplug when all memory in a node is offlined. Caller
+ * must be holding mem_hotplug_begin/done().
+ */
+void kshrinkd_stop(int nid)
+{
+	struct task_struct *task = NODE_DATA(nid)->kshrinkd;
+
+	/* No thread was started for this node. */
+	if (!task)
+		return;
+
+	kthread_stop(task);
+	NODE_DATA(nid)->kshrinkd = NULL;
+}
+
+/*
+ * Boot-time setup for kshrinkd.
+ *
+ * The per-node lock, wait queue and folio list are initialised for every
+ * possible node that has a pgdat, not only for nodes that have memory
+ * now: online_pages() calls kshrinkd_run() for a node that gains memory
+ * later, and the thread it starts uses these fields immediately.
+ *
+ * The thread itself is started here only on nodes that currently have
+ * memory.
+ *
+ * NOTE(review): a node whose pgdat does not exist yet at this point is
+ * still not covered. Initialising these fields where the pgdat itself is
+ * set up would be more robust -- confirm against the pgdat init path.
+ */
+static int __init kshrinkd_init(void)
+{
+	int nid;
+
+	for_each_node(nid) {
+		pg_data_t *pgdat = NODE_DATA(nid);
+
+		if (!pgdat)
+			continue;
+
+		spin_lock_init(&pgdat->kf_lock);
+		init_waitqueue_head(&pgdat->kshrinkd_wait);
+		INIT_LIST_HEAD(&pgdat->kshrinkd_folios);
+
+		if (node_state(nid, N_MEMORY))
+			kshrinkd_run(nid);
+	}
+
+	return 0;
+}
+
+module_init(kshrinkd_init)
+
#ifdef CONFIG_NUMA
/*
* Node reclaim mode
@@ -7427,6 +7609,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
.may_swap = 1,
.reclaim_idx = gfp_zone(gfp_mask),
.rw_try_lock = 1,
+ .need_kshrinkd = 1,
};
unsigned long pflags;
@@ -1279,9 +1279,11 @@ const char * const vmstat_text[] = {
"pgrefill",
"pgreuse",
+ "pgsteal_kshrinkd",
"pgsteal_kswapd",
"pgsteal_direct",
"pgsteal_khugepaged",
+ "pgscan_kshrinkd",
"pgscan_kswapd",
"pgscan_direct",
"pgscan_khugepaged",