[RFC,2/4] calculate memory.low for the local node and track its usage

Message ID 20240920221202.1734227-3-kaiyang2@cs.cmu.edu
State New
Series memory tiering fairness by per-cgroup control of promotion and demotion

Commit Message

Kaiyang Zhao Sept. 20, 2024, 10:11 p.m. UTC
From: Kaiyang Zhao <kaiyang2@cs.cmu.edu>

Add a memory.low counterpart for the top-tier node (locallow) and track
its usage. locallow is derived by scaling low by the ratio of node 0
capacity to the combined capacity of nodes 0 and 1.
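
As a minimal illustration of that scaling (not part of the patch), using the
same hardcoded node IDs as below (0 = local top tier, 1 = remote) and integer
arithmetic in pages; the helper name is made up for the example:

	/* locallow = low * node0_capacity / (node0_capacity + node1_capacity) */
	static unsigned long scale_low_to_locallow(unsigned long low,
						   unsigned long node0_pages,
						   unsigned long node1_pages)
	{
		unsigned long total = node0_pages + node1_pages;

		if (!total)
			return 0;
		/* e.g. low = 16G with a 1:3 local:remote split gives locallow = 4G */
		return low * node0_pages / total;
	}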

Signed-off-by: Kaiyang Zhao <kaiyang2@cs.cmu.edu>
---
 include/linux/page_counter.h | 16 ++++++++---
 mm/hugetlb_cgroup.c          |  4 +--
 mm/memcontrol.c              | 42 ++++++++++++++++++++++-------
 mm/page_counter.c            | 52 ++++++++++++++++++++++++++++--------
 4 files changed, 88 insertions(+), 26 deletions(-)

Patch

diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index 79dbd8bc35a7..aa56c93415ef 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -13,6 +13,7 @@  struct page_counter {
 	 * memcg->memory.usage is a hot member of struct mem_cgroup.
 	 */
 	atomic_long_t usage;
+	struct mem_cgroup *memcg; /* memcg that owns this counter */
 	CACHELINE_PADDING(_pad1_);
 
 	/* effective memory.min and memory.min usage tracking */
@@ -25,6 +26,10 @@  struct page_counter {
 	atomic_long_t low_usage;
 	atomic_long_t children_low_usage;
 
+	unsigned long elocallow;
+	atomic_long_t locallow_usage;
+	atomic_long_t children_locallow_usage;
+
 	unsigned long watermark;
 	/* Latest cg2 reset watermark */
 	unsigned long local_watermark;
@@ -36,6 +41,7 @@  struct page_counter {
 	bool protection_support;
 	unsigned long min;
 	unsigned long low;
+	unsigned long locallow;
 	unsigned long high;
 	unsigned long max;
 	struct page_counter *parent;
@@ -52,12 +58,13 @@  struct page_counter {
  */
 static inline void page_counter_init(struct page_counter *counter,
 				     struct page_counter *parent,
-				     bool protection_support)
+				     bool protection_support, struct mem_cgroup *memcg)
 {
 	counter->usage = (atomic_long_t)ATOMIC_LONG_INIT(0);
 	counter->max = PAGE_COUNTER_MAX;
 	counter->parent = parent;
 	counter->protection_support = protection_support;
+	counter->memcg = memcg;
 }
 
 static inline unsigned long page_counter_read(struct page_counter *counter)
@@ -72,7 +79,8 @@  bool page_counter_try_charge(struct page_counter *counter,
 			     struct page_counter **fail);
 void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages);
 void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages);
-void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages);
+void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages,
+					unsigned long nr_pages_local);
 
 static inline void page_counter_set_high(struct page_counter *counter,
 					 unsigned long nr_pages)
@@ -99,11 +107,11 @@  static inline void page_counter_reset_watermark(struct page_counter *counter)
 #ifdef CONFIG_MEMCG
 void page_counter_calculate_protection(struct page_counter *root,
 				       struct page_counter *counter,
-				       bool recursive_protection);
+				       bool recursive_protection, int is_local);
 #else
 static inline void page_counter_calculate_protection(struct page_counter *root,
 						     struct page_counter *counter,
-						     bool recursive_protection) {}
+						     bool recursive_protection, int is_local) {}
 #endif
 
 #endif /* _LINUX_PAGE_COUNTER_H */
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index d8d0e665caed..0e07a7a1d5b8 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -114,10 +114,10 @@  static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
 		}
 		page_counter_init(hugetlb_cgroup_counter_from_cgroup(h_cgroup,
 								     idx),
-				  fault_parent, false);
+				  fault_parent, false, NULL);
 		page_counter_init(
 			hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx),
-			rsvd_parent, false);
+			rsvd_parent, false, NULL);
 
 		limit = round_down(PAGE_COUNTER_MAX,
 				   pages_per_huge_page(&hstates[idx]));
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 20b715441332..d7c5fff12105 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1497,6 +1497,9 @@  static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
 			       vm_event_name(memcg_vm_event_stat[i]),
 			       memcg_events(memcg, memcg_vm_event_stat[i]));
 	}
+
+	seq_buf_printf(s, "local_usage %lu\n",
+		       get_cgroup_local_usage(memcg, true));
 }
 
 static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
@@ -3597,8 +3600,8 @@  mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	if (parent) {
 		WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
 
-		page_counter_init(&memcg->memory, &parent->memory, true);
-		page_counter_init(&memcg->swap, &parent->swap, false);
+		page_counter_init(&memcg->memory, &parent->memory, true, memcg);
+		page_counter_init(&memcg->swap, &parent->swap, false, NULL);
 #ifdef CONFIG_MEMCG_V1
 		WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable));
 		page_counter_init(&memcg->kmem, &parent->kmem, false);
@@ -3607,8 +3610,8 @@  mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	} else {
 		init_memcg_stats();
 		init_memcg_events();
-		page_counter_init(&memcg->memory, NULL, true);
-		page_counter_init(&memcg->swap, NULL, false);
+		page_counter_init(&memcg->memory, NULL, true, memcg);
+		page_counter_init(&memcg->swap, NULL, false, NULL);
 #ifdef CONFIG_MEMCG_V1
 		page_counter_init(&memcg->kmem, NULL, false);
 		page_counter_init(&memcg->tcpmem, NULL, false);
@@ -3677,7 +3680,7 @@  static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	memcg1_css_offline(memcg);
 
 	page_counter_set_min(&memcg->memory, 0);
-	page_counter_set_low(&memcg->memory, 0);
+	page_counter_set_low(&memcg->memory, 0, 0);
 
 	zswap_memcg_offline_cleanup(memcg);
 
@@ -3748,7 +3751,7 @@  static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
 	page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
 #endif
 	page_counter_set_min(&memcg->memory, 0);
-	page_counter_set_low(&memcg->memory, 0);
+	page_counter_set_low(&memcg->memory, 0, 0);
 	page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
 	memcg1_soft_limit_reset(memcg);
 	page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
@@ -4051,6 +4054,12 @@  static ssize_t memory_min_write(struct kernfs_open_file *of,
 	return nbytes;
 }
 
+static int memory_locallow_show(struct seq_file *m, void *v)
+{
+	return seq_puts_memcg_tunable(m,
+		READ_ONCE(mem_cgroup_from_seq(m)->memory.locallow));
+}
+
 static int memory_low_show(struct seq_file *m, void *v)
 {
 	return seq_puts_memcg_tunable(m,
@@ -4061,7 +4070,8 @@  static ssize_t memory_low_write(struct kernfs_open_file *of,
 				char *buf, size_t nbytes, loff_t off)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
-	unsigned long low;
+	struct sysinfo si;
+	unsigned long low, locallow, local_capacity, total_capacity;
 	int err;
 
 	buf = strstrip(buf);
@@ -4069,7 +4079,15 @@  static ssize_t memory_low_write(struct kernfs_open_file *of,
 	if (err)
 		return err;
 
-	page_counter_set_low(&memcg->memory, low);
+	/* Hardcoded 0 for local node and 1 for remote. */
+	si_meminfo_node(&si, 0);
+	local_capacity = si.totalram; /* In pages. */
+	total_capacity = local_capacity;
+	si_meminfo_node(&si, 1);
+	total_capacity += si.totalram;
+	locallow = low * local_capacity / total_capacity;
+
+	page_counter_set_low(&memcg->memory, low, locallow);
 
 	return nbytes;
 }
@@ -4394,6 +4412,11 @@  static struct cftype memory_files[] = {
 		.seq_show = memory_low_show,
 		.write = memory_low_write,
 	},
+	{
+		.name = "locallow",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = memory_locallow_show,
+	},
 	{
 		.name = "high",
 		.flags = CFTYPE_NOT_ON_ROOT,
@@ -4483,7 +4506,8 @@  void mem_cgroup_calculate_protection(struct mem_cgroup *root,
 	if (!root)
 		root = root_mem_cgroup;
 
-	page_counter_calculate_protection(&root->memory, &memcg->memory, recursive_protection);
+	page_counter_calculate_protection(&root->memory, &memcg->memory,
+					recursive_protection, false);
 }
 
 static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
diff --git a/mm/page_counter.c b/mm/page_counter.c
index b249d15af9dd..97205aafab46 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -18,8 +18,10 @@  static bool track_protection(struct page_counter *c)
 	return c->protection_support;
 }
 
+extern unsigned long get_cgroup_local_usage(struct mem_cgroup *memcg, bool flush);
+
 static void propagate_protected_usage(struct page_counter *c,
-				      unsigned long usage)
+				      unsigned long usage, unsigned long local_usage)
 {
 	unsigned long protected, old_protected;
 	long delta;
@@ -44,6 +46,15 @@  static void propagate_protected_usage(struct page_counter *c,
 		if (delta)
 			atomic_long_add(delta, &c->parent->children_low_usage);
 	}
+
+	protected = min(local_usage, READ_ONCE(c->locallow));
+	old_protected = atomic_long_read(&c->locallow_usage);
+	if (protected != old_protected) {
+		old_protected = atomic_long_xchg(&c->locallow_usage, protected);
+		delta = protected - old_protected;
+		if (delta)
+			atomic_long_add(delta, &c->parent->children_locallow_usage);
+	}
 }
 
 /**
@@ -63,7 +74,8 @@  void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
 		atomic_long_set(&counter->usage, new);
 	}
 	if (track_protection(counter))
-		propagate_protected_usage(counter, new);
+		propagate_protected_usage(counter, new,
+				get_cgroup_local_usage(counter->memcg, false));
 }
 
 /**
@@ -83,7 +95,8 @@  void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
 
 		new = atomic_long_add_return(nr_pages, &c->usage);
 		if (protection)
-			propagate_protected_usage(c, new);
+			propagate_protected_usage(c, new,
+					get_cgroup_local_usage(counter->memcg, false));
 		/*
 		 * This is indeed racy, but we can live with some
 		 * inaccuracy in the watermark.
@@ -151,7 +164,8 @@  bool page_counter_try_charge(struct page_counter *counter,
 			goto failed;
 		}
 		if (protection)
-			propagate_protected_usage(c, new);
+			propagate_protected_usage(c, new,
+					get_cgroup_local_usage(counter->memcg, false));
 
 		/* see comment on page_counter_charge */
 		if (new > READ_ONCE(c->local_watermark)) {
@@ -238,7 +252,8 @@  void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
 	WRITE_ONCE(counter->min, nr_pages);
 
 	for (c = counter; c; c = c->parent)
-		propagate_protected_usage(c, atomic_long_read(&c->usage));
+		propagate_protected_usage(c, atomic_long_read(&c->usage),
+				get_cgroup_local_usage(counter->memcg, false));
 }
 
 /**
@@ -248,14 +263,17 @@  void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
  *
  * The caller must serialize invocations on the same counter.
  */
-void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
+void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages,
+				unsigned long nr_pages_local)
 {
 	struct page_counter *c;
 
 	WRITE_ONCE(counter->low, nr_pages);
+	WRITE_ONCE(counter->locallow, nr_pages_local);
 
 	for (c = counter; c; c = c->parent)
-		propagate_protected_usage(c, atomic_long_read(&c->usage));
+		propagate_protected_usage(c, atomic_long_read(&c->usage),
+				get_cgroup_local_usage(counter->memcg, false));
 }
 
 /**
@@ -421,9 +439,9 @@  static unsigned long effective_protection(unsigned long usage,
  */
 void page_counter_calculate_protection(struct page_counter *root,
 				       struct page_counter *counter,
-				       bool recursive_protection)
+				       bool recursive_protection, int is_local)
 {
-	unsigned long usage, parent_usage;
+	unsigned long usage, parent_usage, local_usage, parent_local_usage;
 	struct page_counter *parent = counter->parent;
 
 	/*
@@ -437,16 +455,19 @@  void page_counter_calculate_protection(struct page_counter *root,
 		return;
 
 	usage = page_counter_read(counter);
-	if (!usage)
+	local_usage = get_cgroup_local_usage(counter->memcg, true);
+	if (!usage || !local_usage)
 		return;
 
 	if (parent == root) {
 		counter->emin = READ_ONCE(counter->min);
 		counter->elow = READ_ONCE(counter->low);
+		counter->elocallow = READ_ONCE(counter->locallow);
 		return;
 	}
 
 	parent_usage = page_counter_read(parent);
+	parent_local_usage = get_cgroup_local_usage(parent->memcg, true);
 
 	WRITE_ONCE(counter->emin, effective_protection(usage, parent_usage,
 			READ_ONCE(counter->min),
@@ -454,7 +475,16 @@  void page_counter_calculate_protection(struct page_counter *root,
 			atomic_long_read(&parent->children_min_usage),
 			recursive_protection));
 
-	WRITE_ONCE(counter->elow, effective_protection(usage, parent_usage,
+	if (is_local)
+		WRITE_ONCE(counter->elocallow,
+			effective_protection(local_usage, parent_local_usage,
+			READ_ONCE(counter->locallow),
+			READ_ONCE(parent->elocallow),
+			atomic_long_read(&parent->children_locallow_usage),
+			recursive_protection));
+	else
+		WRITE_ONCE(counter->elow,
+			effective_protection(usage, parent_usage,
 			READ_ONCE(counter->low),
 			READ_ONCE(parent->elow),
 			atomic_long_read(&parent->children_low_usage),