diff mbox series

[RFC,v4,1/3] mm/memcontrol: implement memcg.interleave_weights

Message ID 20231109002517.106829-2-gregory.price@memverge.com
State New, archived
Headers show
Series memcg weighted interleave mempolicy control | expand

Commit Message

Gregory Price Nov. 9, 2023, 12:25 a.m. UTC
Create an RCU-protected array of unsigned char[MAX_NUMNODES] where
interleave weights can be stored.  These weights are intended to be
used by mempolicy to implement weighted interleave for bandwidth
optimization.

Node weights are assigned via cgroup/memory.interleave_weights.

Example: Set a 3:1 weighting ratio for nodes 0 and 1 respectively.
  echo 0:3 > cgroup/memory.interleave_weights
  echo 1:1 > cgroup/memory.interleave_weights

Example output:
  cat cgroup/memory.interleave_weights
  0:3,1:1

Child cgroups inherit parent interleave weights and may override them.

To revert to inheriting weights from the parent, write "-1:0".

Example:
  echo -1:0 > cgroup/memory.interleave_weights

Signed-off-by: Gregory Price <gregory.price@memverge.com>
---
 include/linux/memcontrol.h |  31 +++++++
 mm/memcontrol.c            | 172 +++++++++++++++++++++++++++++++++++++
 2 files changed, 203 insertions(+)
diff mbox series

Patch

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index e4e24da16d2c..338a9dcda446 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -21,6 +21,8 @@ 
 #include <linux/vmstat.h>
 #include <linux/writeback.h>
 #include <linux/page-flags.h>
+#include <linux/numa.h>
+#include <linux/nodemask.h>
 
 struct mem_cgroup;
 struct obj_cgroup;
@@ -167,6 +169,15 @@  struct mem_cgroup_thresholds {
 	struct mem_cgroup_threshold_ary *spare;
 };
 
+/* Per-memcg mempolicy settings */
+struct mem_cgroup_mempolicy {
+	/*
+	 * Per-node interleave weights (MAX_NUMNODES entries), NULL if unset.
+	 * RCU-protected; updated under mem_cgroup::mempolicy_lock.
+	 */
+	unsigned char __rcu *il_weights;
+};
+
 /*
  * Remember four most recent foreign writebacks with dirty pages in this
  * cgroup.  Inode sharing is expected to be uncommon and, even if we miss
@@ -265,6 +276,12 @@  struct mem_cgroup {
 	/* thresholds for mem+swap usage. RCU-protected */
 	struct mem_cgroup_thresholds memsw_thresholds;
 
+	/* protect the mempolicy settings */
+	struct mutex mempolicy_lock;
+
+	/* mempolicy defaults for tasks */
+	struct mem_cgroup_mempolicy mempolicy;
+
 	/* For oom notifier event fd */
 	struct list_head oom_notify;
 
@@ -1159,6 +1176,12 @@  unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 						gfp_t gfp_mask,
 						unsigned long *total_scanned);
 
+/* Interleave weights in effect for current's memcg (mm/memcontrol.c). */
+unsigned char mem_cgroup_get_il_weight(unsigned int nid);
+
+unsigned int mem_cgroup_get_il_weights(nodemask_t *nodes,
+				       unsigned char *weights);
+
 #else /* CONFIG_MEMCG */
 
 #define MEM_CGROUP_ID_SHIFT	0
@@ -1591,6 +1614,14 @@  unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 {
 	return 0;
 }
+
+/* !CONFIG_MEMCG: no weights exist, so report weight 0 and total 0. */
+static inline unsigned char mem_cgroup_get_il_weight(unsigned int nid) { return 0; }
+static inline unsigned int mem_cgroup_get_il_weights(nodemask_t *nodes,
+						     unsigned char *weights)
+{
+	return 0;
+}
 #endif /* CONFIG_MEMCG */
 
 static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5b009b233ab8..67e8c1767471 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5319,6 +5319,7 @@  static struct mem_cgroup *mem_cgroup_alloc(void)
 	INIT_WORK(&memcg->high_work, high_work_func);
 	INIT_LIST_HEAD(&memcg->oom_notify);
 	mutex_init(&memcg->thresholds_lock);
+	mutex_init(&memcg->mempolicy_lock);
 	spin_lock_init(&memcg->move_lock);
 	vmpressure_init(&memcg->vmpressure);
 	INIT_LIST_HEAD(&memcg->event_list);
@@ -7896,6 +7897,176 @@  static struct cftype zswap_files[] = {
 };
 #endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */
 
+/* Return current's weight for @nid from the nearest memcg with weights, else 0. */
+unsigned char mem_cgroup_get_il_weight(unsigned int nid)
+{
+	struct mem_cgroup *memcg;
+	unsigned char *weights, weight = 0;
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_task(current);
+	/* Stop at NULL or root, like mem_cgroup_get_il_weights() does. */
+	while (memcg && !mem_cgroup_is_root(memcg)) {
+		weights = rcu_dereference(memcg->mempolicy.il_weights);
+		if (weights) {
+			weight = weights[nid];
+			break;
+		}
+		memcg = parent_mem_cgroup(memcg);
+	}
+	rcu_read_unlock();
+	return weight;
+}
+
+/*
+ * Fill @weights for each node in @nodes from the nearest non-root memcg of
+ * current that has weights (0 entries become 1).  Returns the sum of the
+ * filled weights, or 0 with @weights untouched if no memcg has weights.
+ */
+unsigned int mem_cgroup_get_il_weights(nodemask_t *nodes,
+				       unsigned char *weights)
+{
+	struct mem_cgroup *cg;
+	unsigned char *table = NULL;
+	unsigned int node, sum = 0;
+
+	rcu_read_lock();
+	for (cg = mem_cgroup_from_task(current);
+	     cg && !mem_cgroup_is_root(cg); cg = parent_mem_cgroup(cg)) {
+		table = rcu_dereference(cg->mempolicy.il_weights);
+		if (table)
+			break;
+	}
+	if (table) {
+		for_each_node_mask(node, *nodes) {
+			weights[node] = table[node] ?: 1;
+			sum += weights[node];
+		}
+	}
+	rcu_read_unlock();
+
+	return sum;
+}
+
+/* Print "nid:weight" for every node; weights default to 1 when none are set. */
+static int mpol_ilw_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg;
+	unsigned char *weights = NULL;	/* stays NULL if no level has weights */
+	unsigned int nid, count = 0;
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_seq(m);
+	/* Use this cgroup's weights, else the nearest non-root ancestor's. */
+	while (memcg && !mem_cgroup_is_root(memcg)) {
+		weights = rcu_dereference(memcg->mempolicy.il_weights);
+		if (weights)
+			break;
+		memcg = parent_mem_cgroup(memcg);
+	}
+	for_each_node(nid) {
+		seq_printf(m, "%s%u:%d", (count++ ? "," : ""), nid,
+			   weights ? weights[nid] : 1);
+	}
+	seq_putc(m, '\n');
+	rcu_read_unlock();
+
+	return 0;
+}
+
+/* Write handler: "<node>:<weight>" sets one node's weight, "-1:0" drops them. */
+static ssize_t mpol_ilw_write(struct kernfs_open_file *of, char *buf,
+			      size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	struct mem_cgroup *pmcg;
+	unsigned char *new_weights = NULL, *old_weights = NULL;
+	int node;
+	unsigned char weight;
+	ssize_t ret;
+	char *sep = memchr(buf, ':', nbytes);
+	bool parent_weights = false;
+
+	if (!sep || sep == buf || sep == (buf + nbytes - 1))
+		return -EINVAL;
+	*sep = '\0';
+
+	ret = kstrtoint(buf, 10, &node);
+	if (ret)
+		return ret;
+
+	ret = kstrtou8(sep + 1, 10, &weight);
+	if (ret)
+		return ret;
+
+	/*
+	 * "-1:0" drops this cgroup's weights: new_weights stays NULL and is
+	 * published below, so lookups fall through to the parent cgroup.
+	 */
+	if (node == -1 && weight == 0)
+		goto set_weights;
+	else if (node < 0)
+		return -EINVAL;
+	else if (node >= MAX_NUMNODES || weight == 0)
+		return -EINVAL;
+
+	new_weights = kzalloc(sizeof(unsigned char)*MAX_NUMNODES, GFP_KERNEL);
+	if (!new_weights)
+		return -ENOMEM;
+set_weights:
+	/* mutex serializes writers to this memcg; RCU covers ancestor reads */
+	mutex_lock(&memcg->mempolicy_lock);
+	rcu_read_lock();
+	old_weights = rcu_dereference(memcg->mempolicy.il_weights);
+
+	/* Clearing: nothing to copy, just publish NULL and free the old array */
+	if (!new_weights)
+		goto swap_weights;
+
+	/* No weights of our own: look for the nearest ancestor's to copy */
+	pmcg = memcg;
+	while (!old_weights) {
+		pmcg = parent_mem_cgroup(pmcg);
+		if (!pmcg || mem_cgroup_is_root(pmcg))
+			break;
+		old_weights = rcu_dereference(pmcg->mempolicy.il_weights);
+		parent_weights = true;
+	}
+
+	/* Start from a copy of the old/inherited weights, else all ones */
+	if (old_weights)
+		memcpy(new_weights, old_weights,
+		       sizeof(unsigned char)*MAX_NUMNODES);
+	else
+		memset(new_weights, 1,
+		       sizeof(unsigned char)*MAX_NUMNODES);
+	new_weights[node] = weight;
+
+swap_weights:
+	rcu_assign_pointer(memcg->mempolicy.il_weights, new_weights);
+
+	rcu_read_unlock();
+	synchronize_rcu();
+
+	/* Free the replaced array, unless it was borrowed from an ancestor */
+	if (old_weights && !parent_weights)
+		kfree(old_weights);
+
+	mutex_unlock(&memcg->mempolicy_lock);
+
+	return nbytes;
+}
+
+static struct cftype mempolicy_files[] = {
+	{
+		.name = "interleave_weights",
+		.flags = CFTYPE_NOT_ON_ROOT,	/* no file in the root cgroup */
+		.seq_show = mpol_ilw_show,	/* read: "nid:weight,..." */
+		.write = mpol_ilw_write,	/* write: "<node>:<weight>" */
+	},
+	{ }	/* terminate */
+};
+
 static int __init mem_cgroup_swap_init(void)
 {
 	if (mem_cgroup_disabled())
@@ -7906,6 +8077,7 @@  static int __init mem_cgroup_swap_init(void)
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
 	WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, zswap_files));
 #endif
+	WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, mempolicy_files));
 	return 0;
 }
 subsys_initcall(mem_cgroup_swap_init);